From 5b85cbbf8e8a37a22b0a86f1bcb785d94157625b Mon Sep 17 00:00:00 2001 From: Aleksei Smirnov Date: Tue, 1 Aug 2023 16:19:20 +0300 Subject: [PATCH 1/2] Use CultureInfo for parsing values in csv file --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 28 +++++++++----- src/Microsoft.Data.Analysis/DataFrame.cs | 16 ++++++-- .../DataFrame.IOTests.cs | 37 +++++++++++++++++++ 3 files changed, 67 insertions(+), 14 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index 44448da008..6da72d2a1c 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -97,18 +97,19 @@ private static Type MaxKind(Type a, Type b) /// number of rows used to guess types /// add one column with the row index /// The character encoding. Defaults to UTF8 if not specified + /// culture info for formatting values /// DataFrame public static DataFrame LoadCsv(string filename, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, int numRows = -1, int guessRows = 10, - bool addIndexColumn = false, Encoding encoding = null) + bool addIndexColumn = false, Encoding encoding = null, CultureInfo cultureInfo = null) { using (Stream fileStream = new FileStream(filename, FileMode.Open)) { return LoadCsv(fileStream, separator: separator, header: header, columnNames: columnNames, dataTypes: dataTypes, numberOfRowsToRead: numRows, - guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding); + guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding, cultureInfo); } } @@ -349,9 +350,14 @@ private static DataFrameColumn CreateColumn(Type kind, string[] columnNames, int private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringReader wrappedReader, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, - long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false - ) + long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false, + CultureInfo cultureInfo = null) { + if (cultureInfo == null) + { + cultureInfo = CultureInfo.CurrentCulture; + } + if (dataTypes == null && guessRows <= 0) { throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes))); @@ -432,7 +438,7 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe } else { - ret.Append(fields, inPlace: true); + ret.Append(fields, inPlace: true, cultureInfo: cultureInfo); } ++rowline; } @@ -489,7 +495,6 @@ public TextReader GetTextReader() } } - } /// @@ -503,14 +508,16 @@ public TextReader GetTextReader() /// number of rows to read not including the header(if present) /// number of rows used to guess types /// add one column with the row index + /// culture info for formatting values /// public static DataFrame LoadCsvFromString(string csvString, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, - long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false) + long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false, + CultureInfo cultureInfo = null) { WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvString); - return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn); + return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, cultureInfo); } /// @@ -525,12 +532,13 @@ public static DataFrame LoadCsvFromString(string csvString, /// number of rows used to guess types /// add one column with the row index /// The character encoding. Defaults to UTF8 if not specified + /// culture info for formatting values /// public static DataFrame LoadCsv(Stream csvStream, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false, - Encoding encoding = null) + Encoding encoding = null, CultureInfo cultureInfo = null) { if (!csvStream.CanSeek) { @@ -543,7 +551,7 @@ public static DataFrame LoadCsv(Stream csvStream, } WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvStream, encoding ?? Encoding.UTF8); - return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn); + return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, cultureInfo); } /// diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index 78d453d44a..ad02200e6e 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.Diagnostics; +using System.Globalization; using System.Linq; using System.Text; @@ -484,12 +485,13 @@ private void ResizeByOneAndAppend(DataFrameColumn column, object value) /// If a in is null, a null value is appended to each column /// The rows to be appended to this DataFrame /// If set, appends in place. Otherwise, a new DataFrame is returned with the appended - public DataFrame Append(IEnumerable rows, bool inPlace = false) + /// culture info for formatting values + public DataFrame Append(IEnumerable rows, bool inPlace = false, CultureInfo cultureInfo = null) { DataFrame ret = inPlace ? this : Clone(); foreach (DataFrameRow row in rows) { - ret.Append(row, inPlace: true); + ret.Append(row, inPlace: true, cultureInfo: cultureInfo); } return ret; } @@ -501,8 +503,14 @@ public DataFrame Append(IEnumerable rows, bool inPlace = false) /// If is null, a null value is appended to each column /// /// If set, appends a in place. Otherwise, a new DataFrame is returned with an appended - public DataFrame Append(IEnumerable row = null, bool inPlace = false) + /// culture info for formatting values + public DataFrame Append(IEnumerable row = null, bool inPlace = false, CultureInfo cultureInfo = null) { + if (cultureInfo == null) + { + cultureInfo = CultureInfo.CurrentCulture; + } + DataFrame ret = inPlace ? this : Clone(); IEnumerator columnEnumerator = ret.Columns.GetEnumerator(); bool columnMoveNext = columnEnumerator.MoveNext(); @@ -530,7 +538,7 @@ public DataFrame Append(IEnumerable row = null, bool inPlace = false) } if (value != null) { - value = Convert.ChangeType(value, column.DataType); + value = Convert.ChangeType(value, column.DataType, cultureInfo); if (value is null) { diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index 05565673b0..bb970bb5a5 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -14,6 +14,7 @@ using System.Data.SQLite.EF6; using Xunit; using Microsoft.ML.TestFramework.Attributes; +using System.Threading; namespace Microsoft.Data.Analysis.Tests { @@ -154,6 +155,42 @@ void ReducedRowsTest(DataFrame reducedRows) ReducedRowsTest(csvDf); } + [Fact] + public void TestReadCsvWithHeaderCultureInfoAndSeparator() + { + string data = @$"vendor_id;rate_code;passenger_count;trip_time_in_secs;trip_distance;payment_type;fare_amount +CMT;1;1;1271;3,8;CRD;17,5 +CMT;1;1;474;1,5;CRD;8 +CMT;1;1;637;1,4;CRD;8,5 +CMT;1;1;181;0,6;CSH;4,5"; + + void RegularTest(DataFrame df) + { + Assert.Equal(4, df.Rows.Count); + Assert.Equal(7, df.Columns.Count); + + Assert.Equal(3.8f, (float)df["trip_distance"][0]); + Assert.Equal(17.5f, (float)df["fare_amount"][0]); + + Assert.Equal(1.5f, (float)df["trip_distance"][1]); + Assert.Equal(8f, (float)df["fare_amount"][1]); + + Assert.Equal(1.4f, (float)df["trip_distance"][2]); + Assert.Equal(8.5f, (float)df["fare_amount"][2]); + + VerifyColumnTypes(df); + } + + // de-DE has ',' as decimal separator + var cultureInfo = new CultureInfo("de-DE"); + DataFrame df = DataFrame.LoadCsv(GetStream(data), separator: ';', cultureInfo: cultureInfo); + + RegularTest(df); + + DataFrame csvDf = DataFrame.LoadCsvFromString(data, separator: ';', cultureInfo: cultureInfo); + RegularTest(csvDf); + } + [Fact] public void TestReadCsvSplitAcrossMultipleLines() { From a666419ccb508892a00f02381dff7a1ff9beb2d2 Mon Sep 17 00:00:00 2001 From: Aleksei Smirnov Date: Thu, 31 Aug 2023 09:56:48 +0300 Subject: [PATCH 2/2] Fix merge issues --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index a478bc5855..a2a55612ae 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -98,19 +98,21 @@ private static Type MaxKind(Type a, Type b) /// number of rows used to guess types /// add one column with the row index /// The character encoding. Defaults to UTF8 if not specified + /// If set to true, columns with repeated names are auto-renamed. /// culture info for formatting values /// DataFrame public static DataFrame LoadCsv(string filename, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, int numRows = -1, int guessRows = 10, - bool addIndexColumn = false, Encoding encoding = null, CultureInfo cultureInfo = null) + bool addIndexColumn = false, Encoding encoding = null, + bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null) { using (Stream fileStream = new FileStream(filename, FileMode.Open)) { return LoadCsv(fileStream, separator: separator, header: header, columnNames: columnNames, dataTypes: dataTypes, numberOfRowsToRead: numRows, - guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding, cultureInfo); + guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding, renameDuplicatedColumns: renameDuplicatedColumns, cultureInfo: cultureInfo); } } @@ -528,16 +530,18 @@ public TextReader GetTextReader() /// number of rows to read not including the header(if present) /// number of rows used to guess types /// add one column with the row index + /// If set to true, columns with repeated names are auto-renamed. /// culture info for formatting values /// public static DataFrame LoadCsvFromString(string csvString, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false, + bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null) { WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvString); - return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, cultureInfo); + return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns, cultureInfo); } ///