diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index d7f53d3004..a2a55612ae 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -98,18 +98,21 @@ private static Type MaxKind(Type a, Type b) /// number of rows used to guess types /// add one column with the row index /// The character encoding. Defaults to UTF8 if not specified + /// If set to true, columns with repeated names are auto-renamed. + /// culture info used to parse values; defaults to CultureInfo.CurrentCulture when null /// DataFrame public static DataFrame LoadCsv(string filename, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, int numRows = -1, int guessRows = 10, - bool addIndexColumn = false, Encoding encoding = null) + bool addIndexColumn = false, Encoding encoding = null, + bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null) { using (Stream fileStream = new FileStream(filename, FileMode.Open)) { return LoadCsv(fileStream, separator: separator, header: header, columnNames: columnNames, dataTypes: dataTypes, numberOfRowsToRead: numRows, - guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding); + guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding, renameDuplicatedColumns: renameDuplicatedColumns, cultureInfo: cultureInfo); } } @@ -351,8 +354,14 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false, - bool renameDuplicatedColumns = false) + bool renameDuplicatedColumns = false, + CultureInfo cultureInfo = null) { + if (cultureInfo == null) + { + cultureInfo = CultureInfo.CurrentCulture; + } + if (dataTypes == null && guessRows <= 0) { throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, 
nameof(guessRows), nameof(dataTypes))); @@ -452,7 +461,7 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe } else { - ret.Append(fields, inPlace: true); + ret.Append(fields, inPlace: true, cultureInfo: cultureInfo); } ++rowline; } @@ -508,7 +517,6 @@ public TextReader GetTextReader() } } - } /// @@ -522,14 +530,18 @@ public TextReader GetTextReader() /// number of rows to read not including the header(if present) /// number of rows used to guess types /// add one column with the row index + /// If set to true, columns with repeated names are auto-renamed. + /// culture info used to parse values; defaults to CultureInfo.CurrentCulture when null /// public static DataFrame LoadCsvFromString(string csvString, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, - long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false) + long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false, + bool renameDuplicatedColumns = false, + CultureInfo cultureInfo = null) { WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvString); - return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn); + return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns, cultureInfo); } /// @@ -545,12 +557,13 @@ public static DataFrame LoadCsvFromString(string csvString, /// add one column with the row index /// The character encoding. Defaults to UTF8 if not specified /// If set to true, columns with repeated names are auto-renamed. 
+ /// culture info used to parse values; defaults to CultureInfo.CurrentCulture when null /// public static DataFrame LoadCsv(Stream csvStream, char separator = ',', bool header = true, string[] columnNames = null, Type[] dataTypes = null, long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false, - Encoding encoding = null, bool renameDuplicatedColumns = false) + Encoding encoding = null, bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null) { if (!csvStream.CanSeek) { @@ -563,7 +576,7 @@ public static DataFrame LoadCsv(Stream csvStream, } WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvStream, encoding ?? Encoding.UTF8); - return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns); + return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns, cultureInfo); } /// diff --git a/src/Microsoft.Data.Analysis/DataFrame.cs b/src/Microsoft.Data.Analysis/DataFrame.cs index 78d453d44a..ad02200e6e 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.Diagnostics; +using System.Globalization; using System.Linq; using System.Text; @@ -484,12 +485,13 @@ private void ResizeByOneAndAppend(DataFrameColumn column, object value) /// If a in is null, a null value is appended to each column /// The rows to be appended to this DataFrame /// If set, appends in place. Otherwise, a new DataFrame is returned with the appended - public DataFrame Append(IEnumerable rows, bool inPlace = false) + /// culture info used to convert values to each column's data type; defaults to CultureInfo.CurrentCulture when null + public DataFrame Append(IEnumerable rows, bool inPlace = false, CultureInfo cultureInfo = null) { DataFrame ret = inPlace ? 
this : Clone(); foreach (DataFrameRow row in rows) { - ret.Append(row, inPlace: true); + ret.Append(row, inPlace: true, cultureInfo: cultureInfo); } return ret; } @@ -501,8 +503,14 @@ public DataFrame Append(IEnumerable rows, bool inPlace = false) /// If is null, a null value is appended to each column /// /// If set, appends a in place. Otherwise, a new DataFrame is returned with an appended - public DataFrame Append(IEnumerable row = null, bool inPlace = false) + /// culture info used to convert values to each column's data type; defaults to CultureInfo.CurrentCulture when null + public DataFrame Append(IEnumerable row = null, bool inPlace = false, CultureInfo cultureInfo = null) { + if (cultureInfo == null) + { + cultureInfo = CultureInfo.CurrentCulture; + } + DataFrame ret = inPlace ? this : Clone(); IEnumerator columnEnumerator = ret.Columns.GetEnumerator(); bool columnMoveNext = columnEnumerator.MoveNext(); @@ -530,7 +538,7 @@ public DataFrame Append(IEnumerable row = null, bool inPlace = false) } if (value != null) { - value = Convert.ChangeType(value, column.DataType); + value = Convert.ChangeType(value, column.DataType, cultureInfo); if (value is null) { diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index 2a369c5bc7..8fb4c89fdf 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -14,6 +14,7 @@ using System.Data.SQLite.EF6; using Xunit; using Microsoft.ML.TestFramework.Attributes; +using System.Threading; namespace Microsoft.Data.Analysis.Tests { @@ -154,6 +155,42 @@ void ReducedRowsTest(DataFrame reducedRows) ReducedRowsTest(csvDf); } + [Fact] + public void TestReadCsvWithHeaderCultureInfoAndSeparator() + { + string data = @$"vendor_id;rate_code;passenger_count;trip_time_in_secs;trip_distance;payment_type;fare_amount +CMT;1;1;1271;3,8;CRD;17,5 +CMT;1;1;474;1,5;CRD;8 +CMT;1;1;637;1,4;CRD;8,5 +CMT;1;1;181;0,6;CSH;4,5"; + + void RegularTest(DataFrame df) + { + Assert.Equal(4, 
df.Rows.Count); + Assert.Equal(7, df.Columns.Count); + + Assert.Equal(3.8f, (float)df["trip_distance"][0]); + Assert.Equal(17.5f, (float)df["fare_amount"][0]); + + Assert.Equal(1.5f, (float)df["trip_distance"][1]); + Assert.Equal(8f, (float)df["fare_amount"][1]); + + Assert.Equal(1.4f, (float)df["trip_distance"][2]); + Assert.Equal(8.5f, (float)df["fare_amount"][2]); + + VerifyColumnTypes(df); + } + + // de-DE has ',' as decimal separator + var cultureInfo = new CultureInfo("de-DE"); + DataFrame df = DataFrame.LoadCsv(GetStream(data), separator: ';', cultureInfo: cultureInfo); + + RegularTest(df); + + DataFrame csvDf = DataFrame.LoadCsvFromString(data, separator: ';', cultureInfo: cultureInfo); + RegularTest(csvDf); + } + [Fact] public void TestReadCsvWithHeaderAndDuplicatedColumns_WithoutRenaming() {