Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 22 additions & 9 deletions src/Microsoft.Data.Analysis/DataFrame.IO.cs
Original file line number Diff line number Diff line change
Expand Up @@ -98,18 +98,21 @@ private static Type MaxKind(Type a, Type b)
/// <param name="guessRows">number of rows used to guess types</param>
/// <param name="addIndexColumn">add one column with the row index</param>
/// <param name="encoding">The character encoding. Defaults to UTF8 if not specified</param>
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
/// <param name="cultureInfo">The culture used to parse the values read from the CSV. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>
/// <returns>DataFrame</returns>
public static DataFrame LoadCsv(string filename,
    char separator = ',', bool header = true,
    string[] columnNames = null, Type[] dataTypes = null,
    int numRows = -1, int guessRows = 10,
    bool addIndexColumn = false, Encoding encoding = null,
    bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null)
{
    // Loading only reads the file, so request read-only access. The two-argument
    // FileStream constructor asks for read/write access, which fails on read-only files.
    // FileShare.Read matches the sharing mode the two-argument constructor used.
    using (Stream fileStream = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        // Forward every option by name to the Stream overload; numRows maps to numberOfRowsToRead.
        return LoadCsv(fileStream,
            separator: separator, header: header, columnNames: columnNames, dataTypes: dataTypes, numberOfRowsToRead: numRows,
            guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding,
            renameDuplicatedColumns: renameDuplicatedColumns, cultureInfo: cultureInfo);
    }
}

Expand Down Expand Up @@ -351,8 +354,14 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe
char separator = ',', bool header = true,
string[] columnNames = null, Type[] dataTypes = null,
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
bool renameDuplicatedColumns = false)
bool renameDuplicatedColumns = false,
CultureInfo cultureInfo = null)
{
if (cultureInfo == null)
{
cultureInfo = CultureInfo.CurrentCulture;
}

if (dataTypes == null && guessRows <= 0)
{
throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
Expand Down Expand Up @@ -452,7 +461,7 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe
}
else
{
ret.Append(fields, inPlace: true);
ret.Append(fields, inPlace: true, cultureInfo: cultureInfo);
}
++rowline;
}
Expand Down Expand Up @@ -508,7 +517,6 @@ public TextReader GetTextReader()
}

}

}

/// <summary>
Expand All @@ -522,14 +530,18 @@ public TextReader GetTextReader()
/// <param name="numberOfRowsToRead">number of rows to read not including the header(if present)</param>
/// <param name="guessRows">number of rows used to guess types</param>
/// <param name="addIndexColumn">add one column with the row index</param>
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
/// <param name="cultureInfo">The culture used to parse the values read from the CSV. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>
/// <returns><see cref="DataFrame"/></returns>
public static DataFrame LoadCsvFromString(string csvString,
    char separator = ',', bool header = true,
    string[] columnNames = null, Type[] dataTypes = null,
    long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
    bool renameDuplicatedColumns = false,
    CultureInfo cultureInfo = null)
{
    WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvString);

    // Single forwarding call: renameDuplicatedColumns and cultureInfo must reach the shared
    // reader so string input behaves the same as stream input. A null cultureInfo is
    // resolved to the current culture by ReadCsvLinesIntoDataFrame.
    return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader,
        separator: separator, header: header, columnNames: columnNames, dataTypes: dataTypes,
        numberOfRowsToRead: numberOfRowsToRead, guessRows: guessRows, addIndexColumn: addIndexColumn,
        renameDuplicatedColumns: renameDuplicatedColumns, cultureInfo: cultureInfo);
}

/// <summary>
Expand All @@ -545,12 +557,13 @@ public static DataFrame LoadCsvFromString(string csvString,
/// <param name="addIndexColumn">add one column with the row index</param>
/// <param name="encoding">The character encoding. Defaults to UTF8 if not specified</param>
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
/// <param name="cultureInfo">The culture used to parse the values read from the CSV. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>
/// <returns><see cref="DataFrame"/></returns>
public static DataFrame LoadCsv(Stream csvStream,
char separator = ',', bool header = true,
string[] columnNames = null, Type[] dataTypes = null,
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
Encoding encoding = null, bool renameDuplicatedColumns = false)
Encoding encoding = null, bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null)
{
if (!csvStream.CanSeek)
{
Expand All @@ -563,7 +576,7 @@ public static DataFrame LoadCsv(Stream csvStream,
}

WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvStream, encoding ?? Encoding.UTF8);
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns);
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns, cultureInfo);
}

/// <summary>
Expand Down
16 changes: 12 additions & 4 deletions src/Microsoft.Data.Analysis/DataFrame.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.Linq;
using System.Text;

Expand Down Expand Up @@ -484,12 +485,13 @@ private void ResizeByOneAndAppend(DataFrameColumn column, object value)
/// <remarks>If a <seealso cref="DataFrameRow"/> in <paramref name="rows"/> is null, a null value is appended to each column</remarks>
/// <param name="rows">The rows to be appended to this DataFrame </param>
/// <param name="inPlace">If set, appends <paramref name="rows"/> in place. Otherwise, a new DataFrame is returned with the <paramref name="rows"/> appended</param>
/// <param name="cultureInfo">The culture used when converting appended values to each column's data type. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>
public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false, CultureInfo cultureInfo = null)
{
    // Clone at most once up front; every row is then appended in place on that instance.
    DataFrame ret = inPlace ? this : Clone();
    foreach (DataFrameRow row in rows)
    {
        // The single-row overload performs the culture-aware conversion (and the null default).
        ret.Append(row, inPlace: true, cultureInfo: cultureInfo);
    }
    return ret;
}
Expand All @@ -501,8 +503,14 @@ public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false)
/// <remarks>If <paramref name="row"/> is null, a null value is appended to each column</remarks>
/// <param name="row"></param>
/// <param name="inPlace">If set, appends a <paramref name="row"/> in place. Otherwise, a new DataFrame is returned with an appended <paramref name="row"/> </param>
public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false)
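/// <param name="cultureInfo">The culture used when converting each value to its column's data type. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>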
public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false, CultureInfo cultureInfo = null)
{
if (cultureInfo == null)
{
cultureInfo = CultureInfo.CurrentCulture;
}

DataFrame ret = inPlace ? this : Clone();
IEnumerator<DataFrameColumn> columnEnumerator = ret.Columns.GetEnumerator();
bool columnMoveNext = columnEnumerator.MoveNext();
Expand Down Expand Up @@ -530,7 +538,7 @@ public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false)
}
if (value != null)
{
value = Convert.ChangeType(value, column.DataType);
value = Convert.ChangeType(value, column.DataType, cultureInfo);

if (value is null)
{
Expand Down
37 changes: 37 additions & 0 deletions test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
using System.Data.SQLite.EF6;
using Xunit;
using Microsoft.ML.TestFramework.Attributes;
using System.Threading;

namespace Microsoft.Data.Analysis.Tests
{
Expand Down Expand Up @@ -154,6 +155,42 @@ void ReducedRowsTest(DataFrame reducedRows)
ReducedRowsTest(csvDf);
}

[Fact]
public void TestReadCsvWithHeaderCultureInfoAndSeparator()
{
    // Semicolon-separated CSV whose numeric fields use ',' as the decimal separator.
    // The literal's continuation lines stay at column 0 so no leading whitespace leaks into the data.
    string data = @"vendor_id;rate_code;passenger_count;trip_time_in_secs;trip_distance;payment_type;fare_amount
CMT;1;1;1271;3,8;CRD;17,5
CMT;1;1;474;1,5;CRD;8
CMT;1;1;637;1,4;CRD;8,5
CMT;1;1;181;0,6;CSH;4,5";

    // Shared checks run against both the stream-based and the string-based loader.
    void RegularTest(DataFrame df)
    {
        Assert.Equal(4, df.Rows.Count);
        Assert.Equal(7, df.Columns.Count);

        Assert.Equal(3.8f, (float)df["trip_distance"][0]);
        Assert.Equal(17.5f, (float)df["fare_amount"][0]);

        Assert.Equal(1.5f, (float)df["trip_distance"][1]);
        Assert.Equal(8f, (float)df["fare_amount"][1]);

        Assert.Equal(1.4f, (float)df["trip_distance"][2]);
        Assert.Equal(8.5f, (float)df["fare_amount"][2]);

        // The last row has a value below 1 ("0,6"), so check it as well.
        Assert.Equal(0.6f, (float)df["trip_distance"][3]);
        Assert.Equal(4.5f, (float)df["fare_amount"][3]);

        VerifyColumnTypes(df);
    }

    // de-DE has ',' as decimal separator
    var cultureInfo = new CultureInfo("de-DE");

    DataFrame streamDf = DataFrame.LoadCsv(GetStream(data), separator: ';', cultureInfo: cultureInfo);
    RegularTest(streamDf);

    DataFrame stringDf = DataFrame.LoadCsvFromString(data, separator: ';', cultureInfo: cultureInfo);
    RegularTest(stringDf);
}

[Fact]
public void TestReadCsvWithHeaderAndDuplicatedColumns_WithoutRenaming()
{
Expand Down