Skip to content

Commit 09bbb19

Browse files
dakersnar and eerhardt authored
Fix problems with DataFrame WriteCsv (#6303)
* Add DataFrame.IO tests with separators in data
* Add test where comma is in header
* Add two versions of test cases, likely going to use the helper version
* Fix separators in data
* Fix separators in header
* Clean up tests
* Fix issue with not wrapping output with newlines in quotations
* Accidental commit
* Clean up
* Clean up mini test framework a bit
* Apply suggestions from code review

Co-authored-by: Eric Erhardt <[email protected]>

* Apply suggestions from code review
* Apply suggestions from code review
* Write to StreamWriter directly

Co-authored-by: Eric Erhardt <[email protected]>
1 parent 50e5068 commit 09bbb19

File tree

2 files changed

+230
-7
lines changed

2 files changed

+230
-7
lines changed

src/Microsoft.Data.Analysis/DataFrame.IO.cs

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -450,28 +450,25 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream,
450450
{
451451
if (dataFrame != null)
452452
{
453-
var columnNames = dataFrame.Columns.GetColumnNames();
454-
455453
if (header)
456454
{
457-
var headerColumns = string.Join(separator.ToString(), columnNames);
458-
csvFile.WriteLine(headerColumns);
455+
WriteHeader(csvFile, dataFrame.Columns.GetColumnNames(), separator);
459456
}
460457

461458
var record = new StringBuilder();
462459

463460
foreach (var row in dataFrame.Rows)
464461
{
465-
bool firstRow = true;
462+
bool firstCell = true;
466463
foreach (var cell in row)
467464
{
468-
if (!firstRow)
465+
if (!firstCell)
469466
{
470467
record.Append(separator);
471468
}
472469
else
473470
{
474-
firstRow = false;
471+
firstCell = false;
475472
}
476473

477474
Type t = cell?.GetType();
@@ -500,6 +497,18 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream,
500497
continue;
501498
}
502499

500+
if (t == typeof(string))
501+
{
502+
bool needsQuotes = ((string)cell).IndexOf(separator) != -1 || ((string)cell).IndexOf('\n') != -1;
503+
if (needsQuotes)
504+
{
505+
record.Append('\"');
506+
record.Append(cell);
507+
record.Append('\"');
508+
continue;
509+
}
510+
}
511+
503512
record.Append(cell);
504513
}
505514

@@ -510,5 +519,34 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream,
510519
}
511520
}
512521
}
522+
/// <summary>
/// Writes the CSV header row: every column name, separated by <paramref name="separator"/>,
/// followed by a line terminator.
/// </summary>
/// <param name="csvFile">Writer that receives the header row.</param>
/// <param name="columnNames">Column names, written in enumeration order.</param>
/// <param name="separator">Character placed between adjacent column names.</param>
/// <remarks>
/// A name is wrapped in double quotes when it contains the separator, a double quote, or a
/// line break (CR or LF). Inside a quoted name every double quote is doubled, following the
/// RFC 4180 convention, so the quoted field cannot be terminated early by its own content.
/// </remarks>
private static void WriteHeader(StreamWriter csvFile, IReadOnlyList<string> columnNames, char separator)
{
    bool firstColumn = true;
    foreach (string name in columnNames)
    {
        if (!firstColumn)
        {
            csvFile.Write(separator);
        }
        else
        {
            firstColumn = false;
        }

        // Any of these characters would be misread as field or record structure if left bare.
        bool needsQuotes = name.IndexOf(separator) != -1
            || name.IndexOf('"') != -1
            || name.IndexOf('\n') != -1
            || name.IndexOf('\r') != -1;

        if (needsQuotes)
        {
            csvFile.Write('"');
            // Double embedded quotes so they are not taken for the closing quote.
            csvFile.Write(name.Replace("\"", "\"\""));
            csvFile.Write('"');
        }
        else
        {
            csvFile.Write(name);
        }
    }

    csvFile.WriteLine();
}
513551
}
514552
}

test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// See the LICENSE file in the project root for more information.
44

55
using System;
6+
using System.Collections.Generic;
67
using System.Globalization;
78
using System.IO;
89
using System.Linq;
@@ -1019,5 +1020,189 @@ public void TestMixedDataTypesInCsv()
10191020
Assert.Equal("", emptyColumn[i]);
10201021
}
10211022
}
1023+
1024+
/// <summary>
/// Bundles the expected shape and contents of a <see cref="DataFrame"/> so that one theory
/// data row can carry both a CSV input and the result it should load to.
/// </summary>
public readonly struct LoadCsvVerifyingHelper
{
    private readonly int _columnCount;
    private readonly long _rowCount;
    private readonly string[] _columnNames;
    private readonly Type[] _columnTypes;
    private readonly object[][] _cells;

    /// <summary>
    /// Captures the expectations that <see cref="VerifyLoadCsv"/> later checks.
    /// </summary>
    /// <param name="columnCount">Expected number of columns.</param>
    /// <param name="rowCount">Expected number of rows.</param>
    /// <param name="columnNames">Expected column names, indexed by column position.</param>
    /// <param name="columnTypes">Expected column data types, indexed by column position.</param>
    /// <param name="cells">Expected cell values, one inner array per row.</param>
    public LoadCsvVerifyingHelper(int columnCount, long rowCount, string[] columnNames, Type[] columnTypes, object[][] cells)
    {
        _columnCount = columnCount;
        _rowCount = rowCount;
        _columnNames = columnNames;
        _columnTypes = columnTypes;
        _cells = cells;
    }

    /// <summary>
    /// Asserts that <paramref name="df"/> has the expected row and column counts, column
    /// types and names, and row contents.
    /// </summary>
    /// <param name="df">The data frame to check.</param>
    public void VerifyLoadCsv(DataFrame df)
    {
        Assert.Equal(_rowCount, df.Rows.Count);
        Assert.Equal(_columnCount, df.Columns.Count);

        for (int j = 0; j < _columnCount; j++)
        {
            // Assert.Equal (rather than Assert.True on ==) reports expected vs. actual on failure.
            Assert.Equal(_columnTypes[j], df.Columns[j].DataType);
            Assert.Equal(_columnNames[j], df.Columns[j].Name);
        }

        // Additional column-type check implemented outside this struct.
        VerifyColumnTypes(df);

        for (int i = 0; i < _rowCount; i++)
        {
            Assert.Equal(_cells[i], df.Rows[i]);
        }
    }
}
1062+
1063+
/// <summary>
/// Theory data for the text-qualifier tests. Each row is
/// { CSV text, separator, column types to load with, expected-result helper }.
/// </summary>
public static IEnumerable<object[]> CsvWithTextQualifiers_TestData()
{
    // Every case shares the same three-column schema; a fresh array is built per use.
    Type[] Schema() => new Type[] { typeof(string), typeof(int), typeof(string) };

    // The cases differ only in the CSV text, the separator, the first column's name and
    // Paul's description, so those are the only knobs.
    object[] BuildCase(string csv, char separator, string firstColumnName, string paulDescription)
    {
        var expected = new LoadCsvVerifyingHelper(
            3,
            3,
            new string[] { firstColumnName, "Age", "Description" },
            Schema(),
            new object[][]
            {
                new object[] { "Paul", 34, paulDescription },
                new object[] { "Victor", 29, "Victor: Funny guy" },
                new object[] { "Maria", 31, "" }
            });

        return new object[] { csv, separator, Schema(), expected };
    }

    // Comma Separators in Data
    yield return BuildCase(
        @"Name,Age,Description
Paul,34,""Paul lives in Vermont, VA.""
Victor,29,""Victor: Funny guy""
Maria,31,",
        ',',
        "Name",
        "Paul lives in Vermont, VA.");

    // Colon Separators in Data
    yield return BuildCase(
        @"Name:Age:Description
Paul:34:""Paul lives in Vermont, VA.""
Victor:29:""Victor: Funny guy""
Maria:31:",
        ':',
        "Name",
        "Paul lives in Vermont, VA.");

    // Comma Separators in Header
    yield return BuildCase(
        @"""Na,me"",Age,Description
Paul,34,""Paul lives in Vermont, VA.""
Victor,29,""Victor: Funny guy""
Maria,31,",
        ',',
        "Na,me",
        "Paul lives in Vermont, VA.");

    // Newlines In Data
    yield return BuildCase(
        @"Name,Age,Description
Paul,34,""Paul lives in Vermont
VA.""
Victor,29,""Victor: Funny guy""
Maria,31,",
        ',',
        "Name",
        @"Paul lives in Vermont
VA.");

    // Newlines In Header
    yield return BuildCase(
        @"""Na
me"":Age:Description
Paul:34:""Paul lives in Vermont, VA.""
Victor:29:""Victor: Funny guy""
Maria:31:",
        ':',
        @"Na
me",
        "Paul lives in Vermont, VA.");
}
1175+
1176+
/// <summary>
/// Loads each text-qualifier case from a stream and checks the resulting data frame
/// against the case's expectations.
/// </summary>
[Theory]
[MemberData(nameof(CsvWithTextQualifiers_TestData))]
public void TestLoadCsvWithTextQualifiersFromStream(string data, char separator, Type[] dataTypes, LoadCsvVerifyingHelper helper)
{
    var input = GetStream(data);

    DataFrame loaded = DataFrame.LoadCsv(input, dataTypes: dataTypes, separator: separator);

    helper.VerifyLoadCsv(loaded);
}
1183+
1184+
/// <summary>
/// Loads each text-qualifier case directly from its CSV string and checks the resulting
/// data frame against the case's expectations.
/// </summary>
[Theory]
[MemberData(nameof(CsvWithTextQualifiers_TestData))]
public void TestLoadCsvWithTextQualifiersFromString(string data, char separator, Type[] dataTypes, LoadCsvVerifyingHelper helper)
{
    DataFrame loaded = DataFrame.LoadCsvFromString(
        data,
        dataTypes: dataTypes,
        separator: separator);

    helper.VerifyLoadCsv(loaded);
}
1191+
1192+
/// <summary>
/// Round-trips each text-qualifier case: load the CSV, write it back out with WriteCsv,
/// reload what was written, and check that the reloaded frame still matches the expectations.
/// </summary>
[Theory]
[MemberData(nameof(CsvWithTextQualifiers_TestData))]
public void TestWriteCsvWithTextQualifiers(string data, char separator, Type[] dataTypes, LoadCsvVerifyingHelper helper)
{
    DataFrame original = DataFrame.LoadCsv(GetStream(data), dataTypes: dataTypes, separator: separator);

    using (MemoryStream written = new MemoryStream())
    {
        DataFrame.WriteCsv(original, written, separator: separator);

        // WriteCsv is verified indirectly: rewind, read the output back into a DataFrame,
        // and confirm nothing was lost along the way.
        written.Position = 0;
        DataFrame roundTripped = DataFrame.LoadCsv(written, dataTypes: dataTypes, separator: separator);

        helper.VerifyLoadCsv(roundTripped);
    }
}
10221207
}
10231208
}

0 commit comments

Comments
 (0)