/// <summary>
/// Writes the CSV header record to <paramref name="csvFile"/>: the column names in order,
/// separated by <paramref name="separator"/>, followed by a line terminator.
/// </summary>
/// <remarks>
/// A name is enclosed in double quotes when it contains the separator, a double quote,
/// a carriage return or a line feed. Double quotes inside a quoted name are doubled
/// (<c>"</c> becomes <c>""</c>) as described by RFC 4180, so the record stays well-formed.
/// A <see langword="null"/> name is written as an empty field.
/// NOTE(review): confirm that the CSV reader used by LoadCsv un-escapes doubled quotes,
/// so that a written header round-trips.
/// </remarks>
/// <param name="csvFile">Writer that receives the header record.</param>
/// <param name="columnNames">Column names to write, in output order.</param>
/// <param name="separator">Character written between adjacent column names.</param>
private static void WriteHeader(StreamWriter csvFile, IReadOnlyList<string> columnNames, char separator)
{
    const char Quote = '"';

    // Any of these characters inside a field would break the record unless the field is quoted.
    char[] charsRequiringQuotes = { separator, Quote, '\r', '\n' };

    for (int i = 0; i < columnNames.Count; i++)
    {
        if (i > 0)
        {
            csvFile.Write(separator);
        }

        string name = columnNames[i] ?? string.Empty;
        if (name.IndexOfAny(charsRequiringQuotes) == -1)
        {
            csvFile.Write(name);
            continue;
        }

        csvFile.Write(Quote);
        // Escape embedded quotes by doubling them; otherwise the first one would end the field early.
        csvFile.Write(name.Replace("\"", "\"\""));
        csvFile.Write(Quote);
    }

    csvFile.WriteLine();
}
/// <summary>
/// Bundles the expected shape and contents of a <see cref="DataFrame"/> loaded from CSV
/// so that a single theory data row can carry both the input and its verification.
/// </summary>
public readonly struct LoadCsvVerifyingHelper
{
    private readonly int _columnCount;
    private readonly long _rowCount;
    private readonly string[] _columnNames;
    private readonly Type[] _columnTypes;
    private readonly object[][] _cells;

    /// <summary>
    /// Captures the expected column count, row count, column names, column types and cell values.
    /// </summary>
    /// <param name="columnCount">Expected number of columns.</param>
    /// <param name="rowCount">Expected number of rows.</param>
    /// <param name="columnNames">Expected column names, by column index.</param>
    /// <param name="columnTypes">Expected column data types, by column index.</param>
    /// <param name="cells">Expected cell values, indexed by row and then by column.</param>
    public LoadCsvVerifyingHelper(int columnCount, long rowCount, string[] columnNames, Type[] columnTypes, object[][] cells)
    {
        _columnCount = columnCount;
        _rowCount = rowCount;
        _columnNames = columnNames;
        _columnTypes = columnTypes;
        _cells = cells;
    }

    /// <summary>
    /// Asserts that <paramref name="df"/> has the expected dimensions, column names,
    /// column types and row contents.
    /// </summary>
    /// <param name="df">The data frame to check.</param>
    public void VerifyLoadCsv(DataFrame df)
    {
        Assert.Equal(_rowCount, df.Rows.Count);
        Assert.Equal(_columnCount, df.Columns.Count);

        for (int j = 0; j < _columnCount; j++)
        {
            // Assert.Equal reports expected and actual on failure, unlike Assert.True(a == b).
            Assert.Equal(_columnTypes[j], df.Columns[j].DataType);
            Assert.Equal(_columnNames[j], df.Columns[j].Name);
        }

        // VerifyColumnTypes is defined elsewhere in this test class.
        VerifyColumnTypes(df);

        for (int i = 0; i < _rowCount; i++)
        {
            Assert.Equal(_cells[i], df.Rows[i]);
        }
    }
}

/// <summary>
/// Theory data for CSV content that uses double-quote text qualifiers. Each entry is
/// the CSV text, the separator, the column types passed to the loader, and a
/// <see cref="LoadCsvVerifyingHelper"/> describing the expected result.
/// </summary>
public static IEnumerable<object[]> CsvWithTextQualifiers_TestData()
{
    yield return new object[] // Comma Separators in Data
    {
        @"Name,Age,Description
Paul,34,""Paul lives in Vermont, VA.""
Victor,29,""Victor: Funny guy""
Maria,31,",
        ',',
        new Type[] { typeof(string), typeof(int), typeof(string) },
        new LoadCsvVerifyingHelper(
            3,
            3,
            new string[] { "Name", "Age", "Description" },
            new Type[] { typeof(string), typeof(int), typeof(string) },
            new object[][]
            {
                new object[] { "Paul", 34, "Paul lives in Vermont, VA." },
                new object[] { "Victor", 29, "Victor: Funny guy" },
                new object[] { "Maria", 31, "" }
            }
        )
    };
    yield return new object[] // Colon Separators in Data
    {
        @"Name:Age:Description
Paul:34:""Paul lives in Vermont, VA.""
Victor:29:""Victor: Funny guy""
Maria:31:",
        ':',
        new Type[] { typeof(string), typeof(int), typeof(string) },
        new LoadCsvVerifyingHelper(
            3,
            3,
            new string[] { "Name", "Age", "Description" },
            new Type[] { typeof(string), typeof(int), typeof(string) },
            new object[][]
            {
                new object[] { "Paul", 34, "Paul lives in Vermont, VA." },
                new object[] { "Victor", 29, "Victor: Funny guy" },
                new object[] { "Maria", 31, "" }
            }
        )
    };
    yield return new object[] // Comma Separators in Header
    {
        @"""Na,me"",Age,Description
Paul,34,""Paul lives in Vermont, VA.""
Victor,29,""Victor: Funny guy""
Maria,31,",
        ',',
        new Type[] { typeof(string), typeof(int), typeof(string) },
        new LoadCsvVerifyingHelper(
            3,
            3,
            new string[] { "Na,me", "Age", "Description" },
            new Type[] { typeof(string), typeof(int), typeof(string) },
            new object[][]
            {
                new object[] { "Paul", 34, "Paul lives in Vermont, VA." },
                new object[] { "Victor", 29, "Victor: Funny guy" },
                new object[] { "Maria", 31, "" }
            }
        )
    };
    yield return new object[] // Newlines In Data
    {
        @"Name,Age,Description
Paul,34,""Paul lives in Vermont
VA.""
Victor,29,""Victor: Funny guy""
Maria,31,",
        ',',
        new Type[] { typeof(string), typeof(int), typeof(string) },
        new LoadCsvVerifyingHelper(
            3,
            3,
            new string[] { "Name", "Age", "Description" },
            new Type[] { typeof(string), typeof(int), typeof(string) },
            new object[][]
            {
                new object[] { "Paul", 34, @"Paul lives in Vermont
VA." },
                new object[] { "Victor", 29, "Victor: Funny guy" },
                new object[] { "Maria", 31, "" }
            }
        )
    };
    yield return new object[] // Newlines In Header
    {
        @"""Na
me"":Age:Description
Paul:34:""Paul lives in Vermont, VA.""
Victor:29:""Victor: Funny guy""
Maria:31:",
        ':',
        new Type[] { typeof(string), typeof(int), typeof(string) },
        new LoadCsvVerifyingHelper(
            3,
            3,
            new string[] { @"Na
me", "Age", "Description" },
            new Type[] { typeof(string), typeof(int), typeof(string) },
            new object[][]
            {
                new object[] { "Paul", 34, "Paul lives in Vermont, VA." },
                new object[] { "Victor", 29, "Victor: Funny guy" },
                new object[] { "Maria", 31, "" }
            }
        )
    };
}

/// <summary>
/// Loads each qualified-text CSV sample from a stream and verifies the resulting data frame.
/// </summary>
[Theory]
[MemberData(nameof(CsvWithTextQualifiers_TestData))]
public void TestLoadCsvWithTextQualifiersFromStream(string data, char separator, Type[] dataTypes, LoadCsvVerifyingHelper helper)
{
    DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: dataTypes, separator: separator);

    helper.VerifyLoadCsv(df);
}

/// <summary>
/// Loads each qualified-text CSV sample directly from its string and verifies the resulting data frame.
/// </summary>
[Theory]
[MemberData(nameof(CsvWithTextQualifiers_TestData))]
public void TestLoadCsvWithTextQualifiersFromString(string data, char separator, Type[] dataTypes, LoadCsvVerifyingHelper helper)
{
    DataFrame df = DataFrame.LoadCsvFromString(data, dataTypes: dataTypes, separator: separator);

    helper.VerifyLoadCsv(df);
}

/// <summary>
/// Round-trips each sample: load it, write it back with <c>WriteCsv</c>, reload the
/// written bytes, and verify the reloaded data frame against the same expectations.
/// </summary>
[Theory]
[MemberData(nameof(CsvWithTextQualifiers_TestData))]
public void TestWriteCsvWithTextQualifiers(string data, char separator, Type[] dataTypes, LoadCsvVerifyingHelper helper)
{
    DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: dataTypes, separator: separator);

    using MemoryStream csvStream = new MemoryStream();
    DataFrame.WriteCsv(df, csvStream, separator: separator);

    // WriteCsv is verified by reading its output back into a DataFrame and checking
    // the contents, which shows that no information was lost on the way out.
    csvStream.Seek(0, SeekOrigin.Begin);
    DataFrame df2 = DataFrame.LoadCsv(csvStream, dataTypes: dataTypes, separator: separator);
    helper.VerifyLoadCsv(df2);
}