From f74d1ab174f9a1c4421543d25a6709f45d1433b1 Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Tue, 23 Aug 2022 16:16:30 -0500 Subject: [PATCH 01/14] Add DataFrame.IO tests with separators in data --- .../DataFrame.IOTests.cs | 136 ++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index cfe996b589..014474c2ad 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -465,6 +465,142 @@ void Verify(DataFrame df, bool verifyDataTypes) Verify(df, false); } + [Fact] + public void TestReadCsvWithCommaSeparatorsInData() + { + string data = @"Name,Age,Description +Paul,34,""Paul lives in Vermont, VA."" +Victor,29,""Victor: Funny guy"" +Maria,31,"; + + void Verify(DataFrame df) + { + Assert.Equal(3, df.Rows.Count); + Assert.Equal(3, df.Columns.Count); + + Assert.True(typeof(string) == df.Columns[0].DataType); + Assert.True(typeof(int) == df.Columns[1].DataType); + Assert.True(typeof(string) == df.Columns[2].DataType); + + + Assert.Equal("Name", df.Columns[0].Name); + Assert.Equal("Age", df.Columns[1].Name); + Assert.Equal("Description", df.Columns[2].Name); + VerifyColumnTypes(df); + + var paulRow = df.Rows[0]; + Assert.Equal("Paul", paulRow[0]); + Assert.Equal(34, paulRow[1]); + Assert.Equal("Paul lives in Vermont, VA.", paulRow[2]); + + var victorRow = df.Rows[1]; + Assert.Equal("Victor", victorRow[0]); + Assert.Equal(29, victorRow[1]); + Assert.Equal("Victor: Funny guy", victorRow[2]); + + var mariaRow = df.Rows[2]; + Assert.Equal("Maria", mariaRow[0]); + Assert.Equal(31, mariaRow[1]); + Assert.Equal("", mariaRow[2]); + } + + DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); + Verify(df); + df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); + Verify(df); + } + + [Fact] + public void TestReadCsvWithColonSeparatorsInData() + { + string data = @"Name:Age:Description +Paul:34:""Paul lives in Vermont, VA."" +Victor:29:""Victor: Funny guy"" +Maria:31:"; + void Verify(DataFrame df) + { + Assert.Equal(3, df.Rows.Count); + Assert.Equal(3, df.Columns.Count); + + Assert.True(typeof(string) == df.Columns[0].DataType); + Assert.True(typeof(int) == df.Columns[1].DataType); + Assert.True(typeof(string) == df.Columns[2].DataType); + + + Assert.Equal("Name", df.Columns[0].Name); + Assert.Equal("Age", df.Columns[1].Name); + Assert.Equal("Description", df.Columns[2].Name); + VerifyColumnTypes(df); + + var paulRow = df.Rows[0]; + Assert.Equal("Paul", paulRow[0]); + Assert.Equal(34, paulRow[1]); + Assert.Equal("Paul lives in Vermont, VA.", paulRow[2]); + + var victorRow = df.Rows[1]; + Assert.Equal("Victor", victorRow[0]); + Assert.Equal(29, victorRow[1]); + Assert.Equal("Victor: Funny guy", victorRow[2]); + + var mariaRow = df.Rows[2]; + Assert.Equal("Maria", mariaRow[0]); + Assert.Equal(31, mariaRow[1]); + Assert.Equal("", mariaRow[2]); + } + + DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: ':'); + Verify(df); + df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: ':'); + Verify(df); + } + + [Fact] + public void TestReadCsvWithNewlinesInData() + { + string data = @"Name,Age,Description +Paul,34,""Paul lives in Vermont +VA."" +Victor,29,""Victor: Funny guy"" +Maria,31,"; + + void Verify(DataFrame df) + { + Assert.Equal(3, df.Rows.Count); + Assert.Equal(3, df.Columns.Count); + + Assert.True(typeof(string) == df.Columns[0].DataType); + Assert.True(typeof(int) == df.Columns[1].DataType); + Assert.True(typeof(string) == df.Columns[2].DataType); + + + Assert.Equal("Name", df.Columns[0].Name); + Assert.Equal("Age", df.Columns[1].Name); + Assert.Equal("Description", df.Columns[2].Name); + VerifyColumnTypes(df); + + var paulRow = df.Rows[0]; + Assert.Equal("Paul", paulRow[0]); + Assert.Equal(34, paulRow[1]); + Assert.Equal(@"Paul lives in Vermont +VA.", paulRow[2]); + + var victorRow = df.Rows[1]; + Assert.Equal("Victor", victorRow[0]); + Assert.Equal(29, victorRow[1]); + Assert.Equal("Victor: Funny guy", victorRow[2]); + + var mariaRow = df.Rows[2]; + Assert.Equal("Maria", mariaRow[0]); + Assert.Equal(31, mariaRow[1]); + Assert.Equal("", mariaRow[2]); + } + + DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); + Verify(df); + df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); + Verify(df); + } + [Fact] public void TestReadCsvWithPipeSeparator() { From d72df26d32a8d5884604ab35520d6c5ffe09731f Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Tue, 23 Aug 2022 17:08:13 -0500 Subject: [PATCH 02/14] Add test where comma is in header --- .../DataFrame.IOTests.cs | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index 014474c2ad..9c78c947e0 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -554,6 +554,51 @@ void Verify(DataFrame df) Verify(df); } + [Fact] + public void TestReadCsvWithCommaSeparatorsInHeaderData() + { + string data = @"""Na,me"",Age,Description +Paul,34,""Paul lives in Vermont, VA."" +Victor,29,""Victor: Funny guy"" +Maria,31,"; + + void Verify(DataFrame df) + { + Assert.Equal(3, df.Rows.Count); + Assert.Equal(3, df.Columns.Count); + + Assert.True(typeof(string) == df.Columns[0].DataType); + Assert.True(typeof(int) == df.Columns[1].DataType); + Assert.True(typeof(string) == df.Columns[2].DataType); + + + Assert.Equal("Na,me", df.Columns[0].Name); + Assert.Equal("Age", df.Columns[1].Name); + Assert.Equal("Description", df.Columns[2].Name); + VerifyColumnTypes(df); + + var paulRow = df.Rows[0]; + Assert.Equal("Paul", paulRow[0]); + Assert.Equal(34, paulRow[1]); + Assert.Equal("Paul lives in Vermont, VA.", paulRow[2]); + + var victorRow = df.Rows[1]; + Assert.Equal("Victor", victorRow[0]); + Assert.Equal(29, victorRow[1]); + Assert.Equal("Victor: Funny guy", victorRow[2]); + + var mariaRow = df.Rows[2]; + Assert.Equal("Maria", mariaRow[0]); + Assert.Equal(31, mariaRow[1]); + Assert.Equal("", mariaRow[2]); + } + + DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); + Verify(df); + df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); + Verify(df); + } + [Fact] public void TestReadCsvWithNewlinesInData() { From e9e4254bd22484a2b25240c7c41e926bf34892fc Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Wed, 24 Aug 2022 18:03:55 -0500 Subject: [PATCH 03/14] Add two versions of test cases, likely going to use the helper version --- .../DataFrame.IOTests.cs | 456 +++++++++++------- 1 file changed, 275 insertions(+), 181 deletions(-) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index 9c78c947e0..18937548fb 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -3,6 +3,8 @@ // See the LICENSE file in the project root for more information. using System; +using System.CodeDom; +using System.Collections.Generic; using System.Globalization; using System.IO; using System.Linq; @@ -465,187 +467,6 @@ void Verify(DataFrame df, bool verifyDataTypes) Verify(df, false); } - [Fact] - public void TestReadCsvWithCommaSeparatorsInData() - { - string data = @"Name,Age,Description -Paul,34,""Paul lives in Vermont, VA."" -Victor,29,""Victor: Funny guy"" -Maria,31,"; - - void Verify(DataFrame df) - { - Assert.Equal(3, df.Rows.Count); - Assert.Equal(3, df.Columns.Count); - - Assert.True(typeof(string) == df.Columns[0].DataType); - Assert.True(typeof(int) == df.Columns[1].DataType); - Assert.True(typeof(string) == df.Columns[2].DataType); - - - Assert.Equal("Name", df.Columns[0].Name); - Assert.Equal("Age", df.Columns[1].Name); - Assert.Equal("Description", df.Columns[2].Name); - VerifyColumnTypes(df); - - var paulRow = df.Rows[0]; - Assert.Equal("Paul", paulRow[0]); - Assert.Equal(34, paulRow[1]); - Assert.Equal("Paul lives in Vermont, VA.", paulRow[2]); - - var victorRow = df.Rows[1]; - Assert.Equal("Victor", victorRow[0]); - Assert.Equal(29, victorRow[1]); - Assert.Equal("Victor: Funny guy", victorRow[2]); - - var mariaRow = df.Rows[2]; - Assert.Equal("Maria", mariaRow[0]); - Assert.Equal(31, mariaRow[1]); - Assert.Equal("", mariaRow[2]); - } - - DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); - Verify(df); - df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); - Verify(df); - } - - [Fact] - public void TestReadCsvWithColonSeparatorsInData() - { - string data = @"Name:Age:Description -Paul:34:""Paul lives in Vermont, VA."" -Victor:29:""Victor: Funny guy"" -Maria:31:"; - void Verify(DataFrame df) - { - Assert.Equal(3, df.Rows.Count); - Assert.Equal(3, df.Columns.Count); - - Assert.True(typeof(string) == df.Columns[0].DataType); - Assert.True(typeof(int) == df.Columns[1].DataType); - Assert.True(typeof(string) == df.Columns[2].DataType); - - - Assert.Equal("Name", df.Columns[0].Name); - Assert.Equal("Age", df.Columns[1].Name); - Assert.Equal("Description", df.Columns[2].Name); - VerifyColumnTypes(df); - - var paulRow = df.Rows[0]; - Assert.Equal("Paul", paulRow[0]); - Assert.Equal(34, paulRow[1]); - Assert.Equal("Paul lives in Vermont, VA.", paulRow[2]); - - var victorRow = df.Rows[1]; - Assert.Equal("Victor", victorRow[0]); - Assert.Equal(29, victorRow[1]); - Assert.Equal("Victor: Funny guy", victorRow[2]); - - var mariaRow = df.Rows[2]; - Assert.Equal("Maria", mariaRow[0]); - Assert.Equal(31, mariaRow[1]); - Assert.Equal("", mariaRow[2]); - } - - DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: ':'); - Verify(df); - df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: ':'); - Verify(df); - } - - [Fact] - public void TestReadCsvWithCommaSeparatorsInHeaderData() - { - string data = @"""Na,me"",Age,Description -Paul,34,""Paul lives in Vermont, VA."" -Victor,29,""Victor: Funny guy"" -Maria,31,"; - - void Verify(DataFrame df) - { - Assert.Equal(3, df.Rows.Count); - Assert.Equal(3, df.Columns.Count); - - Assert.True(typeof(string) == df.Columns[0].DataType); - Assert.True(typeof(int) == df.Columns[1].DataType); - Assert.True(typeof(string) == df.Columns[2].DataType); - - - Assert.Equal("Na,me", df.Columns[0].Name); - Assert.Equal("Age", df.Columns[1].Name); - Assert.Equal("Description", df.Columns[2].Name); - VerifyColumnTypes(df); - - var paulRow = df.Rows[0]; - Assert.Equal("Paul", paulRow[0]); - Assert.Equal(34, paulRow[1]); - Assert.Equal("Paul lives in Vermont, VA.", paulRow[2]); - - var victorRow = df.Rows[1]; - Assert.Equal("Victor", victorRow[0]); - Assert.Equal(29, victorRow[1]); - Assert.Equal("Victor: Funny guy", victorRow[2]); - - var mariaRow = df.Rows[2]; - Assert.Equal("Maria", mariaRow[0]); - Assert.Equal(31, mariaRow[1]); - Assert.Equal("", mariaRow[2]); - } - - DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); - Verify(df); - df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); - Verify(df); - } - - [Fact] - public void TestReadCsvWithNewlinesInData() - { - string data = @"Name,Age,Description -Paul,34,""Paul lives in Vermont -VA."" -Victor,29,""Victor: Funny guy"" -Maria,31,"; - - void Verify(DataFrame df) - { - Assert.Equal(3, df.Rows.Count); - Assert.Equal(3, df.Columns.Count); - - Assert.True(typeof(string) == df.Columns[0].DataType); - Assert.True(typeof(int) == df.Columns[1].DataType); - Assert.True(typeof(string) == df.Columns[2].DataType); - - - Assert.Equal("Name", df.Columns[0].Name); - Assert.Equal("Age", df.Columns[1].Name); - Assert.Equal("Description", df.Columns[2].Name); - VerifyColumnTypes(df); - - var paulRow = df.Rows[0]; - Assert.Equal("Paul", paulRow[0]); - Assert.Equal(34, paulRow[1]); - Assert.Equal(@"Paul lives in Vermont -VA.", paulRow[2]); - - var victorRow = df.Rows[1]; - Assert.Equal("Victor", victorRow[0]); - Assert.Equal(29, victorRow[1]); - Assert.Equal("Victor: Funny guy", victorRow[2]); - - var mariaRow = df.Rows[2]; - Assert.Equal("Maria", mariaRow[0]); - Assert.Equal(31, mariaRow[1]); - Assert.Equal("", mariaRow[2]); - } - - DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); - Verify(df); - df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); - Verify(df); - } - [Fact] public void TestReadCsvWithPipeSeparator() { @@ -1200,5 +1021,278 @@ public void TestMixedDataTypesInCsv() Assert.Equal("", emptyColumn[i]); } } + + public struct LoadCsvVerifyingHelper + { + int _columnCount; + long _rowCount; + string[] _columnNames; + Type[] _columnTypes; + object[][] _cells; + + public LoadCsvVerifyingHelper(int columnCount, long rowCount, string[] columnNames, Type[] columnTypes, object[][] cells) + { + _columnCount = columnCount; + _rowCount = rowCount; + _columnNames = columnNames; + _columnTypes = columnTypes; + _cells = cells; + + } + + public void VerifyLoadCsv(DataFrame df) + { + Assert.Equal(_rowCount, df.Rows.Count); + Assert.Equal(_columnCount, df.Columns.Count); + + for (int j = 0; j < _columnCount; j++) + { + Assert.True(_columnTypes[j] == df.Columns[j].DataType); + Assert.Equal(_columnNames[j], df.Columns[j].Name); + + } + + VerifyColumnTypes(df); + + for (int i = 0; i < _rowCount; i++) + { + for (int j = 0; j < _columnCount; j++) + { + Assert.Equal(_cells[i][j], df.Rows[i][j]); + } + } + } + + } + + public static IEnumerable LoadCsv_TestData() + { + yield return new object[] + { + @"Name,Age,Description +Paul,34,""Paul lives in Vermont, VA."" +Victor,29,""Victor: Funny guy"" +Maria,31,", + ',', + new LoadCsvVerifyingHelper( + 3, + 3, + new string[] { "Name", "Age", "Description" }, + new Type[] { typeof(string), typeof(int), typeof(string) }, + new object[][] + { + new object[] { "Paul", 34, "Paul lives in Vermont, VA." }, + new object[] { "Victor", 29, "Victor: Funny guy" }, + new object[] { "Maria", 31, "" } + } + ) + }; + yield return new object[] + { + @"Name:Age:Description +Paul:34:""Paul lives in Vermont, VA."" +Victor:29:""Victor: Funny guy"" +Maria:31:", + ':', + new LoadCsvVerifyingHelper( + 3, + 3, + new string[] { "Name", "Age", "Description" }, + new Type[] { typeof(string), typeof(int), typeof(string) }, + new object[][] + { + new object[] { "Paul", 34, "Paul lives in Vermont, VA." }, + new object[] { "Victor", 29, "Victor: Funny guy" }, + new object[] { "Maria", 31, "" } + } + ) + }; + } + + [Theory] + [MemberData(nameof(LoadCsv_TestData))] + public void TestReadWriteCsvWithCommaSeparatorsInData(string data, char separator, LoadCsvVerifyingHelper helper) + { + // Read data to a DataFrame in two ways and verify correctness + DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: separator); + helper.VerifyLoadCsv(df); + df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: separator); + helper.VerifyLoadCsv(df); + + // Write DataFrame to a MemoryStream + using MemoryStream csvStream = new MemoryStream(); + DataFrame.WriteCsv(df, csvStream, separator: separator); + + // Read MemoryStream back to DataFrame and verify correctness + csvStream.Seek(0, SeekOrigin.Begin); + DataFrame df2 = DataFrame.LoadCsv(csvStream, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: separator); + helper.VerifyLoadCsv(df2); + } + + [Fact] + public void TestReadWriteCsvWithColonSeparatorsInData() + { + string data = @"Name:Age:Description +Paul:34:""Paul lives in Vermont, VA."" +Victor:29:""Victor: Funny guy"" +Maria:31:"; + void Verify(DataFrame df) + { + Assert.Equal(3, df.Rows.Count); + Assert.Equal(3, df.Columns.Count); + + Assert.True(typeof(string) == df.Columns[0].DataType); + Assert.True(typeof(int) == df.Columns[1].DataType); + Assert.True(typeof(string) == df.Columns[2].DataType); + + + Assert.Equal("Name", df.Columns[0].Name); + Assert.Equal("Age", df.Columns[1].Name); + Assert.Equal("Description", df.Columns[2].Name); + VerifyColumnTypes(df); + + var paulRow = df.Rows[0]; + Assert.Equal("Paul", paulRow[0]); + Assert.Equal(34, paulRow[1]); + Assert.Equal("Paul lives in Vermont, VA.", paulRow[2]); + + var victorRow = df.Rows[1]; + Assert.Equal("Victor", victorRow[0]); + Assert.Equal(29, victorRow[1]); + Assert.Equal("Victor: Funny guy", victorRow[2]); + + var mariaRow = df.Rows[2]; + Assert.Equal("Maria", mariaRow[0]); + Assert.Equal(31, mariaRow[1]); + Assert.Equal("", mariaRow[2]); + } + + // Read data to a DataFrame in two ways and verify correctness + DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: ':'); + Verify(df); + df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: ':'); + Verify(df); + + // Write DataFrame to a MemoryStream + using MemoryStream csvStream = new MemoryStream(); + DataFrame.WriteCsv(df, csvStream, separator: ':'); + + // Read MemoryStream back to DataFrame and verify correctness + csvStream.Seek(0, SeekOrigin.Begin); + DataFrame df2 = DataFrame.LoadCsv(csvStream, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: ':'); + Verify(df2); + } + + [Fact] + public void TestReadWriteCsvWithCommaSeparatorsInHeaderData() + { + string data = @"""Na,me"",Age,Description +Paul,34,""Paul lives in Vermont, VA."" +Victor,29,""Victor: Funny guy"" +Maria,31,"; + + void Verify(DataFrame df) + { + Assert.Equal(3, df.Rows.Count); + Assert.Equal(3, df.Columns.Count); + + Assert.True(typeof(string) == df.Columns[0].DataType); + Assert.True(typeof(int) == df.Columns[1].DataType); + Assert.True(typeof(string) == df.Columns[2].DataType); + + + Assert.Equal("Na,me", df.Columns[0].Name); + Assert.Equal("Age", df.Columns[1].Name); + Assert.Equal("Description", df.Columns[2].Name); + VerifyColumnTypes(df); + + var paulRow = df.Rows[0]; + Assert.Equal("Paul", paulRow[0]); + Assert.Equal(34, paulRow[1]); + Assert.Equal("Paul lives in Vermont, VA.", paulRow[2]); + + var victorRow = df.Rows[1]; + Assert.Equal("Victor", victorRow[0]); + Assert.Equal(29, victorRow[1]); + Assert.Equal("Victor: Funny guy", victorRow[2]); + + var mariaRow = df.Rows[2]; + Assert.Equal("Maria", mariaRow[0]); + Assert.Equal(31, mariaRow[1]); + Assert.Equal("", mariaRow[2]); + } + + // Read data to a DataFrame in two ways and verify correctness + DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); + Verify(df); + df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); + Verify(df); + + // Write DataFrame to a MemoryStream + using MemoryStream csvStream = new MemoryStream(); + DataFrame.WriteCsv(df, csvStream); + + // Read MemoryStream back to DataFrame and verify correctness + csvStream.Seek(0, SeekOrigin.Begin); + DataFrame df2 = DataFrame.LoadCsv(csvStream, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); + Verify(df2); + } + + [Fact] + public void TestReadWriteCsvWithNewlinesInData() + { + string data = @"Name,Age,Description +Paul,34,""Paul lives in Vermont +VA."" +Victor,29,""Victor: Funny guy"" +Maria,31,"; + + void Verify(DataFrame df) + { + Assert.Equal(3, df.Rows.Count); + Assert.Equal(3, df.Columns.Count); + + Assert.True(typeof(string) == df.Columns[0].DataType); + Assert.True(typeof(int) == df.Columns[1].DataType); + Assert.True(typeof(string) == df.Columns[2].DataType); + + + Assert.Equal("Name", df.Columns[0].Name); + Assert.Equal("Age", df.Columns[1].Name); + Assert.Equal("Description", df.Columns[2].Name); + VerifyColumnTypes(df); + + var paulRow = df.Rows[0]; + Assert.Equal("Paul", paulRow[0]); + Assert.Equal(34, paulRow[1]); + Assert.Equal(@"Paul lives in Vermont +VA.", paulRow[2]); + + var victorRow = df.Rows[1]; + Assert.Equal("Victor", victorRow[0]); + Assert.Equal(29, victorRow[1]); + Assert.Equal("Victor: Funny guy", victorRow[2]); + + var mariaRow = df.Rows[2]; + Assert.Equal("Maria", mariaRow[0]); + Assert.Equal(31, mariaRow[1]); + Assert.Equal("", mariaRow[2]); + } + + // Read data to a DataFrame in two ways and verify correctness + DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); + Verify(df); + df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); + Verify(df); + + // Write DataFrame to a MemoryStream + using MemoryStream csvStream = new MemoryStream(); + DataFrame.WriteCsv(df, csvStream); + + // Read MemoryStream back to DataFrame and verify correctness + csvStream.Seek(0, SeekOrigin.Begin); + DataFrame df2 = DataFrame.LoadCsv(csvStream, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); + Verify(df2); + } } } From e0ff9a9c6ce3bb7202abe9664a239699f821a524 Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Wed, 24 Aug 2022 18:28:39 -0500 Subject: [PATCH 04/14] Fix separators in data --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index 30f395352c..7842ff0cca 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -500,6 +500,14 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream, continue; } + if (t == typeof(string) && ((string)cell).Contains(separator.ToString())) // TODO why doesn't Contains(char) work? + { + record.Append("\""); + record.Append(cell); + record.Append("\""); + continue; + } + record.Append(cell); } From 6a568da2e58c15e2e3124deaf77c66fb3b657966 Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Wed, 24 Aug 2022 18:38:55 -0500 Subject: [PATCH 05/14] Fix separators in header --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 25 ++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index 7842ff0cca..a543aaebf1 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -454,7 +454,30 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream, if (header) { - var headerColumns = string.Join(separator.ToString(), columnNames); + bool firstColumn = true; + var headerColumns = new StringBuilder(); + foreach (string name in columnNames) + { + if (!firstColumn) + { + headerColumns.Append(separator); + } + else + { + firstColumn = false; + } + + if (name.Contains(separator.ToString())) // TODO why doesn't Contains(char) work? + { + headerColumns.Append("\""); + headerColumns.Append(name); + headerColumns.Append("\""); + } + else + { + headerColumns.Append(name); + } + } csvFile.WriteLine(headerColumns); } From 9c02ab16ab1343127a13645fa11a3dce20a4fb6b Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Thu, 25 Aug 2022 19:25:30 -0500 Subject: [PATCH 06/14] Clean up tests --- .../DataFrame.IOTests.cs | 257 ++++++------------ 1 file changed, 83 insertions(+), 174 deletions(-) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index 18937548fb..c463c331b9 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -1062,12 +1062,11 @@ public void VerifyLoadCsv(DataFrame df) } } } - } - public static IEnumerable LoadCsv_TestData() + public static IEnumerable CsvWithTextQualifiers_TestData() { - yield return new object[] + yield return new object[] // Comma Separators in Data { @"Name,Age,Description Paul,34,""Paul lives in Vermont, VA."" @@ -1087,7 +1086,7 @@ public static IEnumerable LoadCsv_TestData() } ) }; - yield return new object[] + yield return new object[] // Colon Separators in Data { @"Name:Age:Description Paul:34:""Paul lives in Vermont, VA."" @@ -1107,192 +1106,102 @@ public static IEnumerable LoadCsv_TestData() } ) }; + yield return new object[] // Comma Separators in Header + { + @"""Na,me"",Age,Description +Paul,34,""Paul lives in Vermont, VA."" +Victor,29,""Victor: Funny guy"" +Maria,31,", + ',', + new LoadCsvVerifyingHelper( + 3, + 3, + new string[] { "Na,me", "Age", "Description" }, + new Type[] { typeof(string), typeof(int), typeof(string) }, + new object[][] + { + new object[] { "Paul", 34, "Paul lives in Vermont, VA." }, + new object[] { "Victor", 29, "Victor: Funny guy" }, + new object[] { "Maria", 31, "" } + } + ) + }; + yield return new object[] // Newlines In Data + { + @"Name,Age,Description +Paul,34,""Paul lives in Vermont +VA."" +Victor,29,""Victor: Funny guy"" +Maria,31,", + ',', + new LoadCsvVerifyingHelper( + 3, + 3, + new string[] { "Name", "Age", "Description" }, + new Type[] { typeof(string), typeof(int), typeof(string) }, + new object[][] + { + new object[] { "Paul", 34, @"Paul lives in Vermont +VA." }, + new object[] { "Victor", 29, "Victor: Funny guy" }, + new object[] { "Maria", 31, "" } + } + ) + }; + yield return new object[] // Newlines In Header + { + @"""Na +me"":Age:Description +Paul:34:""Paul lives in Vermont, VA."" +Victor:29:""Victor: Funny guy"" +Maria:31:", + ':', + new LoadCsvVerifyingHelper( + 3, + 3, + new string[] { @"Na +me", "Age", "Description" }, + new Type[] { typeof(string), typeof(int), typeof(string) }, + new object[][] + { + new object[] { "Paul", 34, "Paul lives in Vermont, VA." }, + new object[] { "Victor", 29, "Victor: Funny guy" }, + new object[] { "Maria", 31, "" } + } + ) + }; } [Theory] - [MemberData(nameof(LoadCsv_TestData))] - public void TestReadWriteCsvWithCommaSeparatorsInData(string data, char separator, LoadCsvVerifyingHelper helper) + [MemberData(nameof(CsvWithTextQualifiers_TestData))] + public void TestLoadCsvWithTextQualifiersFromStream(string data, char separator, LoadCsvVerifyingHelper helper) { - // Read data to a DataFrame in two ways and verify correctness DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: separator); helper.VerifyLoadCsv(df); - df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: separator); - helper.VerifyLoadCsv(df); - - // Write DataFrame to a MemoryStream - using MemoryStream csvStream = new MemoryStream(); - DataFrame.WriteCsv(df, csvStream, separator: separator); - - // Read MemoryStream back to DataFrame and verify correctness - csvStream.Seek(0, SeekOrigin.Begin); - DataFrame df2 = DataFrame.LoadCsv(csvStream, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: separator); - helper.VerifyLoadCsv(df2); - } - - [Fact] - public void TestReadWriteCsvWithColonSeparatorsInData() - { - string data = @"Name:Age:Description -Paul:34:""Paul lives in Vermont, VA."" -Victor:29:""Victor: Funny guy"" -Maria:31:"; - void Verify(DataFrame df) - { - Assert.Equal(3, df.Rows.Count); - Assert.Equal(3, df.Columns.Count); - - Assert.True(typeof(string) == df.Columns[0].DataType); - Assert.True(typeof(int) == df.Columns[1].DataType); - Assert.True(typeof(string) == df.Columns[2].DataType); - - - Assert.Equal("Name", df.Columns[0].Name); - Assert.Equal("Age", df.Columns[1].Name); - Assert.Equal("Description", df.Columns[2].Name); - VerifyColumnTypes(df); - - var paulRow = df.Rows[0]; - Assert.Equal("Paul", paulRow[0]); - Assert.Equal(34, paulRow[1]); - Assert.Equal("Paul lives in Vermont, VA.", paulRow[2]); - - var victorRow = df.Rows[1]; - Assert.Equal("Victor", victorRow[0]); - Assert.Equal(29, victorRow[1]); - Assert.Equal("Victor: Funny guy", victorRow[2]); - - var mariaRow = df.Rows[2]; - Assert.Equal("Maria", mariaRow[0]); - Assert.Equal(31, mariaRow[1]); - Assert.Equal("", mariaRow[2]); - } - - // Read data to a DataFrame in two ways and verify correctness - DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: ':'); - Verify(df); - df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: ':'); - Verify(df); - - // Write DataFrame to a MemoryStream - using MemoryStream csvStream = new MemoryStream(); - DataFrame.WriteCsv(df, csvStream, separator: ':'); - - // Read MemoryStream back to DataFrame and verify correctness - csvStream.Seek(0, SeekOrigin.Begin); - DataFrame df2 = DataFrame.LoadCsv(csvStream, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: ':'); - Verify(df2); } - [Fact] - public void TestReadWriteCsvWithCommaSeparatorsInHeaderData() + [Theory] + [MemberData(nameof(CsvWithTextQualifiers_TestData))] + public void TestLoadCsvWithTextQualifiersFromString(string data, char separator, LoadCsvVerifyingHelper helper) { - string data = @"""Na,me"",Age,Description -Paul,34,""Paul lives in Vermont, VA."" -Victor,29,""Victor: Funny guy"" -Maria,31,"; - - void Verify(DataFrame df) - { - Assert.Equal(3, df.Rows.Count); - Assert.Equal(3, df.Columns.Count); - - Assert.True(typeof(string) == df.Columns[0].DataType); - Assert.True(typeof(int) == df.Columns[1].DataType); - Assert.True(typeof(string) == df.Columns[2].DataType); - - - Assert.Equal("Na,me", df.Columns[0].Name); - Assert.Equal("Age", df.Columns[1].Name); - Assert.Equal("Description", df.Columns[2].Name); - VerifyColumnTypes(df); - - var paulRow = df.Rows[0]; - Assert.Equal("Paul", paulRow[0]); - Assert.Equal(34, paulRow[1]); - Assert.Equal("Paul lives in Vermont, VA.", paulRow[2]); - - var victorRow = df.Rows[1]; - Assert.Equal("Victor", victorRow[0]); - Assert.Equal(29, victorRow[1]); - Assert.Equal("Victor: Funny guy", victorRow[2]); - - var mariaRow = df.Rows[2]; - Assert.Equal("Maria", mariaRow[0]); - Assert.Equal(31, mariaRow[1]); - Assert.Equal("", mariaRow[2]); - } - - // Read data to a DataFrame in two ways and verify correctness - DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); - Verify(df); - df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); - Verify(df); - - // Write DataFrame to a MemoryStream - using MemoryStream csvStream = new MemoryStream(); - DataFrame.WriteCsv(df, csvStream); - - // Read MemoryStream back to DataFrame and verify correctness - csvStream.Seek(0, SeekOrigin.Begin); - DataFrame df2 = DataFrame.LoadCsv(csvStream, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); - Verify(df2); + DataFrame df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: separator); + helper.VerifyLoadCsv(df); } - [Fact] - public void TestReadWriteCsvWithNewlinesInData() + [Theory] + [MemberData(nameof(CsvWithTextQualifiers_TestData))] + public void TestWriteCsvWithTextQualifiers(string data, char separator, LoadCsvVerifyingHelper helper) { - string data = @"Name,Age,Description -Paul,34,""Paul lives in Vermont -VA."" -Victor,29,""Victor: Funny guy"" -Maria,31,"; - - void Verify(DataFrame df) - { - Assert.Equal(3, df.Rows.Count); - Assert.Equal(3, df.Columns.Count); - - Assert.True(typeof(string) == df.Columns[0].DataType); - Assert.True(typeof(int) == df.Columns[1].DataType); - Assert.True(typeof(string) == df.Columns[2].DataType); - - - Assert.Equal("Name", df.Columns[0].Name); - Assert.Equal("Age", df.Columns[1].Name); - Assert.Equal("Description", df.Columns[2].Name); - VerifyColumnTypes(df); - - var paulRow = df.Rows[0]; - Assert.Equal("Paul", paulRow[0]); - Assert.Equal(34, paulRow[1]); - Assert.Equal(@"Paul lives in Vermont -VA.", paulRow[2]); - - var victorRow = df.Rows[1]; - Assert.Equal("Victor", victorRow[0]); - Assert.Equal(29, victorRow[1]); - Assert.Equal("Victor: Funny guy", victorRow[2]); - - var mariaRow = df.Rows[2]; - Assert.Equal("Maria", mariaRow[0]); - Assert.Equal(31, mariaRow[1]); - Assert.Equal("", mariaRow[2]); - } - - // Read data to a DataFrame in two ways and verify correctness - DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); - Verify(df); - df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); - Verify(df); + DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: separator); - // Write DataFrame to a MemoryStream using MemoryStream csvStream = new MemoryStream(); - DataFrame.WriteCsv(df, csvStream); + DataFrame.WriteCsv(df, csvStream, separator: separator); - // Read MemoryStream back to DataFrame and verify correctness + // We are verifying that WriteCsv works by reading the result back to a DataFrame and verifying correctness, + // ensuring no information loss csvStream.Seek(0, SeekOrigin.Begin); - DataFrame df2 = DataFrame.LoadCsv(csvStream, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }); - Verify(df2); + DataFrame df2 = DataFrame.LoadCsv(csvStream, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: separator); + helper.VerifyLoadCsv(df2); } } } From b9bec7aac0d26b561a80b70bdce0e53029f32017 Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Thu, 25 Aug 2022 19:27:31 -0500 Subject: [PATCH 07/14] Fix issue with not wrapping output with newlines in quotations --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 25 +++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index a543aaebf1..2a28324845 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -467,7 +467,9 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream, firstColumn = false; } - if (name.Contains(separator.ToString())) // TODO why doesn't Contains(char) work? + // TODO why doesn't Contains(char) work? + bool needsQuotes = ((string)name).Contains(separator.ToString()) || ((string)name).Contains("\n"); + if (needsQuotes) { headerColumns.Append("\""); headerColumns.Append(name); @@ -485,16 +487,16 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream, foreach (var row in dataFrame.Rows) { - bool firstRow = true; + bool firstCell = true; foreach (var cell in row) { - if (!firstRow) + if (!firstCell) { record.Append(separator); } else { - firstRow = false; + firstCell = false; } Type t = cell?.GetType(); @@ -523,12 +525,17 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream, continue; } - if (t == typeof(string) && ((string)cell).Contains(separator.ToString())) // TODO why doesn't Contains(char) work? + if (t == typeof(string)) { - record.Append("\""); - record.Append(cell); - record.Append("\""); - continue; + // TODO why doesn't Contains(char) work? + bool needsQuotes = ((string)cell).Contains(separator.ToString()) || ((string)cell).Contains("\n"); + if (needsQuotes) + { + record.Append("\""); + record.Append(cell); + record.Append("\""); + continue; + } } record.Append(cell); From 6f8029785a274c8d26efd5ea6af8cfcb3c24b65e Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Thu, 25 Aug 2022 19:41:09 -0500 Subject: [PATCH 08/14] Accidental commit --- test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index c463c331b9..ed7aed1312 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -3,7 +3,6 @@ // See the LICENSE file in the project root for more information. using System; -using System.CodeDom; using System.Collections.Generic; using System.Globalization; using System.IO; From d5aaff99099ed4dcc6d8eae68f66b00153d8d72c Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Fri, 9 Sep 2022 13:01:00 -0500 Subject: [PATCH 09/14] Clean up --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 2 -- test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs | 5 +---- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index 2a28324845..55c24cfb14 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -467,7 +467,6 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream, firstColumn = false; } - // TODO why doesn't Contains(char) work? bool needsQuotes = ((string)name).Contains(separator.ToString()) || ((string)name).Contains("\n"); if (needsQuotes) { @@ -527,7 +526,6 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream, if (t == typeof(string)) { - // TODO why doesn't Contains(char) work? bool needsQuotes = ((string)cell).Contains(separator.ToString()) || ((string)cell).Contains("\n"); if (needsQuotes) { diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index ed7aed1312..20199e839c 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -1055,10 +1055,7 @@ public void VerifyLoadCsv(DataFrame df) for (int i = 0; i < _rowCount; i++) { - for (int j = 0; j < _columnCount; j++) - { - Assert.Equal(_cells[i][j], df.Rows[i][j]); - } + Assert.Equal(_cells[i], df.Rows[i]); } } } From be4fcb06a0eaf3e91e96a25fec1da661dba61bc7 Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Fri, 9 Sep 2022 17:23:43 -0500 Subject: [PATCH 10/14] Clean up mini test framework a bit --- .../DataFrame.IOTests.cs | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index 20199e839c..a34009ec65 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -1069,6 +1069,7 @@ public static IEnumerable CsvWithTextQualifiers_TestData() Victor,29,""Victor: Funny guy"" Maria,31,", ',', + new Type[] { typeof(string), typeof(int), typeof(string) }, new LoadCsvVerifyingHelper( 3, 3, @@ -1089,6 +1090,7 @@ public static IEnumerable CsvWithTextQualifiers_TestData() Victor:29:""Victor: Funny guy"" Maria:31:", ':', + new Type[] { typeof(string), typeof(int), typeof(string) }, new LoadCsvVerifyingHelper( 3, 3, @@ -1104,11 +1106,12 @@ public static IEnumerable CsvWithTextQualifiers_TestData() }; yield return new object[] // Comma Separators in Header { - @"""Na,me"",Age,Description + @"""Na,me"",Age,Description Paul,34,""Paul lives in Vermont, VA."" Victor,29,""Victor: Funny guy"" Maria,31,", - ',', + ',', + new Type[] { typeof(string), typeof(int), typeof(string) }, new LoadCsvVerifyingHelper( 3, 3, @@ -1130,6 +1133,7 @@ public static IEnumerable CsvWithTextQualifiers_TestData() Victor,29,""Victor: Funny guy"" Maria,31,", ',', + new Type[] { typeof(string), typeof(int), typeof(string) }, new LoadCsvVerifyingHelper( 3, 3, @@ -1152,6 +1156,7 @@ public static IEnumerable CsvWithTextQualifiers_TestData() Victor:29:""Victor: Funny guy"" Maria:31:", ':', + new Type[] { typeof(string), typeof(int), typeof(string) }, new LoadCsvVerifyingHelper( 3, 3, @@ -1170,25 +1175,25 @@ public static IEnumerable CsvWithTextQualifiers_TestData() [Theory] [MemberData(nameof(CsvWithTextQualifiers_TestData))] - public void TestLoadCsvWithTextQualifiersFromStream(string data, char separator, LoadCsvVerifyingHelper helper) + public void TestLoadCsvWithTextQualifiersFromStream(string data, char separator, Type[] dataTypes, LoadCsvVerifyingHelper helper) { - DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: separator); + DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: dataTypes, separator: separator); helper.VerifyLoadCsv(df); } [Theory] [MemberData(nameof(CsvWithTextQualifiers_TestData))] - public void TestLoadCsvWithTextQualifiersFromString(string data, char separator, LoadCsvVerifyingHelper helper) + public void TestLoadCsvWithTextQualifiersFromString(string data, char separator, Type[] dataTypes, LoadCsvVerifyingHelper helper) { - DataFrame df = DataFrame.LoadCsvFromString(data, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: separator); + DataFrame df = DataFrame.LoadCsvFromString(data, dataTypes: dataTypes, separator: separator); helper.VerifyLoadCsv(df); } [Theory] [MemberData(nameof(CsvWithTextQualifiers_TestData))] - public void TestWriteCsvWithTextQualifiers(string data, char separator, LoadCsvVerifyingHelper helper) + public void TestWriteCsvWithTextQualifiers(string data, char separator, Type[] dataTypes, LoadCsvVerifyingHelper helper) { - DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: separator); + DataFrame df = DataFrame.LoadCsv(GetStream(data), dataTypes: dataTypes, separator: separator); using MemoryStream csvStream = new MemoryStream(); DataFrame.WriteCsv(df, csvStream, separator: separator); @@ -1196,7 +1201,7 @@ public void TestWriteCsvWithTextQualifiers(string data, char separator, LoadCsvV // We are verifying that WriteCsv works by reading the result back to a DataFrame and verifying correctness, // ensuring no information loss csvStream.Seek(0, SeekOrigin.Begin); - DataFrame df2 = DataFrame.LoadCsv(csvStream, dataTypes: new Type[] { typeof(string), typeof(int), typeof(string) }, separator: separator); + DataFrame df2 = DataFrame.LoadCsv(csvStream, dataTypes: dataTypes, separator: separator); helper.VerifyLoadCsv(df2); } } From 2d28e611c1b805f911e8a93b5fd6cd17bf38890b Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Mon, 26 Sep 2022 12:09:26 -0500 Subject: [PATCH 11/14] Apply suggestions from code review Co-authored-by: Eric Erhardt --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 8 ++++---- test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index 55c24cfb14..fb696e17e3 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -470,9 +470,9 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream, bool needsQuotes = ((string)name).Contains(separator.ToString()) || ((string)name).Contains("\n"); if (needsQuotes) { - headerColumns.Append("\""); + headerColumns.Append('\"'); headerColumns.Append(name); - headerColumns.Append("\""); + headerColumns.Append('\"'); } else { @@ -529,9 +529,9 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream, bool needsQuotes = ((string)cell).Contains(separator.ToString()) || ((string)cell).Contains("\n"); if (needsQuotes) { - record.Append("\""); + record.Append('\"'); record.Append(cell); - record.Append("\""); + record.Append('\"'); continue; } } diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index a34009ec65..0a3d84210e 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -1021,7 +1021,7 @@ public void TestMixedDataTypesInCsv() } } - public struct LoadCsvVerifyingHelper + public readonly struct LoadCsvVerifyingHelper { int _columnCount; long _rowCount; From f3de1b080ca9b52046171158cd32fe5af96389ee Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Mon, 26 Sep 2022 12:11:41 -0500 Subject: [PATCH 12/14] Apply suggestions from code review --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 61 +++++++++++---------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index 55c24cfb14..f0de950d0c 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -450,36 +450,9 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream, { if (dataFrame != null) { - var columnNames = dataFrame.Columns.GetColumnNames(); - if (header) { - bool firstColumn = true; - var headerColumns = new StringBuilder(); - foreach (string name in columnNames) - { - if (!firstColumn) - { - headerColumns.Append(separator); - } - else - { - firstColumn = false; - } - - bool needsQuotes = ((string)name).Contains(separator.ToString()) || ((string)name).Contains("\n"); - if (needsQuotes) - { - headerColumns.Append("\""); - headerColumns.Append(name); - headerColumns.Append("\""); - } - else - { - headerColumns.Append(name); - } - } - csvFile.WriteLine(headerColumns); + WriteHeader(csvFile, dataFrame.Columns.GetColumnNames(), separator); } var record = new StringBuilder(); @@ -526,7 +499,7 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream, if (t == typeof(string)) { - bool needsQuotes = ((string)cell).Contains(separator.ToString()) || ((string)cell).Contains("\n"); + bool needsQuotes = ((string)cell).IndexOf(separator) != -1 || ((string)cell).IndexOf('\n') != -1; if (needsQuotes) { record.Append("\""); @@ -546,5 +519,35 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream, } } } + private static void WriteHeader(StreamWriter csvFile, IReadOnlyList columnNames, char separator) + { + bool firstColumn = true; + var headerColumns = new StringBuilder(); + foreach (string name in columnNames) + { + if (!firstColumn) + { + headerColumns.Append(separator); + } + else + { + firstColumn = false; + } + + bool needsQuotes = name.IndexOf(separator) != -1 || name.IndexOf('\n') != -1; + if (needsQuotes) + { + headerColumns.Append("\""); + headerColumns.Append(name); + headerColumns.Append("\""); + } + else + { + headerColumns.Append(name); + } + } + + csvFile.WriteLine(headerColumns); + } } } From 901a98f733c91c884577b842214845a142f1cf79 Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Mon, 26 Sep 2022 12:19:20 -0500 Subject: [PATCH 13/14] Apply suggestions from code review --- .../Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs index 0a3d84210e..371a29749e 100644 --- a/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs +++ b/test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs @@ -1023,11 +1023,11 @@ public void TestMixedDataTypesInCsv() public readonly struct LoadCsvVerifyingHelper { - int _columnCount; - long _rowCount; - string[] _columnNames; - Type[] _columnTypes; - object[][] _cells; + private readonly int _columnCount; + private readonly long _rowCount; + private readonly string[] _columnNames; + private readonly Type[] _columnTypes; + private readonly object[][] _cells; public LoadCsvVerifyingHelper(int columnCount, long rowCount, string[] columnNames, Type[] columnTypes, object[][] cells) { From fe0adcd3b52ae14cf9168f8ef874de7bd4189eb5 Mon Sep 17 00:00:00 2001 From: Drew Kersnar <18474647+dakersnar@users.noreply.github.com> Date: Mon, 26 Sep 2022 16:36:29 -0500 Subject: [PATCH 14/14] Write to StreamWriter directly --- src/Microsoft.Data.Analysis/DataFrame.IO.cs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.Data.Analysis/DataFrame.IO.cs b/src/Microsoft.Data.Analysis/DataFrame.IO.cs index 95eb2cf1ba..4f14615b0e 100644 --- a/src/Microsoft.Data.Analysis/DataFrame.IO.cs +++ b/src/Microsoft.Data.Analysis/DataFrame.IO.cs @@ -522,12 +522,11 @@ public static void WriteCsv(DataFrame dataFrame, Stream csvStream, private static void WriteHeader(StreamWriter csvFile, IReadOnlyList columnNames, char separator) { bool firstColumn = true; - var headerColumns = new StringBuilder(); foreach (string name in columnNames) { if (!firstColumn) { - headerColumns.Append(separator); + csvFile.Write(separator); } else { @@ -537,17 +536,17 @@ private static void WriteHeader(StreamWriter csvFile, IReadOnlyList colu bool needsQuotes = name.IndexOf(separator) != -1 || name.IndexOf('\n') != -1; if (needsQuotes) { - headerColumns.Append('\"'); - headerColumns.Append(name); - headerColumns.Append('\"'); + csvFile.Write('\"'); + csvFile.Write(name); + csvFile.Write('\"'); } else { - headerColumns.Append(name); + csvFile.Write(name); } } - csvFile.WriteLine(headerColumns); + csvFile.WriteLine(); } } }