Skip to content

Commit a341a93

Browse files
Author: Prashanth Govindarajan
Commit message: Address feedback
1 parent: 845e21b; commit: a341a93

File tree

3 files changed: 32 additions and 89 deletions.

src/Microsoft.Data.Analysis/DataFrame.IO.cs

Lines changed: 26 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -183,7 +183,7 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe
183183
throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
184184
}
185185

186-
TextReader textReader = wrappedReader.TextReader;
186+
TextReader textReader = wrappedReader.GetTextReader();
187187
TextFieldParser parser = new TextFieldParser(textReader);
188188
parser.SetDelimiters(separator.ToString());
189189

@@ -244,36 +244,38 @@ private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringRe
244244
DataFrame ret = new DataFrame(columns);
245245

246246
// Fill values.
247-
textReader = wrappedReader.TextReader;
248-
parser = new TextFieldParser(textReader);
249-
parser.SetDelimiters(separator.ToString());
250-
251-
rowline = 0;
252-
while ((fields = parser.ReadFields()) != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead))
247+
using (textReader = wrappedReader.GetTextReader())
253248
{
254-
string[] spl = fields;
255-
if (header && rowline == 0)
256-
{
257-
// Skips.
258-
}
259-
else
249+
parser = new TextFieldParser(textReader);
250+
parser.SetDelimiters(separator.ToString());
251+
252+
rowline = 0;
253+
while ((fields = parser.ReadFields()) != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead))
260254
{
261-
ret.Append(spl, inPlace: true);
255+
string[] spl = fields;
256+
if (header && rowline == 0)
257+
{
258+
// Skips.
259+
}
260+
else
261+
{
262+
ret.Append(spl, inPlace: true);
263+
}
264+
++rowline;
262265
}
263-
++rowline;
264-
}
265266

266-
if (addIndexColumn)
267-
{
268-
PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", columns[0].Length);
269-
for (int i = 0; i < columns[0].Length; i++)
267+
if (addIndexColumn)
270268
{
271-
indexColumn[i] = i;
269+
PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", columns[0].Length);
270+
for (int i = 0; i < columns[0].Length; i++)
271+
{
272+
indexColumn[i] = i;
273+
}
274+
columns.Insert(0, indexColumn);
272275
}
273-
columns.Insert(0, indexColumn);
276+
274277
}
275278

276-
textReader.Dispose();
277279
return ret;
278280
}
279281

@@ -300,12 +302,9 @@ public WrappedStreamReaderOrStringReader(string csvString)
300302
_stream = null;
301303
}
302304

303-
public long InitialPosition => _initialPosition;
304-
305305
// Returns a new TextReader. If the wrapped object is a stream, the stream is reset to its initial position.
306-
public TextReader TextReader
306+
public TextReader GetTextReader()
307307
{
308-
get
309308
{
310309
if (_stream != null)
311310
{

src/Microsoft.Data.Analysis/TextFieldParser.cs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -148,7 +148,7 @@ internal class TextFieldParser : IDisposable
148148

149149
private Regex _delimiterWithEndCharsRegex;
150150

151-
private int[] _whitespaceCodes = new int[] { '\u0020' };
151+
private int[] _whitespaceCodes = new int[] { '\u0009', '\u000B', '\u000C', '\u0020', '\u0085', '\u00A0', '\u1680', '\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A', '\u200B', '\u2028', '\u2029', '\u3000', '\uFEFF' };
152152

153153
private Regex _beginQuotesRegex;
154154

test/Microsoft.Data.Analysis.Tests/DataFrame.IOTests.cs

Lines changed: 5 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -101,6 +101,11 @@ internal static void VerifyColumnTypes(DataFrame df, bool testArrowStringColumn
101101
}
102102
}
103103

104+
private static Stream GetStream(string streamData)
105+
{
106+
return new MemoryStream(Encoding.Default.GetBytes(streamData));
107+
}
108+
104109
[Theory]
105110
[InlineData(false)]
106111
[InlineData(true)]
@@ -114,10 +119,6 @@ public void TestReadCsvWithHeader(bool useQuotes)
114119
{CMT},1,1,637,1.4,CRD,8.5
115120
{CMT},1,1,181,0.6,CSH,4.5";
116121

117-
Stream GetStream(string streamData)
118-
{
119-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
120-
}
121122
void RegularTest(DataFrame df)
122123
{
123124
Assert.Equal(4, df.Rows.Count);
@@ -155,10 +156,6 @@ public void TestReadCsvSplitAcrossMultipleLines()
155156
{CMT},1,1,637,1.4,CRD,8.5
156157
{CMT},1,1,181,0.6,CSH,4.5";
157158

158-
Stream GetStream(string streamData)
159-
{
160-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
161-
}
162159
void RegularTest(DataFrame df)
163160
{
164161
Assert.Equal(4, df.Rows.Count);
@@ -198,10 +195,6 @@ public void TestReadCsvNoHeader(bool useQuotes)
198195
{CMT},1,1,637,1.4,CRD,8.5
199196
{CMT},1,1,181,0.6,CSH,4.5";
200197

201-
Stream GetStream(string streamData)
202-
{
203-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
204-
}
205198
void RegularTest(DataFrame df)
206199
{
207200
Assert.Equal(4, df.Rows.Count);
@@ -292,10 +285,6 @@ False 10 Null
292285
CMT,1,1,637,1.4,CRD,8.5
293286
CMT,1,1,181,0.6,CSH,4.5";
294287

295-
Stream GetStream(string streamData)
296-
{
297-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
298-
}
299288

300289
string data = header ? headerLine + dataLines : dataLines;
301290
DataFrame df = DataFrame.LoadCsv(GetStream(data),
@@ -346,10 +335,6 @@ public void TestReadCsvWithTypes()
346335
,,,,,,
347336
CMT,1,1,181,0.6,CSH,4.5";
348337

349-
Stream GetStream(string streamData)
350-
{
351-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
352-
}
353338
void Verify(DataFrame df)
354339
{
355340
Assert.Equal(5, df.Rows.Count);
@@ -409,10 +394,6 @@ public void TestReadCsvWithPipeSeparator()
409394
||||||
410395
CMT|1|1|181|0.6|CSH|4.5";
411396

412-
Stream GetStream(string streamData)
413-
{
414-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
415-
}
416397
void Verify(DataFrame df)
417398
{
418399
Assert.Equal(5, df.Rows.Count);
@@ -452,10 +433,6 @@ public void TestReadCsvWithSemicolonSeparator()
452433
;;;;;;
453434
CMT;1;1;181;0.6;CSH;4.5";
454435

455-
Stream GetStream(string streamData)
456-
{
457-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
458-
}
459436
void Verify(DataFrame df)
460437
{
461438
Assert.Equal(5, df.Rows.Count);
@@ -494,10 +471,6 @@ public void TestReadCsvWithExtraColumnInHeader()
494471
CMT,1,1,637,1.4,CRD,8.5
495472
CMT,1,1,181,0.6,CSH,4.5";
496473

497-
Stream GetStream(string streamData)
498-
{
499-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
500-
}
501474
void Verify(DataFrame df)
502475
{
503476
Assert.Equal(4, df.Rows.Count);
@@ -527,11 +500,6 @@ public void TestReadCsvWithExtraColumnInRow()
527500
CMT,1,1,637,1.4,CRD,8.5,0
528501
CMT,1,1,181,0.6,CSH,4.5,0";
529502

530-
Stream GetStream(string streamData)
531-
{
532-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
533-
}
534-
535503
Assert.Throws<IndexOutOfRangeException>(() => DataFrame.LoadCsv(GetStream(data)));
536504
Assert.Throws<IndexOutOfRangeException>(() => DataFrame.LoadCsvFromString(data));
537505
}
@@ -545,11 +513,6 @@ public void TestReadCsvWithLessColumnsInRow()
545513
CMT,1,1,637,1.4,CRD
546514
CMT,1,1,181,0.6,CSH";
547515

548-
Stream GetStream(string streamData)
549-
{
550-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
551-
}
552-
553516
void Verify(DataFrame df)
554517
{
555518
Assert.Equal(4, df.Rows.Count);
@@ -581,11 +544,6 @@ public void TestReadCsvWithAllNulls()
581544
null,null,null,null
582545
null,null,null,null";
583546

584-
Stream GetStream(string streamData)
585-
{
586-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
587-
}
588-
589547
void Verify(DataFrame df)
590548
{
591549
Assert.Equal(6, df.Rows.Count);
@@ -629,11 +587,6 @@ public void TestReadCsvWithNullsAndDataTypes()
629587
,,,
630588
CMT,1,1,null";
631589

632-
Stream GetStream(string streamData)
633-
{
634-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
635-
}
636-
637590
void Verify(DataFrame df)
638591
{
639592
Assert.Equal(6, df.Rows.Count);
@@ -696,11 +649,6 @@ public void TestReadCsvWithNulls()
696649
,,,
697650
CMT,1,1,null";
698651

699-
Stream GetStream(string streamData)
700-
{
701-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
702-
}
703-
704652
void Verify(DataFrame df)
705653
{
706654
Assert.Equal(6, df.Rows.Count);
@@ -918,10 +866,6 @@ public void TestMixedDataTypesInCsv()
918866
,
919867
CMT,";
920868

921-
Stream GetStream(string streamData)
922-
{
923-
return new MemoryStream(Encoding.Default.GetBytes(streamData));
924-
}
925869
DataFrame df = DataFrame.LoadCsv(GetStream(data));
926870
Assert.Equal(6, df.Rows.Count);
927871
Assert.Equal(2, df.Columns.Count);

0 commit comments

Comments (0)