Skip to content

Commit 79dcab6

Browse files
author
Prashanth Govindarajan
authored
Merge 382f948 into 17aec1e
2 parents 17aec1e + 382f948 commit 79dcab6

File tree

9 files changed

+1818
-188
lines changed

9 files changed

+1818
-188
lines changed

src/Microsoft.Data.Analysis/DataFrame.IO.cs

Lines changed: 100 additions & 103 deletions
Original file line number | Diff line number | Diff line change
@@ -172,7 +172,7 @@ private static DataFrameColumn CreateColumn(Type kind, string[] columnNames, int
172172
return ret;
173173
}
174174

175-
private static DataFrame ReadCsvLinesIntoDataFrame(IEnumerable<string> lines,
175+
private static DataFrame ReadCsvLinesIntoDataFrame(WrappedStreamReaderOrStringReader wrappedReader,
176176
char separator = ',', bool header = true,
177177
string[] columnNames = null, Type[] dataTypes = null,
178178
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false
@@ -183,140 +183,141 @@ private static DataFrame ReadCsvLinesIntoDataFrame(IEnumerable<string> lines,
183183
throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
184184
}
185185

186-
var linesForGuessType = new List<string[]>();
187-
long rowline = 0;
188-
int numberOfColumns = dataTypes?.Length ?? 0;
189-
190-
if (header == true && numberOfRowsToRead != -1)
186+
List<DataFrameColumn> columns;
187+
string[] fields;
188+
using (var textReader = wrappedReader.GetTextReader())
191189
{
192-
numberOfRowsToRead++;
193-
}
190+
TextFieldParser parser = new TextFieldParser(textReader);
191+
parser.SetDelimiters(separator.ToString());
194192

195-
List<DataFrameColumn> columns;
196-
// First pass: schema and number of rows.
197-
string line = null;
193+
var linesForGuessType = new List<string[]>();
194+
long rowline = 0;
195+
int numberOfColumns = dataTypes?.Length ?? 0;
198196

199-
var enumerator = lines.GetEnumerator();
200-
while (enumerator.MoveNext())
201-
{
202-
line = enumerator.Current;
203-
if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead)
197+
if (header == true && numberOfRowsToRead != -1)
198+
{
199+
numberOfRowsToRead++;
200+
}
201+
202+
// First pass: schema and number of rows.
203+
while ((fields = parser.ReadFields()) != null)
204204
{
205-
if (linesForGuessType.Count < guessRows || (header && rowline == 0))
205+
if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead)
206206
{
207-
var spl = line.Split(separator);
208-
if (header && rowline == 0)
207+
if (linesForGuessType.Count < guessRows || (header && rowline == 0))
209208
{
210-
if (columnNames == null)
209+
string[] spl = fields;
210+
if (header && rowline == 0)
211211
{
212-
columnNames = spl;
212+
if (columnNames == null)
213+
{
214+
columnNames = spl;
215+
}
216+
}
217+
else
218+
{
219+
linesForGuessType.Add(spl);
220+
numberOfColumns = Math.Max(numberOfColumns, spl.Length);
213221
}
214222
}
215-
else
216-
{
217-
linesForGuessType.Add(spl);
218-
numberOfColumns = Math.Max(numberOfColumns, spl.Length);
219-
}
223+
}
224+
++rowline;
225+
if (rowline == guessRows || guessRows == 0)
226+
{
227+
break;
220228
}
221229
}
222-
++rowline;
223-
if (rowline == guessRows || guessRows == 0)
230+
231+
if (rowline == 0)
224232
{
225-
break;
233+
throw new FormatException(Strings.EmptyFile);
226234
}
227-
}
228235

229-
if (rowline == 0)
230-
{
231-
throw new FormatException(Strings.EmptyFile);
232-
}
233-
234-
columns = new List<DataFrameColumn>(numberOfColumns);
235-
// Guesses types or looks up dataTypes and adds columns.
236-
for (int i = 0; i < numberOfColumns; ++i)
237-
{
238-
Type kind = dataTypes == null ? GuessKind(i, linesForGuessType) : dataTypes[i];
239-
columns.Add(CreateColumn(kind, columnNames, i));
236+
columns = new List<DataFrameColumn>(numberOfColumns);
237+
// Guesses types or looks up dataTypes and adds columns.
238+
for (int i = 0; i < numberOfColumns; ++i)
239+
{
240+
Type kind = dataTypes == null ? GuessKind(i, linesForGuessType) : dataTypes[i];
241+
columns.Add(CreateColumn(kind, columnNames, i));
242+
}
240243
}
241244

242245
DataFrame ret = new DataFrame(columns);
243-
line = null;
244246

245247
// Fill values.
246-
enumerator.Reset();
247-
rowline = 0;
248-
while (enumerator.MoveNext() && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead))
248+
using (var textReader = wrappedReader.GetTextReader())
249249
{
250-
line = enumerator.Current;
251-
var spl = line.Split(separator);
252-
if (header && rowline == 0)
253-
{
254-
// Skips.
255-
}
256-
else
250+
TextFieldParser parser = new TextFieldParser(textReader);
251+
parser.SetDelimiters(separator.ToString());
252+
253+
long rowline = 0;
254+
while ((fields = parser.ReadFields()) != null && (numberOfRowsToRead == -1 || rowline < numberOfRowsToRead))
257255
{
258-
ret.Append(spl, inPlace: true);
256+
string[] spl = fields;
257+
if (header && rowline == 0)
258+
{
259+
// Skips.
260+
}
261+
else
262+
{
263+
ret.Append(spl, inPlace: true);
264+
}
265+
++rowline;
259266
}
260-
++rowline;
261-
}
262267

263-
if (addIndexColumn)
264-
{
265-
PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", columns[0].Length);
266-
for (int i = 0; i < columns[0].Length; i++)
268+
if (addIndexColumn)
267269
{
268-
indexColumn[i] = i;
270+
PrimitiveDataFrameColumn<int> indexColumn = new PrimitiveDataFrameColumn<int>("IndexColumn", columns[0].Length);
271+
for (int i = 0; i < columns[0].Length; i++)
272+
{
273+
indexColumn[i] = i;
274+
}
275+
columns.Insert(0, indexColumn);
269276
}
270-
columns.Insert(0, indexColumn);
271-
}
272-
return ret;
273-
}
274277

275-
private class CsvLines : IEnumerable<string>
276-
{
277-
private CsvLineEnumerator enumerator;
278-
public CsvLines(CsvLineEnumerator csvLineEnumerator)
279-
{
280-
enumerator = csvLineEnumerator;
281278
}
282279

283-
public IEnumerator<string> GetEnumerator() => enumerator;
284-
285-
IEnumerator IEnumerable.GetEnumerator() => enumerator;
280+
return ret;
286281
}
287282

288-
private class CsvLineEnumerator : IEnumerator<string>
283+
private class WrappedStreamReaderOrStringReader
289284
{
290-
private StreamReader streamReader;
291-
private string currentLine;
292-
private long streamStartPosition;
293-
public CsvLineEnumerator(StreamReader csvStream)
294-
{
295-
streamStartPosition = csvStream.BaseStream.Position;
296-
streamReader = csvStream;
297-
currentLine = null;
298-
}
299-
300-
public string Current => currentLine;
301-
302-
object IEnumerator.Current => currentLine;
285+
private Stream _stream;
286+
private long _initialPosition;
287+
private Encoding _encoding;
288+
private string _csvString;
303289

304-
public void Dispose()
290+
public WrappedStreamReaderOrStringReader(Stream stream, Encoding encoding)
305291
{
306-
throw new NotImplementedException();
292+
_stream = stream;
293+
_initialPosition = stream.Position;
294+
_encoding = encoding;
295+
_csvString = null;
307296
}
308297

309-
public bool MoveNext()
298+
public WrappedStreamReaderOrStringReader(string csvString)
310299
{
311-
currentLine = streamReader.ReadLine();
312-
return currentLine != null;
300+
_csvString = csvString;
301+
_initialPosition = 0;
302+
_encoding = null;
303+
_stream = null;
313304
}
314305

315-
public void Reset()
306+
// Returns a new TextReader. If the wrapped object is a stream, the stream is reset to its initial position.
307+
public TextReader GetTextReader()
316308
{
317-
streamReader.DiscardBufferedData();
318-
streamReader.BaseStream.Seek(streamStartPosition, SeekOrigin.Begin);
309+
if (_stream != null)
310+
{
311+
_stream.Seek(_initialPosition, SeekOrigin.Begin);
312+
return new StreamReader(_stream, _encoding, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true);
313+
}
314+
else
315+
{
316+
return new StringReader(_csvString);
317+
}
318+
319319
}
320+
320321
}
321322

322323
/// <summary>
@@ -336,8 +337,8 @@ public static DataFrame LoadCsvFromString(string csvString,
336337
string[] columnNames = null, Type[] dataTypes = null,
337338
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
338339
{
339-
string[] lines = csvString.Split(new[] { Environment.NewLine }, StringSplitOptions.None);
340-
return ReadCsvLinesIntoDataFrame(lines, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn);
340+
WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvString);
341+
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn);
341342
}
342343

343344
/// <summary>
@@ -369,12 +370,8 @@ public static DataFrame LoadCsv(Stream csvStream,
369370
throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
370371
}
371372

372-
using (var streamReader = new StreamReader(csvStream, encoding ?? Encoding.UTF8, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true))
373-
{
374-
CsvLineEnumerator linesEnumerator = new CsvLineEnumerator(streamReader);
375-
IEnumerable<string> lines = new CsvLines(linesEnumerator);
376-
return ReadCsvLinesIntoDataFrame(lines, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn);
377-
}
373+
WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvStream, encoding ?? Encoding.UTF8);
374+
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn);
378375
}
379376

380377
/// <summary>

0 commit comments

Comments
 (0)