From 0af9aa52fb63eb972bad3c81773bf94e859c47b3 Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Thu, 29 Sep 2016 10:56:48 -0400 Subject: [PATCH 01/15] Fix segv --- pandas/src/parser/tokenizer.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index cad5d98dde53a..c8f8c7c21799c 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -1468,13 +1468,8 @@ int parser_trim_buffers(parser_t *self) { Free memory */ size_t new_cap; - - /* trim stream */ - new_cap = _next_pow2(self->stream_len) + 1; - if (new_cap < self->stream_cap) { - self->stream = safe_realloc((void*) self->stream, new_cap); - self->stream_cap = new_cap; - } + void *newptr; + int i; /* trim words, word_starts */ new_cap = _next_pow2(self->words_len) + 1; @@ -1486,6 +1481,27 @@ int parser_trim_buffers(parser_t *self) { self->words_cap = new_cap; } + /* trim stream */ + new_cap = _next_pow2(self->stream_len) + 1; + if (new_cap < self->stream_cap) { + newptr = safe_realloc((void*) self->stream, new_cap); + // Update the pointers in the self->words array (char **) if `safe_realloc` + // moved the `self->stream` buffer. This block mirrors a similar block in + // `make_stream_space`. + if (self->stream != newptr) { + self->pword_start = newptr + self->word_start; + + for (i = 0; i < self->words_len; ++i) + { + self->words[i] = newptr + self->word_starts[i]; + } + } + + self->stream = newptr; + self->stream_cap = new_cap; + + } + /* trim line_start, line_fields */ new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { From 569acbc9bae764da25959242857d046af181c61b Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Thu, 29 Sep 2016 11:03:36 -0400 Subject: [PATCH 02/15] second fix --- pandas/src/parser/tokenizer.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index c8f8c7c21799c..7ac87473acf7b 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -280,7 +280,7 @@ void parser_free(parser_t *self) { static int make_stream_space(parser_t *self, size_t nbytes) { int i, status, cap; - void *orig_ptr; + void *orig_ptr, *newptr; // Can we fit potentially nbytes tokens (+ null terminators) in the stream? @@ -330,10 +330,11 @@ static int make_stream_space(parser_t *self, size_t nbytes) { // realloc took place if (cap != self->words_cap) { - self->word_starts = (int*) safe_realloc((void *) self->word_starts, - sizeof(int) * self->words_cap); - if (self->word_starts == NULL) { + newptr = safe_realloc((void *) self->word_starts, sizeof(int) * self->words_cap); + if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; + } else { + self->word_starts = (int*) newptr; } } @@ -362,11 +363,11 @@ static int make_stream_space(parser_t *self, size_t nbytes) { // realloc took place if (cap != self->lines_cap) { - self->line_fields = (int*) safe_realloc((void *) self->line_fields, - sizeof(int) * self->lines_cap); - - if (self->line_fields == NULL) { + newptr = safe_realloc((void *) self->line_fields, sizeof(int) * self->lines_cap); + if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; + } else { + self->line_fields = (int*) newptr; } } From 52754b532ae96677df810701ffc4a0fe0769e609 Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Thu, 29 Sep 2016 11:32:01 -0400 Subject: [PATCH 03/15] more fixes --- pandas/parser.pyx | 9 +++-- pandas/src/parser/tokenizer.c | 64 +++++++++++++++++++++++------------ 2 files changed, 49 insertions(+), 24 deletions(-) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 185cf1a752803..6070e0ed83b2c 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -747,7 +747,9 @@ cdef class TextReader: else: chunks.append(chunk) - parser_trim_buffers(self.parser) + status = parser_trim_buffers(self.parser) + if status < 0: + raise_parser_error('Error trimming data', self.parser) if len(chunks) == 0: raise StopIteration @@ -812,7 +814,10 @@ cdef class TextReader: # trim parser_consume_rows(self.parser, rows_read) if trim: - parser_trim_buffers(self.parser) + status = parser_trim_buffers(self.parser) + + if status < 0: + raise_parser_error('Error trimming data', self.parser) self.parser_start -= rows_read self._end_clock('Parser memory cleanup') diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 7ac87473acf7b..e70ed92500690 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -1475,42 +1475,62 @@ int parser_trim_buffers(parser_t *self) { /* trim words, word_starts */ new_cap = _next_pow2(self->words_len) + 1; if (new_cap < self->words_cap) { - self->words = (char**) safe_realloc((void*) self->words, - new_cap * sizeof(char*)); - self->word_starts = (int*) safe_realloc((void*) self->word_starts, - new_cap * sizeof(int)); - self->words_cap = new_cap; + newptr = safe_realloc((void*) self->words, new_cap * sizeof(char*)); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + self->words = (char**) newptr; + } + newptr = safe_realloc((void*) self->word_starts, new_cap * sizeof(int)); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + self->word_starts = (int*) newptr; + self->words_cap = new_cap; + } } /* trim stream */ new_cap = _next_pow2(self->stream_len) + 1; if (new_cap < self->stream_cap) { newptr = safe_realloc((void*) self->stream, new_cap); - // Update the pointers in the self->words array (char **) if `safe_realloc` - // moved the `self->stream` buffer. This block mirrors a similar block in - // `make_stream_space`. - if (self->stream != newptr) { - self->pword_start = newptr + self->word_start; - - for (i = 0; i < self->words_len; ++i) - { - self->words[i] = newptr + self->word_starts[i]; + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + // Update the pointers in the self->words array (char **) if `safe_realloc` + // moved the `self->stream` buffer. This block mirrors a similar block in + // `make_stream_space`. + if (self->stream != newptr) { + self->pword_start = newptr + self->word_start; + + for (i = 0; i < self->words_len; ++i) + { + self->words[i] = newptr + self->word_starts[i]; + } } - } - self->stream = newptr; - self->stream_cap = new_cap; + self->stream = newptr; + self->stream_cap = new_cap; + } } /* trim line_start, line_fields */ new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { - self->line_start = (int*) safe_realloc((void*) self->line_start, - new_cap * sizeof(int)); - self->line_fields = (int*) safe_realloc((void*) self->line_fields, - new_cap * sizeof(int)); - self->lines_cap = new_cap; + newptr = safe_realloc((void*) self->line_start, new_cap * sizeof(int)); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + self->line_start = (int*) newptr; + } + newptr = safe_realloc((void*) self->line_fields, new_cap * sizeof(int)); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + self->line_fields = (int*) newptr; + self->lines_cap = new_cap; + } } return 0; From 443aa406f419063f0bfc6ca6e86d9017dc989447 Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Thu, 29 Sep 2016 13:33:19 -0400 Subject: [PATCH 04/15] tracing and some fixes --- pandas/src/parser/tokenizer.c | 71 +++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 11 deletions(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index e70ed92500690..d0b55bc64895c 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -26,6 +26,9 @@ See LICENSE for the license #define READ_ERROR_OUT_OF_MEMORY 1 +#define VERBOSE +#undef TRACE +#define TRACE(X) printf X; /* * restore: @@ -53,6 +56,7 @@ static void *safe_realloc(void *buffer, size_t size) { // different-realloc-behaviour-in-linux-and-osx result = realloc(buffer, size); + TRACE(("safe_realloc: buffer = %p, size = %zu, result = %p\n", buffer, size, result)) if (result != NULL) { // errno gets set to 12 on my OS Xmachine in some cases even when the @@ -92,6 +96,7 @@ coliter_t *coliter_new(parser_t *self, int i) { static void free_if_not_null(void *ptr) { + TRACE(("free_if_not_null %p\n", ptr)) if (ptr != NULL) free(ptr); } @@ -291,11 +296,13 @@ static int make_stream_space(parser_t *self, size_t nbytes) { */ orig_ptr = (void *) self->stream; + TRACE(("\n\nmake_stream_space: nbytes = %zu. grow_buffer(self->stream...)\n", nbytes)) self->stream = (char*) grow_buffer((void *) self->stream, self->stream_len, &self->stream_cap, nbytes * 2, sizeof(char), &status); - + TRACE(("make_stream_space: self->stream=%p, self->stream_len = %zu, self->stream_cap=%zu, status=%zu\n", + self->stream, self->stream_len, self->stream_cap, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; } @@ -323,6 +330,8 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->words_len, &self->words_cap, nbytes, sizeof(char*), &status); + TRACE(("make_stream_space: grow_buffer(self->self->words, %zu, %zu, %zu, %d)\n", + self->words_len, self->words_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; } @@ -330,6 +339,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { // realloc took place if (cap != self->words_cap) { + TRACE(("make_stream_space: cap != self->words_cap, nbytes = %d, self->words_cap=%d\n", nbytes, self->words_cap)) newptr = safe_realloc((void *) self->word_starts, sizeof(int) * self->words_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; @@ -357,12 +367,15 @@ static int make_stream_space(parser_t *self, size_t nbytes) { self->lines + 1, &self->lines_cap, nbytes, sizeof(int), &status); + TRACE(("make_stream_space: grow_buffer(self->line_start, %zu, %zu, %zu, %d)\n", + self->lines + 1, self->lines_cap, nbytes, status)) if (status != 0) { return PARSER_OUT_OF_MEMORY; } // realloc took place if (cap != self->lines_cap) { + TRACE(("make_stream_space: cap != self->lines_cap, nbytes = %d\n", nbytes)) newptr = safe_realloc((void *) self->line_fields, sizeof(int) * self->lines_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; @@ -379,6 +392,14 @@ static int make_stream_space(parser_t *self, size_t nbytes) { static int push_char(parser_t *self, char c) { /* TRACE(("pushing %c \n", c)) */ + TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", self->stream_len+1, c, self->stream_cap)) + if (self->stream_len >= self->stream_cap) { + TRACE(("push_char: ERROR!!! self->stream_len(%d) >= self->stream_cap(%d)\n", + self->stream_len, self->stream_cap)) + self->error_msg = (char*) malloc(64); + sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); + return PARSER_OUT_OF_MEMORY; + } self->stream[self->stream_len++] = c; return 0; } @@ -387,6 +408,13 @@ static int P_INLINE end_field(parser_t *self) { // XXX cruft self->numeric_field = 0; + if (self->words_len >= self->words_cap) { + TRACE(("end_field: ERROR!!! self->words_len(%zu) >= self->words_cap(%zu)\n", self->words_len, self->words_cap)) + self->error_msg = (char*) malloc(64); + sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); + return PARSER_OUT_OF_MEMORY; + } + // null terminate token push_char(self, '\0'); @@ -524,6 +552,13 @@ static int end_line(parser_t *self) { /* printf("word at column 5: %s\n", COLITER_NEXT(it)); */ // good line, set new start point + // good line, set new start point + if (self->lines >= self->lines_cap) { + TRACE(("end_line: ERROR!!! self->lines(%zu) >= self->lines_cap(%zu)\n", self->lines, self->lines_cap)) \ + self->error_msg = (char*) malloc(100); \ + sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); \ + return PARSER_OUT_OF_MEMORY; \ + } self->line_start[self->lines] = (self->line_start[self->lines - 1] + fields); @@ -565,6 +600,8 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { status = 0; self->datapos = 0; self->data = self->cb_io(self->source, nbytes, &bytes_read, &status); + TRACE(("parser_buffer_bytes self->cb_io: nbytes=%zu, datalen: %d, status=%d\n", + nbytes, bytes_read, status)); self->datalen = bytes_read; if (status != REACHED_EOF && self->data == NULL) { @@ -593,17 +630,16 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { // printf("pushing %c\n", c); -#if defined(VERBOSE) #define PUSH_CHAR(c) \ - printf("Pushing %c, slen now: %d\n", c, slen); \ + TRACE(("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", c, slen, self->stream_cap, self->stream_len)) \ + if (slen >= maxstreamsize) { \ + TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= maxstreamsize(%d)\n", slen, maxstreamsize)) \ + self->error_msg = (char*) malloc(100); \ + sprintf(self->error_msg, "Buffer overflow caught - possible malformed input file.\n"); \ + return PARSER_OUT_OF_MEMORY; \ + } \ *stream++ = c; \ slen++; -#else -#define PUSH_CHAR(c) \ - *stream++ = c; \ - slen++; -#endif - // This is a little bit of a hack but works for now @@ -661,6 +697,7 @@ typedef int (*parser_op)(parser_t *self, size_t line_limit); int tokenize_delimited(parser_t *self, size_t line_limit) { int i, slen, start_lines; + long maxstreamsize; char c; char *stream; char *buf = self->data + self->datapos; @@ -675,6 +712,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit) stream = self->stream + self->stream_len; slen = self->stream_len; + maxstreamsize = self->stream_cap; TRACE(("%s\n", buf)); @@ -915,6 +953,7 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) { int i, slen, start_lines; + long maxstreamsize; char c; char *stream; char *buf = self->data + self->datapos; @@ -929,6 +968,7 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) stream = self->stream + self->stream_len; slen = self->stream_len; + maxstreamsize = self->stream_cap; TRACE(("%s\n", buf)); @@ -1117,6 +1157,7 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) int tokenize_whitespace(parser_t *self, size_t line_limit) { int i, slen, start_lines; + long maxstreamsize; char c; char *stream; char *buf = self->data + self->datapos; @@ -1130,6 +1171,7 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) stream = self->stream + self->stream_len; slen = self->stream_len; + maxstreamsize = self->stream_cap; TRACE(("%s\n", buf)); @@ -1475,6 +1517,7 @@ int parser_trim_buffers(parser_t *self) { /* trim words, word_starts */ new_cap = _next_pow2(self->words_len) + 1; if (new_cap < self->words_cap) { + TRACE(("parser_trim_buffers: new_cap < self->words_cap\n")); newptr = safe_realloc((void*) self->words, new_cap * sizeof(char*)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; @@ -1492,7 +1535,10 @@ int parser_trim_buffers(parser_t *self) { /* trim stream */ new_cap = _next_pow2(self->stream_len) + 1; + TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", + new_cap, self->stream_cap, self->lines_cap)); if (new_cap < self->stream_cap) { + TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n")); newptr = safe_realloc((void*) self->stream, new_cap); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; @@ -1518,6 +1564,7 @@ int parser_trim_buffers(parser_t *self) { /* trim line_start, line_fields */ new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { + TRACE(("parser_trim_buffers: new_cap < self->lines_cap\n")); newptr = safe_realloc((void*) self->line_start, new_cap * sizeof(int)); if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; @@ -1583,7 +1630,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { return 0; } - TRACE(("Asked to tokenize %d rows\n", (int) nrows)); + TRACE(("Asked to tokenize %d rows, datapos=%d, datalen=%d\n", \ + (int) nrows, self->datapos, self->datalen)); while (1) { if (!all && self->lines - start_lines >= nrows) @@ -1602,7 +1650,8 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) { } } - TRACE(("Trying to process %d bytes\n", self->datalen - self->datapos)); + TRACE(("_tokenize_helper: Trying to process %d bytes, datalen=%d, datapos= %d\n", + self->datalen - self->datapos, self->datalen, self->datapos)); /* TRACE(("sourcetype: %c, status: %d\n", self->sourcetype, status)); */ status = tokenize_bytes(self, nrows); From b90a7c07cfead2a3cbfe07e3e9a5b29d677e17f4 Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Thu, 29 Sep 2016 14:13:45 -0400 Subject: [PATCH 05/15] comment more verbose parts --- pandas/src/parser/tokenizer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index d0b55bc64895c..5442ab4fc8746 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -392,7 +392,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { static int push_char(parser_t *self, char c) { /* TRACE(("pushing %c \n", c)) */ - TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", self->stream_len+1, c, self->stream_cap)) + //TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", self->stream_len+1, c, self->stream_cap)) if (self->stream_len >= self->stream_cap) { TRACE(("push_char: ERROR!!! self->stream_len(%d) >= self->stream_cap(%d)\n", self->stream_len, self->stream_cap)) @@ -631,7 +631,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { // printf("pushing %c\n", c); #define PUSH_CHAR(c) \ - TRACE(("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", c, slen, self->stream_cap, self->stream_len)) \ + //TRACE(("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", c, slen, self->stream_cap, self->stream_len)) \ if (slen >= maxstreamsize) { \ TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= maxstreamsize(%d)\n", slen, maxstreamsize)) \ self->error_msg = (char*) malloc(100); \ @@ -714,16 +714,16 @@ int tokenize_delimited(parser_t *self, size_t line_limit) slen = self->stream_len; maxstreamsize = self->stream_cap; - TRACE(("%s\n", buf)); + //TRACE(("%s\n", buf)); for (i = self->datapos; i < self->datalen; ++i) { // Next character in file c = *buf++; - TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n", - i, c, self->file_lines + 1, self->line_fields[self->lines], - self->state)); + //TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n", + // i, c, self->file_lines + 1, self->line_fields[self->lines], + // self->state)); switch(self->state) { From 79382c50a790e312f31f9063803e9e432e18988c Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Thu, 29 Sep 2016 14:21:38 -0400 Subject: [PATCH 06/15] less verbose --- pandas/src/parser/tokenizer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 5442ab4fc8746..964f5f5d3c0d4 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -421,10 +421,10 @@ static int P_INLINE end_field(parser_t *self) { // set pointer and metadata self->words[self->words_len] = self->pword_start; - TRACE(("Char diff: %d\n", self->pword_start - self->words[0])); + //TRACE(("Char diff: %d\n", self->pword_start - self->words[0])); - TRACE(("Saw word %s at: %d. Total: %d\n", - self->pword_start, self->word_start, self->words_len + 1)) + //TRACE(("Saw word %s at: %d. Total: %d\n", + // self->pword_start, self->word_start, self->words_len + 1)) self->word_starts[self->words_len] = self->word_start; self->words_len++; From e2245fa65d2a2f0c83394cd66d3ff2f2ae9992f6 Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Thu, 29 Sep 2016 14:24:41 -0400 Subject: [PATCH 07/15] less verbose --- pandas/src/parser/tokenizer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 964f5f5d3c0d4..cebcf17e2881c 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -463,7 +463,7 @@ static int end_line(parser_t *self) { fields = self->line_fields[self->lines]; - TRACE(("Line end, nfields: %d\n", fields)); + //TRACE(("Line end, nfields: %d\n", fields)); if (self->lines > 0) { if (self->expected_fields >= 0) { @@ -477,7 +477,7 @@ static int end_line(parser_t *self) { k = kh_get_int64((kh_int64_t*) self->skipset, self->file_lines); if (k != ((kh_int64_t*)self->skipset)->n_buckets) { - TRACE(("Skipping row %d\n", self->file_lines)); + //TRACE(("Skipping row %d\n", self->file_lines)); // increment file line count self->file_lines++; From b03e27b141667f1450c8a08c674979855f9d6398 Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Thu, 29 Sep 2016 14:45:01 -0400 Subject: [PATCH 08/15] slight changes --- pandas/parser.pyx | 1 + pandas/src/parser/tokenizer.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 6070e0ed83b2c..564e60bc59677 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -721,6 +721,7 @@ cdef class TextReader: cdef: size_t rows_read = 0 chunks = [] + int status if rows is None: while True: diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index cebcf17e2881c..864bf63545906 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -26,7 +26,6 @@ See LICENSE for the license #define READ_ERROR_OUT_OF_MEMORY 1 -#define VERBOSE #undef TRACE #define TRACE(X) printf X; From 728cc6ebd28e336b267703ac0b8aed1fbbc31399 Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Thu, 29 Sep 2016 15:08:35 -0400 Subject: [PATCH 09/15] fix macro --- pandas/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 864bf63545906..1012fa1fae230 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -630,7 +630,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { // printf("pushing %c\n", c); #define PUSH_CHAR(c) \ - //TRACE(("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", c, slen, self->stream_cap, self->stream_len)) \ + /*TRACE(("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", c, slen, self->stream_cap, self->stream_len)) */ \ if (slen >= maxstreamsize) { \ TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= maxstreamsize(%d)\n", slen, maxstreamsize)) \ self->error_msg = (char*) malloc(100); \ From 93bf296fd7d2910880c2a644f9a769e9d0610967 Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Thu, 29 Sep 2016 15:11:21 -0400 Subject: [PATCH 10/15] fix bug --- pandas/src/parser/tokenizer.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 1012fa1fae230..c811f00b1676f 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -54,6 +54,11 @@ static void *safe_realloc(void *buffer, size_t size) { // http://stackoverflow.com/questions/9560609/ // different-realloc-behaviour-in-linux-and-osx + if (size == 0) + { + TRACE(("safe_realloc: asking for 0 length")); + return buffer; + } result = realloc(buffer, size); TRACE(("safe_realloc: buffer = %p, size = %zu, result = %p\n", buffer, size, result)) From eca4b352f97f33bb2e47822e25e4786ee96bbc60 Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Thu, 29 Sep 2016 15:17:43 -0400 Subject: [PATCH 11/15] issue with grow_buffer --- pandas/src/parser/tokenizer.c | 38 +++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index c811f00b1676f..68767ffc19a42 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -57,7 +57,7 @@ static void *safe_realloc(void *buffer, size_t size) { if (size == 0) { TRACE(("safe_realloc: asking for 0 length")); - return buffer; + abort(); } result = realloc(buffer, size); TRACE(("safe_realloc: buffer = %p, size = %zu, result = %p\n", buffer, size, result)) @@ -115,24 +115,28 @@ static void free_if_not_null(void *ptr) { static void *grow_buffer(void *buffer, int length, int *capacity, int space, int elsize, int *error) { - int cap = *capacity; + int cap = *capacity; + void *newbuffer = buffer; - // Can we fit potentially nbytes tokens (+ null terminators) in the stream? - while (length + space > cap) { - cap = cap? cap << 1 : 2; - - buffer = safe_realloc(buffer, elsize * cap); - - if (buffer == NULL) { - // TODO: error codes - *error = -1; - } - } + // Can we fit potentially nbytes tokens (+ null terminators) in the stream? + while ( (length + space >= cap) && (newbuffer != NULL) ){ + cap = cap? cap << 1 : 2; + buffer = newbuffer; + newbuffer = safe_realloc(newbuffer, elsize * cap); + } - // sigh, multiple return values - *capacity = cap; - *error = 0; - return buffer; + if (newbuffer == NULL) { + // realloc failed so don't change *capacity, set *error to errno + // and return the last good realloc'd buffer so it can be freed + *error = errno; + newbuffer = buffer; + } else { + // realloc worked, update *capacity and set *error to 0 + // sigh, multiple return values + *capacity = cap; + *error = 0; + } + return newbuffer; } From 29ea1d3b8bca09d3e1b701a6f63c3df0735d4811 Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Fri, 30 Sep 2016 06:57:05 -0400 Subject: [PATCH 12/15] backport skip line improvements --- pandas/src/parser/tokenizer.c | 86 +++++++++++++++++++++++++++-------- pandas/src/parser/tokenizer.h | 1 + 2 files changed, 69 insertions(+), 18 deletions(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 68767ffc19a42..fdf4f0c5a2ed4 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -480,22 +480,17 @@ static int end_line(parser_t *self) { ex_fields = self->line_fields[self->lines - 1]; } } + if (self->state == SKIP_LINE) { + TRACE(("Skipping row %d\n", self->file_lines)); + // increment file line count + self->file_lines++; - if (self->skipset != NULL) { - k = kh_get_int64((kh_int64_t*) self->skipset, self->file_lines); - - if (k != ((kh_int64_t*)self->skipset)->n_buckets) { - //TRACE(("Skipping row %d\n", self->file_lines)); - // increment file line count - self->file_lines++; - - // skip the tokens from this bad line - self->line_start[self->lines] += fields; + // skip the tokens from this bad line + self->line_start[self->lines] += fields; - // reset field count - self->line_fields[self->lines] = 0; - return 0; - } + // reset field count + self->line_fields[self->lines] = 0; + return 0; } /* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */ @@ -701,6 +696,14 @@ typedef int (*parser_op)(parser_t *self, size_t line_limit); TRACE(("datapos: %d, datalen: %d\n", self->datapos, self->datalen)); +int skip_this_line(parser_t *self, int64_t rownum) { + if (self->skipset != NULL) { + return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) != + ((kh_int64_t*)self->skipset)->n_buckets ); + } + return 0; +} + int tokenize_delimited(parser_t *self, size_t line_limit) { @@ -735,10 +738,25 @@ int tokenize_delimited(parser_t *self, size_t line_limit) switch(self->state) { + case SKIP_LINE: + TRACE(("tokenize_delimited SKIP_LINE %c, state %d\n", c, self->state)); + if (c == '\n') { + END_LINE(); + } + break; + case START_RECORD: // start of record - - if (c == '\n') { + if (skip_this_line(self, self->file_lines)) { + if (c == '\n') { + END_LINE() + } + else { + self->state = SKIP_LINE; + } + break; + } + else if (c == '\n') { // \n\r possible? END_LINE(); break; @@ -990,9 +1008,26 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) self->state)); switch(self->state) { + + case SKIP_LINE: + TRACE(("tokenize_delim_customterm SKIP_LINE %c, state %d\n", c, self->state)); + if (c == self->lineterminator) { + END_LINE(); + } + break; + case START_RECORD: // start of record - if (c == self->lineterminator) { + if (skip_this_line(self, self->file_lines)) { + if (c == self->lineterminator) { + END_LINE() + } + else { + self->state = SKIP_LINE; + } + break; + } + else if (c == self->lineterminator) { // \n\r possible? END_LINE(); break; @@ -1193,6 +1228,12 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) self->state)); switch(self->state) { + case SKIP_LINE: + TRACE(("tokenize_whitespace SKIP_LINE %c, state %d\n", c, self->state)); + if (c == '\n') { + END_LINE(); + } + break; case EAT_WHITESPACE: if (!IS_WHITESPACE(c)) { @@ -1206,7 +1247,16 @@ int tokenize_whitespace(parser_t *self, size_t line_limit) case START_RECORD: // start of record - if (c == '\n') { + if (skip_this_line(self, self->file_lines)) { + if (c == '\n') { + END_LINE() + } + else { + self->state = SKIP_LINE; + } + break; + } + else if (c == '\n') { // \n\r possible? END_LINE(); break; diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index 01f9397685da6..82d5d657e3478 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -123,6 +123,7 @@ typedef enum { EAT_CRNL, EAT_WHITESPACE, EAT_COMMENT, + SKIP_LINE, FINISHED } ParserState; From 5baf897a7e36583437210605a5d0f0897414cbc4 Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Fri, 30 Sep 2016 07:02:23 -0400 Subject: [PATCH 13/15] remove troubleshooting code --- pandas/src/parser/tokenizer.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index fdf4f0c5a2ed4..a0c5982373124 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -26,8 +26,6 @@ See LICENSE for the license #define READ_ERROR_OUT_OF_MEMORY 1 -#undef TRACE -#define TRACE(X) printf X; /* * restore: @@ -54,11 +52,6 @@ static void *safe_realloc(void *buffer, size_t size) { // http://stackoverflow.com/questions/9560609/ // different-realloc-behaviour-in-linux-and-osx - if (size == 0) - { - TRACE(("safe_realloc: asking for 0 length")); - abort(); - } result = realloc(buffer, size); TRACE(("safe_realloc: buffer = %p, size = %zu, result = %p\n", buffer, size, result)) From b022a0ed3a61e015811945d3c51a5d67bf9c282f Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Fri, 30 Sep 2016 07:04:55 -0400 Subject: [PATCH 14/15] more cleanup --- pandas/src/parser/tokenizer.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index a0c5982373124..0a041005b808b 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -422,10 +422,10 @@ static int P_INLINE end_field(parser_t *self) { // set pointer and metadata self->words[self->words_len] = self->pword_start; - //TRACE(("Char diff: %d\n", self->pword_start - self->words[0])); + TRACE(("Char diff: %d\n", self->pword_start - self->words[0])); - //TRACE(("Saw word %s at: %d. Total: %d\n", - // self->pword_start, self->word_start, self->words_len + 1)) + TRACE(("Saw word %s at: %d. Total: %d\n", + self->pword_start, self->word_start, self->words_len + 1)) self->word_starts[self->words_len] = self->word_start; self->words_len++; @@ -464,7 +464,7 @@ static int end_line(parser_t *self) { fields = self->line_fields[self->lines]; - //TRACE(("Line end, nfields: %d\n", fields)); + TRACE(("Line end, nfields: %d\n", fields)); if (self->lines > 0) { if (self->expected_fields >= 0) { @@ -627,7 +627,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) { // printf("pushing %c\n", c); #define PUSH_CHAR(c) \ - /*TRACE(("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", c, slen, self->stream_cap, self->stream_len)) */ \ + TRACE(("PUSH_CHAR: Pushing %c, slen= %d, stream_cap=%zu, stream_len=%zu\n", c, slen, self->stream_cap, self->stream_len)) \ if (slen >= maxstreamsize) { \ TRACE(("PUSH_CHAR: ERROR!!! slen(%d) >= maxstreamsize(%d)\n", slen, maxstreamsize)) \ self->error_msg = (char*) malloc(100); \ @@ -718,16 +718,16 @@ int tokenize_delimited(parser_t *self, size_t line_limit) slen = self->stream_len; maxstreamsize = self->stream_cap; - //TRACE(("%s\n", buf)); + TRACE(("%s\n", buf)); for (i = self->datapos; i < self->datalen; ++i) { // Next character in file c = *buf++; - //TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n", - // i, c, self->file_lines + 1, self->line_fields[self->lines], - // self->state)); + TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n", + i, c, self->file_lines + 1, self->line_fields[self->lines], + self->state)); switch(self->state) { From ed47cfeed708d3c9f2fe5e144511c2557b67783d Mon Sep 17 00:00:00 2001 From: Brett Rosen Date: Fri, 30 Sep 2016 07:06:50 -0400 Subject: [PATCH 15/15] final cleanup --- pandas/src/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 0a041005b808b..79de1beeb1cd2 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -393,7 +393,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) { static int push_char(parser_t *self, char c) { /* TRACE(("pushing %c \n", c)) */ - //TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", self->stream_len+1, c, self->stream_cap)) + TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n", self->stream_len+1, c, self->stream_cap)) if (self->stream_len >= self->stream_cap) { TRACE(("push_char: ERROR!!! self->stream_len(%d) >= self->stream_cap(%d)\n", self->stream_len, self->stream_cap))