-
-
Notifications
You must be signed in to change notification settings - Fork 5.7k
RFC: Make EOL handling more general #19877
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
81d7c38
a04d81b
fb30c92
80c94b8
e3cfa28
cffdf0d
603baae
7e9cedd
bb7475e
3538aff
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -167,24 +167,37 @@ The text is assumed to be encoded in UTF-8. | |
| readuntil(filename::AbstractString, args...) = open(io->readuntil(io, args...), filename) | ||
|
|
||
| """ | ||
| readline(stream::IO=STDIN) | ||
| readline(filename::AbstractString) | ||
| readline(stream::IO=STDIN, chomp::Bool=false) | ||
| readline(filename::AbstractString, chomp::Bool=false) | ||
|
|
||
| Read a single line of text including from the given I/O stream or file (defaults to `STDIN`). | ||
| Lines in the input can end in `'\\n'`, `'\\r'`, or `'\\r\\n'`. When reading from a file, the text is | ||
| assumed to be encoded in UTF-8. If `chomp=false` trailing newline character(s) will be included | ||
| in the output (if reached before the end of the input); otherwise newline characters(s) | ||
| are stripped from result. | ||
| """ | ||
| function readline(filename::AbstractString, chomp = false) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Still need to mention |
||
| open(filename) do f | ||
| readline(f, chomp) | ||
| end | ||
| end | ||
|
|
||
| Read a single line of text, including a trailing newline character (if one is reached before | ||
| the end of the input), from the given I/O stream or file (defaults to `STDIN`). | ||
| When reading from a file, the text is assumed to be encoded in UTF-8. | ||
| """ | ||
| readline(filename::AbstractString) = open(readline, filename) | ||
|
|
||
| """ | ||
| readlines(stream::IO) | ||
| readlines(filename::AbstractString) | ||
| readlines(stream::IO, chomp::Bool=false) | ||
| readlines(filename::AbstractString, chomp::Bool=false) | ||
|
|
||
| Read all lines of an I/O stream or a file as a vector of strings. | ||
| The text is assumed to be encoded in UTF-8. | ||
| """ | ||
| readlines(filename::AbstractString) = open(readlines, filename) | ||
|
|
||
| Lines in the input can end in `'\\n'`, `'\\r'`, or `'\\r\\n'`. | ||
| The text is assumed to be encoded in UTF-8. If `chomp=false` | ||
| trailing newline character(s) will be included in the output; | ||
| otherwise newline characters(s) are stripped from result. | ||
| """ | ||
| function readlines(filename::AbstractString, chomp::Bool=false) | ||
| open(filename) do f | ||
| readlines(f, chomp) | ||
| end | ||
| end | ||
|
|
||
| ## byte-order mark, ntoh & hton ## | ||
|
|
||
|
|
@@ -448,7 +461,29 @@ function readuntil(s::IO, t::AbstractString) | |
| end | ||
|
|
||
| readline() = readline(STDIN) | ||
| readline(s::IO) = readuntil(s, '\n') | ||
|
|
||
| function readline(s::IO, chomp::Bool=false) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is no longer needed now, right?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's still needed. C version reads from IOStreams, but not for instance IOBuffers.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, so this means it has to be relatively efficient too. Is that the case?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd say its relatively efficient and anyway it has the same performance as before. |
||
| out = UInt8[] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not use an
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is significantly faster, readuntil uses the same approach. |
||
| while !eof(s) | ||
| c = read(s, UInt8) | ||
| if c == 0x0d | ||
| !chomp && push!(out, c) | ||
| if !eof(s) && Base.peek(s) == 0x0a | ||
| c = read(s, UInt8) | ||
| !chomp && push!(out, c) | ||
| end | ||
|
|
||
| break | ||
| elseif c == 0x0a | ||
| !chomp && push!(out, c) | ||
| break | ||
| else | ||
| push!(out, c) | ||
| end | ||
| end | ||
|
|
||
| return String(out) | ||
| end | ||
|
|
||
| """ | ||
| readchomp(x) | ||
|
|
@@ -512,22 +547,28 @@ readstring(filename::AbstractString) = open(readstring, filename) | |
|
|
||
| type EachLine | ||
| stream::IO | ||
| chomp::Bool | ||
| ondone::Function | ||
| EachLine(stream) = EachLine(stream, ()->nothing) | ||
| EachLine(stream, ondone) = new(stream, ondone) | ||
| EachLine(stream, chomp) = EachLine(stream, chomp, ()->nothing) | ||
| EachLine(stream, chomp, ondone) = new(stream, chomp, ondone) | ||
| end | ||
|
|
||
| """ | ||
| eachline(stream::IO) | ||
| eachline(filename::AbstractString) | ||
| eachline(stream::IO, chomp::Bool=false) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One space is enough. |
||
| eachline(filename::AbstractString, chomp::Bool=false) | ||
|
|
||
| Create an iterable object that will yield each line from an I/O stream or a file. | ||
| The text is assumed to be encoded in UTF-8. | ||
| Lines in the input can end in `'\\n'`, `'\\r'`, or `'\\r\\n'`. | ||
| The text is assumed to be encoded in UTF-8. If `chomp=false` | ||
| trailing newline character(s) will be included in the output; | ||
| otherwise newline characters(s) are stripped from result. | ||
| """ | ||
| eachline(stream::IO) = EachLine(stream) | ||
| function eachline(filename::AbstractString) | ||
| eachline(stream::IO, chomp::Bool=false) = EachLine(stream, chomp) | ||
|
|
||
|
|
||
| function eachline(filename::AbstractString, chomp::Bool=false) | ||
| s = open(filename) | ||
| EachLine(s, ()->close(s)) | ||
| EachLine(s, chomp, ()->close(s)) | ||
| end | ||
|
|
||
| start(itr::EachLine) = nothing | ||
|
|
@@ -538,10 +579,11 @@ function done(itr::EachLine, nada) | |
| itr.ondone() | ||
| true | ||
| end | ||
| next(itr::EachLine, nada) = (readline(itr.stream), nothing) | ||
|
|
||
| next(itr::EachLine, nada) = (readline(itr.stream, itr.chomp), nothing) | ||
| eltype(::Type{EachLine}) = String | ||
|
|
||
| readlines(s=STDIN) = collect(eachline(s)) | ||
| readlines(s::IO=STDIN, chomp::Bool=false) = collect(eachline(s, chomp)) | ||
|
|
||
| iteratorsize(::Type{EachLine}) = SizeUnknown() | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -77,18 +77,23 @@ chop(s::AbstractString) = SubString(s, 1, endof(s)-1) | |
| chomp(s::AbstractString) | ||
|
|
||
| Remove a single trailing newline from a string. | ||
| Lines in the input can end in `'\\n'`, `'\\r'`, or `'\\r\\n'`. | ||
| """ | ||
| function chomp(s::AbstractString) | ||
| i = endof(s) | ||
| if (i < 1 || s[i] != '\n') return SubString(s, 1, i) end | ||
| if (i < 1 || (s[i] != '\n' && s[i] != '\r')) return SubString(s, 1, i) end | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Double space here. |
||
| if (s[i] == '\r') return SubString(s, 1, i-1) end | ||
| j = prevind(s,i) | ||
| if (j < 1 || s[j] != '\r') return SubString(s, 1, i-1) end | ||
| if (j < 1 || (s[j] != '\r' && s[i] == '\n')) return SubString(s, 1, i-1) end | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You don't need to check that the |
||
| return SubString(s, 1, j-1) | ||
| end | ||
|
|
||
| function chomp(s::String) | ||
| i = endof(s) | ||
| if i < 1 || s.data[i] != 0x0a | ||
| if i < 1 || (s.data[i] != 0x0a && s.data[i] != 0x0d) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since you're making the function more complex, I would find it clearer to replace the integer codes with |
||
| SubString(s, 1, i) | ||
| elseif s.data[i] == 0x0d | ||
| SubString(s, 1, i-1) | ||
| elseif i < 2 || s.data[i-1] != 0x0d | ||
| SubString(s, 1, i-1) | ||
| else | ||
|
|
@@ -101,6 +106,8 @@ function chomp!(s::String) | |
| if !isempty(s) && s.data[end] == 0x0a | ||
| n = (endof(s) < 2 || s.data[end-1] != 0x0d) ? 1 : 2 | ||
| ccall(:jl_array_del_end, Void, (Any, UInt), s.data, n) | ||
| elseif s.data[end] == 0x0d | ||
| ccall(:jl_array_del_end, Void, (Any, UInt), s.data, 1) | ||
| end | ||
| return s | ||
| end | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -831,6 +831,66 @@ size_t ios_copyuntil(ios_t *to, ios_t *from, char delim) | |
| return total; | ||
| } | ||
|
|
||
| // Copy until '\r', '\n' or '\r\n' | ||
| size_t ios_copyline(ios_t *to, ios_t *from, int chomp) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Existing code uses
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, it's just that I found a |
||
| { | ||
| size_t nchomp = 0; | ||
| size_t total = 0, avail = (size_t)(from->size - from->bpos); | ||
| size_t ntowrite; | ||
| while (!ios_eof(from)) { | ||
| if (avail == 0) { | ||
| avail = ios_readprep(from, LINE_CHUNK_SIZE); | ||
| if (avail == 0) | ||
| break; | ||
| } | ||
| size_t written; | ||
|
|
||
| char *r = NULL; | ||
| char *n = NULL; | ||
|
|
||
| for (size_t i = 0; i < avail; i++){ | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wouldn't it be simpler and possibly more efficient to use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's the first thing I tried and it's simpler and significantly slower. You have to look for both This has the same performance as
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't follow. What difference does it make from the manual loop? I just suggest breaking on the first match. Then you only have to call
Even, for relatively long strings? I'm asking because it looks like glibc contains hand-written assembly code here and explicit SSE instructions here for this, so I figure there must be a reason.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually I tried using two calls to Most of the time in
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK. Anyway I've just realized |
||
| char *p = (char*)from->buf+from->bpos+i; | ||
| char ch = from->buf[from->bpos+i]; | ||
|
|
||
| if (ch == '\n'){ | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Space before |
||
| n = p; | ||
| if (chomp) nchomp = 1; | ||
| ntowrite = n - (from->buf+from->bpos) + 1 - nchomp; | ||
| break; | ||
| } | ||
| if (ch == '\r'){ | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing |
||
| r = p; | ||
| if (chomp) nchomp = 1; | ||
| ntowrite = r - (from->buf+from->bpos) + 1 - nchomp; | ||
| if (i < avail){ | ||
| char ch2 = from->buf[from->bpos+i+1]; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can combine the two |
||
| if (ch2 == '\n'){ | ||
| if (chomp) nchomp = 2; | ||
| ntowrite = r - (from->buf+from->bpos) + 2 - nchomp; | ||
| } | ||
| } | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| if (r == NULL && n == NULL) { | ||
| written = ios_write(to, from->buf+from->bpos, avail); | ||
| from->bpos += avail; | ||
| total += written; | ||
| avail = 0; | ||
| } | ||
| else { | ||
| written = ios_write(to, from->buf+from->bpos, ntowrite); | ||
| from->bpos += ntowrite + nchomp; | ||
| total += written; | ||
| return total; | ||
| } | ||
| } | ||
| from->_eof = 1; | ||
| return total; | ||
| } | ||
|
|
||
|
|
||
| static void _ios_init(ios_t *s) | ||
| { | ||
| // put all fields in a sane initial state | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -74,6 +74,20 @@ Base.compact(io) | |
| @test_throws ArgumentError seek(io,0) | ||
| @test_throws ArgumentError truncate(io,0) | ||
| @test readline(io) == "whipped cream\n" | ||
| @test write(io,"pancakes\r\nwaffles\n\rblueberries\r") > 0 | ||
| @test readline(io) == "pancakes\r\n" | ||
| @test readline(io) == "waffles\n" | ||
| @test readline(io) == "\r" | ||
| @test readline(io) == "blueberries\r" | ||
| write(io,"pancakes\r\nwaffles\n\rblueberries\r") | ||
| @test readline(io, true) == "pancakes" | ||
| @test readline(io, true) == "waffles" | ||
| @test readline(io, true) == "" | ||
| @test readline(io, true) == "blueberries" | ||
| write(io,"pancakes\r\nwaffles\n\rblueberries\r") | ||
| @test readlines(io) == String["pancakes\r\n","waffles\n","\r","blueberries\r"] | ||
| write(io,"pancakes\r\nwaffles\n\rblueberries\r") | ||
| @test readlines(io, true) == String["pancakes","waffles","","blueberries"] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you also add tests for potentially problematic cases like |
||
| Base.compact(io) | ||
| @test position(io) == 0 | ||
| @test ioslength(io) == 0 | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove "including" (typo?).