Skip to content

Commit 9b881ff

Browse files
committed
possible simple approach to faster String allocation
[ci skip]
1 parent 2a4b68a commit 9b881ff

File tree

5 files changed

+37
-51
lines changed

5 files changed

+37
-51
lines changed

base/boot.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -221,10 +221,10 @@ end
221221

222222
abstract DirectIndexString <: AbstractString
223223

224-
immutable String <: AbstractString
225-
data::Array{UInt8,1}
226-
# required to make String("foo") work (#15120):
227-
String(d::Array{UInt8,1}) = new(d)
224+
type String <: AbstractString
225+
len::Int
226+
String(p::Ptr{UInt8}, len::Int) =
227+
ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
228228
end
229229

230230
# This should always be inlined

base/strings/basic.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
5757
ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
5858
end
5959

60-
convert(::Type{Vector{UInt8}}, s::AbstractString) = String(s).data
61-
convert(::Type{Array{UInt8}}, s::AbstractString) = String(s).data
60+
#convert(::Type{Vector{UInt8}}, s::AbstractString) = String(s).data
61+
#convert(::Type{Array{UInt8}}, s::AbstractString) = String(s).data
6262
convert(::Type{String}, s::AbstractString) = String(s)
6363
convert(::Type{Vector{Char}}, s::AbstractString) = collect(s)
6464
convert(::Type{Symbol}, s::AbstractString) = Symbol(s)

base/strings/string.jl

Lines changed: 25 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -34,46 +34,35 @@ const utf8_trailing = [
3434
## required core functionality ##
3535

3636
function endof(s::String)
37-
d = s.data
38-
i = length(d)
39-
@inbounds while i > 0 && is_valid_continuation(d[i])
37+
p = pointer(s)
38+
i = s.len
39+
while i > 0 && is_valid_continuation(unsafe_load(p,i))
4040
i -= 1
4141
end
4242
i
4343
end
4444

4545
function length(s::String)
46-
d = s.data
46+
p = pointer(s)
4747
cnum = 0
48-
for i = 1:length(d)
49-
@inbounds cnum += !is_valid_continuation(d[i])
48+
for i = 1:s.len
49+
cnum += !is_valid_continuation(unsafe_load(p,i))
5050
end
5151
cnum
5252
end
5353

54-
@noinline function slow_utf8_next(d::Vector{UInt8}, b::UInt8, i::Int)
55-
# potentially faster version
56-
# d = s.data
57-
# a::UInt32 = d[i]
58-
# if a < 0x80; return Char(a); end
59-
# #if a&0xc0==0x80; return '\ufffd'; end
60-
# b::UInt32 = a<<6 + d[i+1]
61-
# if a < 0xe0; return Char(b - 0x00003080); end
62-
# c::UInt32 = b<<6 + d[i+2]
63-
# if a < 0xf0; return Char(c - 0x000e2080); end
64-
# return Char(c<<6 + d[i+3] - 0x03c82080)
65-
54+
@noinline function slow_utf8_next(p::Ptr{UInt8}, b::UInt8, i::Int, l::Int)
6655
if is_valid_continuation(b)
67-
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, d[i]))
56+
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, unsafe_load(p,i)))
6857
end
6958
trailing = utf8_trailing[b + 1]
70-
if length(d) < i + trailing
59+
if l < i + trailing
7160
return '\ufffd', i+1
7261
end
7362
c::UInt32 = 0
7463
for j = 1:(trailing + 1)
7564
c <<= 6
76-
c += d[i]
65+
c += unsafe_load(p,i)
7766
i += 1
7867
end
7968
c -= utf8_offset[trailing + 1]
@@ -84,12 +73,15 @@ end
8473
# function is split into this critical fast-path
8574
# for pure ascii data, such as parsing numbers,
8675
# and a longer function that can handle any utf8 data
87-
d = s.data
88-
b = d[i]
76+
if i < 1 || i > s.len
77+
throw(BoundsError(s,i))
78+
end
79+
p = pointer(s)
80+
b = unsafe_load(p, i)
8981
if b < 0x80
90-
return Char(b), i + 1
82+
return Char(b), i+1
9183
end
92-
return slow_utf8_next(d, b, i)
84+
return slow_utf8_next(p, b, i, s.len)
9385
end
9486

9587
function first_utf8_byte(ch::Char)
@@ -102,20 +94,20 @@ function first_utf8_byte(ch::Char)
10294
end
10395

10496
function reverseind(s::String, i::Integer)
105-
j = length(s.data) + 1 - i
106-
d = s.data
107-
while is_valid_continuation(d[j])
97+
j = s.len + 1 - i
98+
p = pointer(s)
99+
while is_valid_continuation(unsafe_load(p,j))
108100
j -= 1
109101
end
110102
return j
111103
end
112104

113105
## overload methods for efficiency ##
114106

115-
sizeof(s::String) = sizeof(s.data)
107+
sizeof(s::String) = s.len
116108

117109
isvalid(s::String, i::Integer) =
118-
(1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])
110+
(1 <= i <= s.len) && !is_valid_continuation(unsafe_load(pointer(s),i))
119111

120112
const empty_utf8 = String(UInt8[])
121113

@@ -237,10 +229,10 @@ function reverse(s::String)
237229
String(buf)
238230
end
239231

240-
write(io::IO, s::String) = write(io, s.data)
232+
write(io::IO, s::String) = write(io, pointer(s), s.len)
241233

242-
pointer(x::String) = pointer(x.data)
243-
pointer(x::String, i::Integer) = pointer(x.data)+(i-1)
234+
pointer(s::String) = convert(Ptr{UInt8}, pointer_from_objref(s)+sizeof(Int))
235+
pointer(x::String, i::Integer) = pointer(x)+(i-1)
244236

245237
convert(::Type{String}, s::String) = s
246238
convert(::Type{String}, v::Vector{UInt8}) = String(v)

src/array.c

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -369,20 +369,14 @@ JL_DLLEXPORT jl_array_t *jl_pchar_to_array(const char *str, size_t len)
369369

370370
JL_DLLEXPORT jl_value_t *jl_array_to_string(jl_array_t *a)
371371
{
372-
jl_ptls_t ptls = jl_get_ptls_states();
373-
if (!jl_typeis(a, jl_array_uint8_type))
374-
jl_type_error("jl_array_to_string", (jl_value_t*)jl_array_uint8_type, (jl_value_t*)a);
375-
jl_value_t *s = jl_gc_alloc(ptls, sizeof(void*), jl_string_type);
376-
jl_set_nth_field(s, 0, (jl_value_t*)a);
377-
return s;
372+
return jl_pchar_to_string(jl_array_data(a), jl_array_len(a));
378373
}
379374

380375
JL_DLLEXPORT jl_value_t *jl_pchar_to_string(const char *str, size_t len)
381376
{
382-
jl_array_t *a = jl_pchar_to_array(str, len);
383-
JL_GC_PUSH1(&a);
384-
jl_value_t *s = jl_array_to_string(a);
385-
JL_GC_POP();
377+
jl_value_t *s = jl_gc_allocobj(jl_get_ptls_states(), sizeof(void*)+len, jl_string_type);
378+
*(size_t*)s = len;
379+
memcpy((char*)s + sizeof(void*), str, len);
386380
return s;
387381
}
388382

src/julia.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -756,8 +756,8 @@ STATIC_INLINE void jl_array_uint8_set(void *a, size_t i, uint8_t x)
756756
#define jl_data_ptr(v) ((jl_value_t**)v)
757757

758758
#define jl_array_ptr_data(a) ((jl_value_t**)((jl_array_t*)a)->data)
759-
#define jl_string_data(s) ((char*)((jl_array_t*)jl_data_ptr(s)[0])->data)
760-
#define jl_string_len(s) (jl_array_len((jl_array_t*)(jl_data_ptr(s)[0])))
759+
#define jl_string_data(s) ((char*)s + sizeof(void*))
760+
#define jl_string_len(s) (*(size_t*)s)
761761
#define jl_iostr_data(s) ((char*)((jl_array_t*)jl_data_ptr(s)[0])->data)
762762

763763
#define jl_gf_mtable(f) (((jl_datatype_t*)jl_typeof(f))->name->mt)

0 commit comments

Comments
 (0)