11# This file is a part of Julia. License is MIT: http://julialang.org/license
22
3- immutable UTF16String <: AbstractString
4- data:: Array{UInt16,1} # includes 16-bit NULL termination after string chars
5- function UTF16String (data:: Vector{UInt16} )
6- if length (data) < 1 || data[end ] != 0
7- throw (ArgumentError (" UTF16String data must be NULL-terminated" ))
8- end
9- new (data)
10- end
11- end
12-
13- utf16_is_lead (c:: UInt16 ) = (c & 0xfc00 ) == 0xd800
14- utf16_is_trail (c:: UInt16 ) = (c & 0xfc00 ) == 0xdc00
15- utf16_is_surrogate (c:: UInt16 ) = (c & 0xf800 ) == 0xd800
16- utf16_get_supplementary (lead:: UInt16 , trail:: UInt16 ) = Char (UInt32 (lead- 0xd7f7 )<< 10 + trail)
17-
183function length (s:: UTF16String )
194 d = s. data
205 len = length (d) - 1
216 len == 0 && return 0
227 cnum = 0
238 for i = 1 : len
24- @inbounds cnum += ! utf16_is_trail (d[i])
9+ @inbounds cnum += ! is_surrogate_trail (d[i])
2510 end
2611 cnum
2712end
@@ -30,92 +15,69 @@ function endof(s::UTF16String)
3015 d = s. data
3116 i = length (d) - 1
3217 i == 0 && return i
33- utf16_is_surrogate (d[i]) ? i- 1 : i
18+ return is_surrogate_codeunit (d[i]) ? i- 1 : i
3419end
3520
21+ get_supplementary (lead:: Unsigned , trail:: Unsigned ) = (UInt32 (lead- 0xd7f7 )<< 10 + trail)
22+
3623function next (s:: UTF16String , i:: Int )
37- if ! utf16_is_surrogate (s. data[i])
38- return Char (s. data[i]), i+ 1
39- elseif length (s. data)- 1 > i && utf16_is_lead (s. data[i]) && utf16_is_trail (s. data[i+ 1 ])
40- return utf16_get_supplementary (s. data[i], s. data[i+ 1 ]), i+ 2
41- end
42- throw (ArgumentError (" invalid UTF-16 character index" ))
24+ ch = s. data[i]
25+ ! is_surrogate_codeunit (ch) && return (Char (ch), i+ 1 )
26+ # check length, account for terminating \0
27+ i >= (length (s. data)- 1 ) && utf_errfunc (UTF_ERR_MISSING_SURROGATE, i, UInt32 (ch))
28+ ! is_surrogate_lead (ch) && utf_errfunc (UTF_ERR_NOT_LEAD, i, ch)
29+ ct = s. data[i+ 1 ]
30+ ! is_surrogate_trail (ct) && utf_errfunc (UTF_ERR_NOT_TRAIL, i, ch)
31+ Char (get_supplementary (ch, ct)), i+ 2
4332end
4433
4534function reverseind (s:: UTF16String , i:: Integer )
4635 j = length (s. data) - i
47- return Base . utf16_is_trail (s. data[j]) ? j- 1 : j
36+ return is_surrogate_trail (s. data[j]) ? j- 1 : j
4837end
4938
5039lastidx (s:: UTF16String ) = length (s. data) - 1 # s.data includes NULL terminator
5140
5241function reverse (s:: UTF16String )
53- d = s. data
42+ d = s. data
5443 out = similar (d)
5544 out[end ] = 0 # NULL termination
5645 n = length (d)
57- for i = 1 : n- 1
58- out[i] = d[n- i]
59- if Base. utf16_is_lead (out[i])
60- out[i],out[i- 1 ] = out[i- 1 ],out[i]
61- end
62- end
63- return UTF16String (out)
64- end
65-
66- # TODO : optimize this
67- function encode16 (s:: AbstractString )
68- buf = UInt16[]
69- for ch in s
70- c = reinterpret (UInt32, ch)
71- if c < 0x10000
72- push! (buf, UInt16 (c))
73- elseif c <= 0x10ffff
74- push! (buf, UInt16 (0xd7c0 + (c>> 10 )))
75- push! (buf, UInt16 (0xdc00 + (c & 0x3ff )))
46+ @inbounds for i = 1 : n- 1
47+ ch = d[n- i]
48+ if is_surrogate_lead (ch)
49+ out[i],out[i- 1 ] = out[i- 1 ],ch
7650 else
77- throw ( ArgumentError ( " invalid Unicode character (0x $( hex (c)) > 0x10ffff) " ))
51+ out[i] = ch
7852 end
7953 end
80- push! (buf, 0 ) # NULL termination
81- UTF16String (buf)
54+ UTF16String (out)
8255end
8356
84- utf16 (x) = convert (UTF16String, x)
85- convert (:: Type{UTF16String} , s:: UTF16String ) = s
86- convert (:: Type{UTF16String} , s:: AbstractString ) = encode16 (s)
87- convert (:: Type{Array{UInt16,1}} , s:: UTF16String ) = s. data
88- convert (:: Type{Array{UInt16}} , s:: UTF16String ) = s. data
89-
90- # TODO : optimize this
91- convert (:: Type{UTF8String} , s:: UTF16String ) =
92- sprint (length (s. data)- 1 , io-> for c in s; write (io,c:: Char ); end )
93-
9457sizeof (s:: UTF16String ) = sizeof (s. data) - sizeof (UInt16)
95- unsafe_convert {T<:Union(Int16,UInt16)} (:: Type{Ptr{T}} , s:: UTF16String ) =
96- convert (Ptr{T}, pointer (s))
9758
9859function isvalid (:: Type{UTF16String} , data:: AbstractArray{UInt16} )
9960 i = 1
10061 n = length (data) # this may include NULL termination; that's okay
101- while i < n # check for unpaired surrogates
102- if utf16_is_lead (data[i]) && utf16_is_trail (data[i+ 1 ])
62+ @inbounds while i < n # check for unpaired surrogates
63+ if is_surrogate_lead (data[i]) && is_surrogate_trail (data[i+ 1 ])
10364 i += 2
104- elseif utf16_is_surrogate (data[i])
65+ elseif is_surrogate_codeunit (data[i])
10566 return false
10667 else
10768 i += 1
10869 end
10970 end
110- return i > n || ! utf16_is_surrogate (data[i])
71+ return i > n || ! is_surrogate_codeunit (data[i])
11172end
11273
74+ unsafe_convert {T<:Union(Int16,UInt16)} (:: Type{Ptr{T}} , s:: UTF16String ) =
75+ convert (Ptr{T}, pointer (s))
76+
11377function convert (:: Type{UTF16String} , data:: AbstractVector{UInt16} )
11478 ! isvalid (UTF16String, data) && throw (ArgumentError (" invalid UTF16 data" ))
11579 len = length (data)
116- d = Array (UInt16, len + 1 )
117- d[end ] = 0 # NULL terminate
118- UTF16String (copy! (d,1 , data,1 , len))
80+ @inbounds return UTF16String (setindex! (copy! (Vector {UInt16} (len+ 1 ),1 ,data,1 ,len),0 ,len+ 1 ))
11981end
12082
12183convert (T:: Type{UTF16String} , data:: AbstractArray{UInt16} ) =
@@ -146,6 +108,7 @@ function convert(T::Type{UTF16String}, bytes::AbstractArray{UInt8})
146108 UTF16String (d)
147109end
148110
111+ utf16 (x) = convert (UTF16String, x)
149112utf16 (p:: Ptr{UInt16} , len:: Integer ) = utf16 (pointer_to_array (p, len))
150113utf16 (p:: Ptr{Int16} , len:: Integer ) = utf16 (convert (Ptr{UInt16}, p), len)
151114function utf16 (p:: Union(Ptr{UInt16}, Ptr{Int16}) )
0 commit comments