11# This file is a part of Julia. License is MIT: https://julialang.org/license
22
3- struct InvalidCharError <: Exception
4- char:: Char
3+ """
4+ The `AbstractChar` type is the supertype of all character implementations
5+ in Julia. A character represents a Unicode code point, and can be converted
6+ to/from `UInt32` in order to obtain the numerical value of the code point.
7+ These numerical values determine how characters are compared with `<` and `==`,
8+ for example.
9+
10+ A given `AbstractChar` subtype may be capable of representing only a subset
11+ of Unicode, in which case conversion from an unsupported `UInt32` value
12+ may throw an error. Conversely, the built-in [`Char`](@ref) type represents
13+ a *superset* of Unicode (in order to losslessly encode invalid byte streams),
14+ in which case conversion of a non-Unicode value *to* `UInt32` throws an error.
15+ The [`isvalid`](@ref) function can be used to check which code points are
16+ representable in a given `AbstractChar` type.
17+
18+ Internally, an `AbstractChar` type may use a variety of encodings. Conversion
19+ to `UInt32` will not reveal this encoding because it always returns the
20+ Unicode value of the character. (Typically, the raw encoding can be obtained
21+ via [`reinterpret`](@ref).)
22+ """
23+ AbstractChar
24+
25+ """
26+ Char(c::Union{Number,AbstractChar})
27+
28+ `Char` is a 32-bit [`AbstractChar`](@ref) type that is the default representation
29+ of characters in Julia. `Char` is the type used for character literals like `'x'`
30+ and it is also the element type of [`String`](@ref).
31+
32+ In order to losslessly represent arbitrary byte streams stored in a `String`,
33+ a `Char` value may store information that cannot be converted to a Unicode
34+ code point — converting such a `Char` to `UInt32` will throw an error.
35+ The [`isvalid(c::Char)`](@ref) function can be used to query whether `c`
36+ represents a valid Unicode character.
37+ """
38+ Char
39+
"""
    InvalidCharError{T<:AbstractChar} <: Exception

Exception that stores the offending character in its `char` field.
Thrown by `invalid_char`.
"""
struct InvalidCharError{T<:AbstractChar} <: Exception
    char::T
end
"""
    CodePointError{T<:Integer} <: Exception

Exception that stores an integer code in its `code` field.
Thrown by `code_point_err`.

The field type is a type parameter instead of the abstract `Integer`, so the
stored value keeps its concrete type. `CodePointError(u)` still constructs the
exception from any integer `u`.
"""
struct CodePointError{T<:Integer} <: Exception
    code::T
end
9- @noinline invalid_char (c:: Char ) = throw (InvalidCharError (c))
# Out-of-line throw helper: raises `InvalidCharError` wrapping the character `c`.
@noinline function invalid_char(c::AbstractChar)
    throw(InvalidCharError(c))
end

# Out-of-line throw helper: raises `CodePointError` wrapping the value `u`.
@noinline function code_point_err(u::UInt32)
    throw(CodePointError(u))
end
1148
1249function ismalformed (c:: Char )
@@ -24,6 +61,11 @@ function isoverlong(c::Char)
2461 is_overlong_enc (u)
2562end
2663
# Generic fallbacks: an `AbstractChar` type that does not add its own methods
# is assumed to have neither malformed nor overlong encodings.
function ismalformed(c::AbstractChar)
    return false
end
function isoverlong(c::AbstractChar)
    return false
end
68+
2769function UInt32 (c:: Char )
2870 # TODO : use optimized inline LLVM
2971 u = reinterpret (UInt32, c)
@@ -69,50 +111,57 @@ function Char(b::Union{Int8,UInt8})
69111 0 ≤ b ≤ 0x7f ? reinterpret (Char, (b % UInt32) << 24 ) : Char (UInt32 (b))
70112end
71113
72- convert (:: Type{Char} , x:: Number ) = Char (x)
73- convert (:: Type{T} , x:: Char ) where {T<: Number } = T (x)
# Conversions between numbers and characters call the target type's
# constructor. Asking for the abstract `AbstractChar` produces a `Char`.
function convert(::Type{AbstractChar}, x::Number)
    return Char(x)  # default to Char
end
function convert(::Type{T}, x::Number) where {T<:AbstractChar}
    return T(x)
end
function convert(::Type{T}, x::AbstractChar) where {T<:Number}
    return T(x)
end
74117
75- rem (x:: Char , :: Type{T} ) where {T<: Number } = rem (UInt32 (x), T)
# Remainder of a character with respect to a numeric type: the character is
# converted to `UInt32` first and `rem` is then applied to that value.
function rem(x::AbstractChar, ::Type{T}) where {T<:Number}
    code = UInt32(x)
    return rem(code, T)
end
76119
# The extreme `Char` values are the extreme `UInt32` bit patterns
# reinterpreted as `Char`.
function typemax(::Type{Char})
    return reinterpret(Char, typemax(UInt32))
end
function typemin(::Type{Char})
    return reinterpret(Char, typemin(UInt32))
end
79122
80- size (c:: Char ) = ()
81- size (c:: Char ,d) = convert (Int, d) < 1 ? throw (BoundsError ()) : 1
82- ndims (c:: Char ) = 0
83- ndims (:: Type{Char } ) = 0
84- length (c:: Char ) = 1
85- firstindex (c:: Char ) = 1
86- lastindex (c:: Char ) = 1
87- getindex (c:: Char ) = c
88- getindex (c:: Char , i:: Integer ) = i == 1 ? c : throw (BoundsError ())
89- getindex (c:: Char , I:: Integer... ) = all (x -> x == 1 , I) ? c : throw (BoundsError ())
90- first (c:: Char ) = c
91- last (c:: Char ) = c
92- eltype (:: Type{Char } ) = Char
93-
94- start (c:: Char ) = false
95- next (c:: Char , state) = (c, true )
96- done (c:: Char , state) = state
97- isempty (c:: Char ) = false
98- in (x:: Char , y:: Char ) = x == y
# Scalar collection interface: every `AbstractChar` behaves like a
# zero-dimensional container whose single element is the character itself.
function size(c::AbstractChar)
    return ()
end
function size(c::AbstractChar, d)
    # dimensions below 1 are rejected; any other dimension has extent 1
    if convert(Int, d) < 1
        throw(BoundsError())
    end
    return 1
end
function ndims(c::AbstractChar)
    return 0
end
function ndims(::Type{<:AbstractChar})
    return 0
end
function length(c::AbstractChar)
    return 1
end
function firstindex(c::AbstractChar)
    return 1
end
function lastindex(c::AbstractChar)
    return 1
end
function getindex(c::AbstractChar)
    return c
end
function getindex(c::AbstractChar, i::Integer)
    i == 1 || throw(BoundsError())
    return c
end
function getindex(c::AbstractChar, I::Integer...)
    # every supplied index must be 1
    all(x -> x == 1, I) || throw(BoundsError())
    return c
end
function first(c::AbstractChar)
    return c
end
function last(c::AbstractChar)
    return c
end
function eltype(::Type{T}) where {T<:AbstractChar}
    return T
end
136+
# Iteration via `start`/`next`/`done`: the state is a Bool flag that is
# `false` before the single element has been produced and `true` afterwards.
function start(c::AbstractChar)
    return false
end
function next(c::AbstractChar, state)
    return (c, true)
end
function done(c::AbstractChar, state)
    return state
end
function isempty(c::AbstractChar)
    return false
end
# Membership in a character is equality with that character.
function in(x::AbstractChar, y::AbstractChar)
    return x == y
end
99142
# `Char` equality, ordering and hashing all operate on the raw 32-bit
# representation obtained with `reinterpret`.
function ==(x::Char, y::Char)
    return reinterpret(UInt32, x) == reinterpret(UInt32, y)
end
function isless(x::Char, y::Char)
    return reinterpret(UInt32, x) < reinterpret(UInt32, y)
end
function hash(x::Char, h::UInt)
    bits = reinterpret(UInt32, x)
    # add the salt, move the result into the upper 32 bits, then mix in `h`
    salted = (bits + UInt64(0xd4d64234)) << 32
    return hash_uint64(salted ⊻ UInt64(h))
end
104- widen (:: Type{Char} ) = Char
105147
106- - (x:: Char , y:: Char ) = Int (x) - Int (y)
107- - (x:: Char , y:: Integer ) = Char (Int32 (x) - Int32 (y))
108- + (x:: Char , y:: Integer ) = Char (Int32 (x) + Int32 (y))
109- + (x:: Integer , y:: Char ) = y + x
# Fallbacks for `AbstractChar` subtypes that do not define their own methods:
# ordering and equality are decided by converting both operands to `Char`.
isless(x::AbstractChar, y::AbstractChar) = isless(Char(x), Char(y))
==(x::AbstractChar, y::AbstractChar) = Char(x) == Char(y)
# Hash through `Char` as well, so that characters the `==` fallback treats as
# equal (including a custom character compared with a `Char`) also hash
# equally; hash-based containers rely on that agreement.
hash(x::AbstractChar, h::UInt) = hash(Char(x), h)
widen(::Type{T}) where {T<:AbstractChar} = T
154+
# Character arithmetic. Subtracting two characters gives an `Int`; adding or
# subtracting an integer goes through `Int32` and is converted back to the
# character's own type `T`.
function -(x::AbstractChar, y::AbstractChar)
    return Int(x) - Int(y)
end
function -(x::T, y::Integer) where {T<:AbstractChar}
    shifted = Int32(x) - Int32(y)
    return T(shifted)
end
function +(x::T, y::Integer) where {T<:AbstractChar}
    shifted = Int32(x) + Int32(y)
    return T(shifted)
end
function +(x::Integer, y::AbstractChar)
    return y + x
end
110159
111- print (io:: IO , c:: Char ) = (write (io, c); nothing )
# Print a character by writing it to `io`; always returns `nothing`.
function print(io::IO, c::AbstractChar)
    write(io, c)
    return nothing
end
112161
# Byte values of the characters '0'-'9' followed by 'a'-'z' (36 entries).
const hex_chars = UInt8['0':'9'; 'a':'z']
114163
115- function show (io:: IO , c:: Char )
164+ function show (io:: IO , c:: AbstractChar )
116165 if c <= ' \\ '
117166 b = c == ' \0 ' ? 0x30 :
118167 c == ' \a ' ? 0x61 :
@@ -154,14 +203,14 @@ function show(io::IO, c::Char)
154203 return
155204end
156205
157- function show (io:: IO , :: MIME"text/plain" , c:: Char )
206+ function show (io:: IO , :: MIME"text/plain" , c:: T ) where {T <: AbstractChar }
158207 show (io, c)
159208 if ! ismalformed (c)
160209 print (io, " : " )
161210 if isoverlong (c)
162211 print (io, " [overlong] " )
163212 u = decode_overlong (c)
164- c = Char (u)
213+ c = T (u)
165214 else
166215 u = UInt32 (c)
167216 end
0 commit comments