@@ -46,22 +46,56 @@ function Normed{T,f}(x::Normed{T2}) where {T <: Unsigned,T2 <: Unsigned,f}
4646end
4747N0f16 (x:: N0f8 ) = reinterpret (N0f16, convert (UInt16, 0x0101 * reinterpret (x)))
4848
49- (:: Type{U} )(x:: Real ) where {U <: Normed } = _convert (U, rawtype (U), x)
50-
51- function _convert (:: Type{U} , :: Type{T} , x) where {U <: Normed ,T}
52- y = round (widen1 (rawone (U))* x)
53- (0 <= y) & (y <= typemax (T)) || throw_converterror (U, x)
54- U (_unsafe_trunc (T, y), 0 )
49+ (:: Type{U} )(x:: Real ) where {U <: Normed } = _convert (U, x)
50+
51+ function _convert (:: Type{U} , x) where {T, f, U <: Normed{T,f} }
52+ if T == UInt128 # for UInt128, we can't widen
53+ # the upper limit is not exact
54+ (0 <= x) & (x <= (typemax (T)/ rawone (U))) || throw_converterror (U, x)
55+ y = round (rawone (U)* x)
56+ else
57+ y = round (widen1 (rawone (U))* x)
58+ (0 <= y) & (y <= typemax (T)) || throw_converterror (U, x)
59+ end
60+ reinterpret (U, _unsafe_trunc (T, y))
5561end
5662# Prevent overflow (https://discourse.julialang.org/t/saving-greater-than-8-bit-images/6057)
57- _convert (:: Type{U} , :: Type{T} , x:: Float16 ) where {U <: Normed ,T} =
58- _convert (U, T, Float32 (x))
59- _convert (:: Type{U} , :: Type{UInt128} , x:: Float16 ) where {U <: Normed } =
60- _convert (U, UInt128, Float32 (x))
61- function _convert (:: Type{U} , :: Type{UInt128} , x) where {U <: Normed }
62- y = round (rawone (U)* x) # for UInt128, we can't widen
63- (0 <= y) & (y <= typemax (UInt128)) & (x <= Float64 (typemax (U))) || throw_converterror (U, x)
64- U (_unsafe_trunc (UInt128, y), 0 )
63+ function _convert (:: Type{U} , x:: Float16 ) where {T, f, U <: Normed{T,f} }
64+ if Float16 (typemax (T)/ rawone (U)) > Float32 (typemax (T)/ rawone (U))
65+ x == Float16 (typemax (T)/ rawone (U)) && return typemax (U)
66+ end
67+ return _convert (U, Float32 (x))
68+ end
69+ function _convert (:: Type{U} , x:: Tf ) where {T, f, U <: Normed{T,f} , Tf <: Union{Float32, Float64} }
70+ if T == UInt128 && f == 53
71+ 0 <= x <= Tf (3.777893186295717e22 ) || throw_converterror (U, x)
72+ else
73+ 0 <= x <= Tf ((typemax (T)- rawone (U))/ rawone (U)+ 1 ) || throw_converterror (U, x)
74+ end
75+
76+ significand_bits = Tf == Float64 ? 52 : 23
77+ if f <= (significand_bits + 1 ) && sizeof (T) * 8 < significand_bits
78+ return reinterpret (U, unsafe_trunc (T, round (rawone (U) * x)))
79+ end
80+ # cf. the implementation of `frexp`
81+ Tw = f < sizeof (T) * 8 ? T : widen1 (T)
82+ bits = sizeof (Tw) * 8 - 1
83+ xu = reinterpret (Tf == Float64 ? UInt64 : UInt32, x)
84+ k = Int (xu >> significand_bits)
85+ k == 0 && return zero (U) # neglect subnormal numbers
86+ significand = xu | (one (xu) << significand_bits)
87+ yh = unsafe_trunc (Tw, significand) << (bits - significand_bits)
88+ exponent_bias = Tf == Float64 ? 1023 : 127
89+ ex = exponent_bias - k + bits - f
90+ yi = bits >= f ? yh - (yh >> f) : yh
91+ if ex <= 0
92+ ex == 0 && return reinterpret (U, unsafe_trunc (T, yi))
93+ ex != - 1 || signbit (signed (yi)) && return typemax (U)
94+ return reinterpret (U, unsafe_trunc (T, yi + yi))
95+ end
96+ ex > bits && return reinterpret (U, ex == bits + 1 ? one (T) : zero (T))
97+ yi += one (Tw)<< ((ex - 1 ) & bits) # RoundNearestTiesUp
98+ return reinterpret (U, unsafe_trunc (T, yi >> (ex & bits)))
6599end
66100
67101rem (x:: T , :: Type{T} ) where {T <: Normed } = x
0 commit comments