@@ -63,6 +63,8 @@ for t1 in (Float32,Float64)
6363 end
6464 end
6565end
# Float16 -> integer conversion: widen to Float32 and let the Float32 method
# perform the actual conversion to `T`.
function convert{T<:Integer}(::Type{T}, x::Float16)
    widened = Float32(x)
    return convert(T, widened)
end
67+
6668
6769promote_rule (:: Type{Float64} , :: Type{UInt128} ) = Float64
6870promote_rule (:: Type{Float64} , :: Type{Int128} ) = Float64
@@ -129,13 +131,110 @@ function convert(::Type{Float32}, x::Int128)
129131 reinterpret (Float32, s | d + y)
130132end
131133
# Convert a Float32 to Float16, rounding to nearest with ties to even.
#
# Ordinary inputs use the table-driven scheme from "Fast Half Float
# Conversion": `basetable` and `shifttable` are indexed by the 9-bit
# sign+exponent field of the Float32 (plus one, for 1-based indexing) and give
# the Float16 sign/exponent bits and the right shift that maps the 23-bit
# significand onto the 10-bit Float16 significand.
#
# Two cases are handled before the table lookup because the tables cannot
# express them:
#   * NaN: shifting the payload can yield an all-zero significand (turning
#     the NaN into Inf), and the rounding step can carry out of an all-ones
#     payload (turning it into a zero or overflowing the UInt16).
#   * Unbiased exponent -25 with a nonzero significand: the value lies
#     strictly between half of the smallest Float16 subnormal and that
#     subnormal, so it must round up to it instead of flushing to zero.
function convert(::Type{Float16}, val::Float32)
    bits = reinterpret(UInt32, val)
    sgn = UInt16((bits >> 16) & 0x8000)   # sign moved to Float16 bit 15
    ex = (bits >> 23) & 0xff              # biased Float32 exponent
    f = bits & 0x007fffff                 # 23-bit significand field

    if f != 0
        if ex == 0xff
            # NaN: keep the sign and the top 10 payload bits, and force the
            # quiet bit so the result can never collapse to Inf.
            return reinterpret(Float16, sgn | 0x7e00 | UInt16(f >> 13))
        elseif ex == 0x66
            # 2^-25 < |val| < 2^-24: nearest Float16 is the smallest subnormal.
            # (Exactly 2^-25 is a tie and correctly goes to zero via the table.)
            return reinterpret(Float16, sgn | 0x0001)
        end
    end

    i = (bits >> 23) & 0x1ff + 1
    sh = shifttable[i]
    h::UInt16 = basetable[i] + (f >> sh)
    # Round to nearest: inspect the first discarded bit, then break ties
    # toward an even result unless any lower discarded bit is set.
    nextbit = (f >> (sh-1)) & 1
    if nextbit != 0
        if h&1 == 1 || (f & ((1<<(sh-1))-1)) != 0
            h += 1
        end
    end
    return reinterpret(Float16, h)
end
152+
# Convert a Float16 to Float32 exactly by re-encoding its bit fields.
#
# The Float16 layout is 1 sign bit, 5 exponent bits (bias 15) and 10
# significand bits; Float32 is 1 / 8 (bias 127) / 23. Every Float16 value is
# representable in Float32, so no rounding is involved. NaN inputs keep their
# sign and payload (shifted into the top of the Float32 significand) and are
# always returned with the quiet bit set.
function convert(::Type{Float32}, val::Float16)
    ival::UInt32 = reinterpret(UInt16, val)
    sgn::UInt32 = (ival & 0x8000) << 16    # sign already in Float32 bit 31
    ex::UInt32 = (ival & 0x7c00) >> 10
    sig::UInt32 = ival & 0x03ff
    local ret::UInt32

    if ex == 0
        if sig == 0
            # Signed zero.
            ret = sgn
        else
            # Subnormal: locate the leading set bit of the significand.
            # n_bit counts how far it sits below the implicit-one position.
            n_bit = 1
            bit = 0x0200
            while (bit & sig) == 0
                n_bit += 1
                bit >>= 1
            end
            # Normalize: the leading bit becomes the implicit one, the rest
            # moves up to the top of the Float32 significand field.
            ex = (-14 - n_bit + 127) << 23
            sig = ((sig & (~bit)) << n_bit) << (23 - 10)
            ret = sgn | ex | sig
        end
    elseif ex == 0x1f
        if sig == 0
            # Infinity of either sign.
            ret = sgn | 0x7f800000
        else
            # NaN: carry the payload across instead of discarding it.
            ret = sgn | 0x7fc00000 | (sig << (23 - 10))
        end
    else
        # Normal number: rebias the exponent and widen the significand.
        ex = (ex - 15 + 127) << 23
        sig = sig << (23 - 10)
        ret = sgn | ex | sig
    end
    return reinterpret(Float32, ret)
end
194+
195+ # Float32 -> Float16 algorithm from:
196+ # "Fast Half Float Conversion" by Jeroen van der Zijp
197+ # ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
198+
# Lookup tables for the Float32 -> Float16 conversion. Both are indexed by the
# 9-bit sign+exponent field of the Float32, plus one for 1-based indexing:
# slots 1:256 are for a clear sign bit, slots 257:512 for a set sign bit.
#   basetable  - Float16 sign and exponent bits to start from
#   shifttable - right shift applied to the 23-bit Float32 significand
const basetable = Array{UInt16}(512)
const shifttable = Array{UInt8}(512)

for i = 0:255
    unbiased = i - 127
    posidx = i + 1
    negidx = i + 0x100 + 1
    if unbiased < -24
        # Very small numbers map to zero
        hbits = 0x0000
        shiftamt = 24
    elseif unbiased < -14
        # Small numbers map to denorms
        hbits = 0x0400 >> (-unbiased - 14)
        shiftamt = -unbiased - 1
    elseif unbiased <= 15
        # Normal numbers just lose precision
        hbits = (unbiased + 15) << 10
        shiftamt = 13
    elseif unbiased < 128
        # Large numbers map to Infinity
        hbits = 0x7C00
        shiftamt = 24
    else
        # Infinity and NaN's stay Infinity and NaN's
        hbits = 0x7C00
        shiftamt = 13
    end
    basetable[posidx] = hbits
    basetable[negidx] = hbits | 0x8000
    shifttable[posidx] = shiftamt
    shifttable[negidx] = shiftamt
end
132231# convert(::Type{Float16}, x::Float32) = box(Float16,fptrunc(Float16,x))
133- convert (:: Type{Float16} , x:: Float64 ) = convert (Float16, convert (Float32,x))
134232convert (:: Type{Float32} , x:: Float64 ) = box (Float32,fptrunc (Float32,unbox (Float64,x)))
# Float64 -> Float16 goes through Float32.
# NOTE(review): this rounds twice (Float64 -> Float32 -> Float16); confirm
# whether double rounding is acceptable for callers.
function convert(::Type{Float16}, x::Float64)
    narrowed = convert(Float32, x)
    return convert(Float16, narrowed)
end
135234
136235# convert(::Type{Float32}, x::Float16) = box(Float32,fpext(Float32,x))
137- convert (:: Type{Float64} , x:: Float16 ) = convert (Float64, convert (Float32,x))
138236convert (:: Type{Float64} , x:: Float32 ) = box (Float64,fpext (Float64,unbox (Float32,x)))
# Float16 -> Float64 goes through Float32.
function convert(::Type{Float64}, x::Float16)
    widened = convert(Float32, x)
    return convert(Float64, widened)
end
139238
140239convert (:: Type{AbstractFloat} , x:: Bool ) = convert (Float64, x)
141240convert (:: Type{AbstractFloat} , x:: Int8 ) = convert (Float64, x)
@@ -204,23 +303,31 @@ trunc(::Type{Unsigned}, x::Float32) = trunc(UInt,x)
204303trunc (:: Type{Unsigned} , x:: Float64 ) = trunc (UInt,x)
205304trunc (:: Type{Integer} , x:: Float32 ) = trunc (Int,x)
206305trunc (:: Type{Integer} , x:: Float64 ) = trunc (Int,x)
# Integer-valued `trunc` for Float16: widen to Float32 and defer to that method.
function trunc{T<:Integer}(::Type{T}, x::Float16)
    return trunc(T, Float32(x))
end
207307
208308# fallbacks
209309floor {T<:Integer} (:: Type{T} , x:: AbstractFloat ) = trunc (T,floor (x))
# Integer-valued `floor` for Float16: widen to Float32 and defer to that method.
function floor{T<:Integer}(::Type{T}, x::Float16)
    return floor(T, Float32(x))
end
210311ceil { T<:Integer} (:: Type{T} , x:: AbstractFloat ) = trunc (T,ceil (x))
# Integer-valued `ceil` for Float16: widen to Float32 and defer to that method.
function ceil{T<:Integer}(::Type{T}, x::Float16)
    return ceil(T, Float32(x))
end
211313round {T<:Integer} (:: Type{T} , x:: AbstractFloat ) = trunc (T,round (x))
# Integer-valued `round` for Float16: widen to Float32 and defer to that method.
function round{T<:Integer}(::Type{T}, x::Float16)
    return round(T, Float32(x))
end
212315
213316trunc (x:: Float64 ) = box (Float64,trunc_llvm (unbox (Float64,x)))
214317trunc (x:: Float32 ) = box (Float32,trunc_llvm (unbox (Float32,x)))
# `trunc` on Float16 is computed in Float32 and converted back.
function trunc(x::Float16)
    wide = trunc(Float32(x))
    return Float16(wide)
end
215319
216320floor (x:: Float64 ) = box (Float64,floor_llvm (unbox (Float64,x)))
217321floor (x:: Float32 ) = box (Float32,floor_llvm (unbox (Float32,x)))
# `floor` on Float16 is computed in Float32 and converted back.
function floor(x::Float16)
    wide = floor(Float32(x))
    return Float16(wide)
end
218323
219324ceil (x:: Float64 ) = box (Float64,ceil_llvm (unbox (Float64,x)))
220325ceil (x:: Float32 ) = box (Float32,ceil_llvm (unbox (Float32,x)))
# `ceil` on Float16 is computed in Float32 and converted back.
function ceil(x::Float16)
    wide = ceil(Float32(x))
    return Float16(wide)
end
221327
222328round (x:: Float64 ) = box (Float64,rint_llvm (unbox (Float64,x)))
223329round (x:: Float32 ) = box (Float32,rint_llvm (unbox (Float32,x)))
# `round` on Float16 is computed in Float32 and converted back.
function round(x::Float16)
    wide = round(Float32(x))
    return Float16(wide)
end
224331
225332# # floating point promotions ##
226333promote_rule (:: Type{Float32} , :: Type{Float16} ) = Float32
@@ -233,9 +340,13 @@ widen(::Type{Float32}) = Float64
233340_default_type (T:: Union{Type{Real},Type{AbstractFloat}} ) = Float64
234341
235342# # floating point arithmetic ##
236- - (x:: Float32 ) = box (Float32,neg_float (unbox (Float32,x)))
237343- (x:: Float64 ) = box (Float64,neg_float (unbox (Float64,x)))
344+ - (x:: Float32 ) = box (Float32,neg_float (unbox (Float32,x)))
# Negate a Float16 by flipping its top bit (0x8000) in the raw representation.
function -(x::Float16)
    bits = reinterpret(UInt16, x)
    return reinterpret(Float16, bits $ 0x8000)
end
238346
# Binary arithmetic on two Float16 values: evaluate the operator on the
# Float32-widened operands, then convert the result back to Float16.
for op in (:+, :-, :*, :/, :\, :^)
    @eval function ($op)(a::Float16, b::Float16)
        wide = ($op)(Float32(a), Float32(b))
        return Float16(wide)
    end
end
239350+ (x:: Float32 , y:: Float32 ) = box (Float32,add_float (unbox (Float32,x),unbox (Float32,y)))
240351+ (x:: Float64 , y:: Float64 ) = box (Float64,add_float (unbox (Float64,x),unbox (Float64,y)))
241352- (x:: Float32 , y:: Float32 ) = box (Float32,sub_float (unbox (Float32,x),unbox (Float32,y)))
@@ -247,10 +358,20 @@ _default_type(T::Union{Type{Real},Type{AbstractFloat}}) = Float64
247358
248359muladd (x:: Float32 , y:: Float32 , z:: Float32 ) = box (Float32,muladd_float (unbox (Float32,x),unbox (Float32,y),unbox (Float32,z)))
249360muladd (x:: Float64 , y:: Float64 , z:: Float64 ) = box (Float64,muladd_float (unbox (Float64,x),unbox (Float64,y),unbox (Float64,z)))
# `muladd` for Float16: evaluate on Float32-widened operands, convert back.
muladd(a::Float16, b::Float16, c::Float16) =
    Float16(muladd(Float32(a), Float32(b), Float32(c)))
250364
251365# TODO : faster floating point div?
252366# TODO : faster floating point fld?
253367# TODO : faster floating point mod?
368+
# Division-family functions on two Float16 values: evaluate on the
# Float32-widened operands, then convert the result back to Float16.
for func in (:div, :fld, :cld, :rem, :mod)
    @eval ($func)(a::Float16, b::Float16) = Float16(($func)(Float32(a), Float32(b)))
end
374+
254375rem (x:: Float32 , y:: Float32 ) = box (Float32,rem_float (unbox (Float32,x),unbox (Float32,y)))
255376rem (x:: Float64 , y:: Float64 ) = box (Float64,rem_float (unbox (Float64,x),unbox (Float64,y)))
256377
@@ -268,6 +389,17 @@ function mod{T<:AbstractFloat}(x::T, y::T)
268389end
269390
270391# # floating point comparisons ##
# Equality of two Float16 values, decided on their raw bit patterns.
function ==(x::Float16, y::Float16)
    ix = reinterpret(UInt16, x)
    iy = reinterpret(UInt16, y)
    # OR the two patterns together and drop the sign bit.
    merged = (ix | iy) & 0x7fff
    # Above 0x7c00 means either an operand is NaN, or the two patterns differ
    # and are not both zero; the answer is `false` in both situations.
    merged > 0x7c00 && return false
    # Both operands are zeros (of any sign), which compare equal.
    merged == 0x0000 && return true
    # Otherwise the values are equal exactly when the bit patterns match.
    return ix == iy
end
271403== (x:: Float32 , y:: Float32 ) = eq_float (unbox (Float32,x),unbox (Float32,y))
272404== (x:: Float64 , y:: Float64 ) = eq_float (unbox (Float64,x),unbox (Float64,y))
273405!= (x:: Float32 , y:: Float32 ) = ne_float (unbox (Float32,x),unbox (Float32,y))
@@ -281,6 +413,9 @@ isequal(x::Float32, y::Float32) = fpiseq(unbox(Float32,x),unbox(Float32,y))
281413isequal (x:: Float64 , y:: Float64 ) = fpiseq (unbox (Float64,x),unbox (Float64,y))
282414isless ( x:: Float32 , y:: Float32 ) = fpislt (unbox (Float32,x),unbox (Float32,y))
283415isless ( x:: Float64 , y:: Float64 ) = fpislt (unbox (Float64,x),unbox (Float64,y))
# Ordering predicates on two Float16 values defer to the Float32 methods
# applied to the widened operands.
for op in (:<, :<=, :isless)
    @eval function ($op)(a::Float16, b::Float16)
        return ($op)(Float32(a), Float32(b))
    end
end
284419
285420function cmp (x:: AbstractFloat , y:: AbstractFloat )
286421 (isnan (x) || isnan (y)) && throw (DomainError ())
@@ -349,18 +484,22 @@ end
349484<= (x:: Float32 , y:: Union{Int32,UInt32} ) = Float64 (x)<= Float64 (y)
350485<= (x:: Union{Int32,UInt32} , y:: Float32 ) = Float64 (x)<= Float64 (y)
351486
352- abs (x:: Float64 ) = box (Float64,abs_float (unbox (Float64,x)))
487+
# Absolute value of a Float16: clear the top bit (0x8000) of the raw bits.
function abs(x::Float16)
    magnitude = reinterpret(UInt16, x) & 0x7fff
    return reinterpret(Float16, magnitude)
end
353489abs (x:: Float32 ) = box (Float32,abs_float (unbox (Float32,x)))
490+ abs (x:: Float64 ) = box (Float64,abs_float (unbox (Float64,x)))
354491
355492"""
356493 isnan(f) -> Bool
357494
358495Test whether a floating point number is not a number (NaN).
359496"""
360497isnan (x:: AbstractFloat ) = x != x
# A Float16 is NaN when its bits, with the sign bit masked off, exceed 0x7c00.
function isnan(x::Float16)
    magnitude = reinterpret(UInt16, x) & 0x7fff
    return magnitude > 0x7c00
end
361499isnan (x:: Real ) = false
362500
363501isfinite (x:: AbstractFloat ) = x - x == 0
# A Float16 is finite unless all bits selected by the mask 0x7c00 are set.
function isfinite(x::Float16)
    expfield = reinterpret(UInt16, x) & 0x7c00
    return expfield != 0x7c00
end
364503isfinite (x:: Real ) = decompose (x)[3 ] != 0
365504isfinite (x:: Integer ) = true
366505
@@ -526,6 +665,12 @@ exponent_one(::Type{Float32}) = 0x3f80_0000
526665exponent_half (:: Type{Float32} ) = 0x3f00_0000
527666significand_mask (:: Type{Float32} ) = 0x007f_ffff
528667
# Bit-field constants for Float16, written in binary so the field boundaries
# are visible. Each 16-digit literal is a UInt16, same as the hex spelling.
sign_mask(::Type{Float16}) =        0b1000_0000_0000_0000
exponent_mask(::Type{Float16}) =    0b0111_1100_0000_0000
exponent_one(::Type{Float16}) =     0b0011_1100_0000_0000
exponent_half(::Type{Float16}) =    0b0011_1000_0000_0000
significand_mask(::Type{Float16}) = 0b0000_0011_1111_1111
673+
529674@pure significand_bits {T<:AbstractFloat} (:: Type{T} ) = trailing_ones (significand_mask (T))
530675@pure exponent_bits {T<:AbstractFloat} (:: Type{T} ) = sizeof (T)* 8 - significand_bits (T) - 1
531676@pure exponent_bias {T<:AbstractFloat} (:: Type{T} ) = Int (exponent_one (T) >> significand_bits (T))
0 commit comments