@@ -240,23 +240,6 @@ def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: i
         return False
 
     def write_tensors(self):
-        # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
-        def np_fp32_to_bf16(n: np.ndarray):
-            # force nan to quiet
-            n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
-            # flush subnormals to zero
-            n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
-            # round to nearest even
-            n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
-            return n.astype(np.int16)
-
-        # Doing this row-wise is much, much faster than element-wise, hence the signature
-        v_fp32_to_bf16 = np.vectorize(np_fp32_to_bf16, otypes=[np.int16], signature="(n)->(n)")
-        if self.lazy:
-            # TODO: find a way to implicitly wrap np.vectorize functions
-            # NOTE: the type is changed to reflect otypes passed to np.vectorize above
-            v_fp32_to_bf16 = gguf.LazyNumpyTensor._wrap_fn(v_fp32_to_bf16, meta_noop=np.int16)
-
         max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
 
         for name, data_torch in self.get_tensors():
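Note: the block removed above is the hand-rolled fp32 -> bf16 conversion that now lives in gguf-py (see the gguf.quantize_bf16 call further down). Below is a minimal standalone sketch of the same bit-level rounding, kept only for reference; the uint32/uint16 views are a choice made here for clarity and are not part of the patch.

```python
import numpy as np

def fp32_to_bf16_bits(x: np.ndarray) -> np.ndarray:
    # operate on the raw fp32 bit patterns
    n = np.ascontiguousarray(x, dtype=np.float32).view(np.uint32)
    # force NaNs to quiet NaNs
    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
    # flush subnormals to signed zero
    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
    # round to nearest even, then keep the high 16 bits
    n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
    return n.astype(np.uint16)

print(fp32_to_bf16_bits(np.array([1.0, -2.5], dtype=np.float32)))  # [16256 49184] == [0x3f80 0xc020]
```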
@@ -309,27 +292,31 @@ def np_fp32_to_bf16(n: np.ndarray):
                 ))
 
                 if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
-                    if self.ftype == gguf.LlamaFileType.MOSTLY_F16:
+                    if self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
+                        data = gguf.quantize_bf16(data)
+                        assert data.dtype == np.int16
+                        data_qtype = gguf.GGMLQuantizationType.BF16
+
+                    elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0 and gguf.can_quantize_to_q8_0(data):
+                        data = gguf.quantize_q8_0(data)
+                        assert data.dtype == np.uint8
+                        data_qtype = gguf.GGMLQuantizationType.Q8_0
+
+                    else:  # default to float16 for quantized tensors
                         if data_dtype != np.float16:
                             data = data.astype(np.float16)
                         data_qtype = gguf.GGMLQuantizationType.F16
 
-                    elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
-                        if data_dtype != np.float32:
-                            data = data.astype(np.float32)
-                        data = v_fp32_to_bf16(data.view(np.int32))
-                        assert data.dtype == np.int16
-                        data_qtype = gguf.GGMLQuantizationType.BF16
-
-                else:  # by default, convert to float32
+                if data_qtype is None:  # by default, convert to float32
                     if data_dtype != np.float32:
                         data = data.astype(np.float32)
                     data_qtype = gguf.GGMLQuantizationType.F32
 
-                assert data_qtype is not None
-
+                block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
                 # reverse shape to make it similar to the internal ggml dimension order
-                shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"
+                shape_str = f"""{{{', '.join(str(n) for n in reversed(
+                    (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
+                )}}}"""
 
                 # n_dims is implicit in the shape
                 logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
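The reworked shape_str logs the logical element count rather than the packed byte count of a quantized row: the last dimension is rescaled by the quant type's block and byte sizes from gguf.GGML_QUANT_SIZES. A small worked sketch of that arithmetic, assuming ggml's Q8_0 layout of 32 weights per 34-byte block (2-byte fp16 scale + 32 int8 quants); the array here is a stand-in, not the converter's data path. For unquantized types (F32/F16) block_size is 1 and type_size equals the itemsize, so the rescaling is a no-op.

```python
import numpy as np

block_size, type_size = 32, 34  # assumed Q8_0: 32 weights packed into a 34-byte block

# a 2 x 4096 fp32 tensor quantized row-wise: 4096 / 32 = 128 blocks/row -> 128 * 34 = 4352 bytes/row
packed = np.zeros((2, 4352), dtype=np.uint8)

logical_last = packed.shape[-1] * packed.dtype.itemsize // type_size * block_size
logical_shape = (*packed.shape[:-1], logical_last)

print(logical_shape)                  # (2, 4096)
print(list(reversed(logical_shape)))  # [4096, 2]  (ggml's reversed dimension order, as in shape_str)
```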
@@ -2415,25 +2402,15 @@ class LazyTorchTensor(gguf.LazyBase):
     def numpy(self) -> gguf.LazyNumpyTensor:
         dtype = self._dtype_map[self.dtype]
         return gguf.LazyNumpyTensor(
-            meta=np.lib.stride_tricks.as_strided(np.zeros(1, dtype), self.shape, (0 for _ in self.shape)),
+            meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape),
             lazy=self._lazy,
             args=(self,),
             func=(lambda s: s[0].numpy())
         )
 
     @classmethod
-    def eager_to_meta(cls, t: Tensor) -> Tensor:
-        if t.is_meta:
-            return t
-        return t.detach().to("meta")
-
-    @classmethod
-    def meta_with_dtype(cls, m: Tensor, dtype: torch.dtype) -> Tensor:
-        m = m.detach()
-        if not m.is_meta:
-            m = m.to("meta")
-        m.dtype = dtype
-        return m
+    def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: torch.Size) -> Tensor:
+        return torch.empty(size=shape, dtype=dtype, device="meta")
 
     @classmethod
     def __torch_function__(cls, func, types, args=(), kwargs=None):
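meta_with_dtype_and_shape replaces the strided-zeros and dtype-patching tricks with a tensor on PyTorch's meta device, which records only shape and dtype and allocates no storage. A quick illustration, not part of the patch:

```python
import torch

m = torch.empty(size=(4096, 4096), dtype=torch.float16, device="meta")
print(m.shape, m.dtype, m.is_meta)  # torch.Size([4096, 4096]) torch.float16 True
# there is no storage behind it, so reading values (e.g. m.numpy(), m.tolist()) is not possible,
# which is exactly what a lazy placeholder needs
```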
@@ -2464,8 +2441,8 @@ def parse_args() -> argparse.Namespace:
         help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
     )
     parser.add_argument(
-        "--outtype", type=str, choices=["f32", "f16", "bf16", "auto"], default="f16",
-        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
+        "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "auto"], default="f16",
+        help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
     )
     parser.add_argument(
         "--bigendian", action="store_true",
@@ -2523,6 +2500,7 @@ def main() -> None:
         "f32": gguf.LlamaFileType.ALL_F32,
         "f16": gguf.LlamaFileType.MOSTLY_F16,
         "bf16": gguf.LlamaFileType.MOSTLY_BF16,
+        "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
         "auto": gguf.LlamaFileType.GUESSED,
     }
 
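End-to-end, the new q8_0 choice flows from --outtype through ftype_map into write_tensors. A hedged sketch of that lookup, mirroring only the dict shown above; gguf is the gguf-py package this script already uses, and outtype stands in for the parsed argument value.

```python
import gguf

ftype_map: dict[str, gguf.LlamaFileType] = {
    "f32": gguf.LlamaFileType.ALL_F32,
    "f16": gguf.LlamaFileType.MOSTLY_F16,
    "bf16": gguf.LlamaFileType.MOSTLY_BF16,
    "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
    "auto": gguf.LlamaFileType.GUESSED,
}

outtype = "q8_0"  # stand-in for args.outtype
ftype = ftype_map[outtype]
assert ftype == gguf.LlamaFileType.MOSTLY_Q8_0
# write_tensors() then takes the MOSTLY_Q8_0 branch added above and calls gguf.quantize_q8_0()
# on every tensor that gguf.can_quantize_to_q8_0() accepts; the rest fall back to F16/F32.
```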