
Commit b195f11
(wip) gemlite integration and llama batch size > 1
Summary: compile isn't working with gemlite; the kernel wrapper probably needs to be rewritten in a more compatible way. Added batch size > 1 to the llama model; see benchmark_results.txt for numbers.

Squashed commit history:
- new gemlite integration using pip install
- tests ran
- fixing gemlite to do int4 matmul instead of fp16 x fp16
- running tests
- more testing
- AQT integration (wip)
- wip
- testing on gemlite a100_int8_tuning branch
- gemlite subclass testing, bitpacking 8 bits
- bug fixing
- hicham fixes
- new benchmarks
- testing gemlite 8 bit
- WIP
1 parent 039cef4 commit b195f11

File tree

9 files changed: +900 -149 lines

torchao/_models/llama/benchmark_results.txt

Lines changed: 280 additions & 51 deletions
Large diffs are not rendered by default.

torchao/_models/llama/benchmarks.sh

Lines changed: 185 additions & 91 deletions
Large diffs are not rendered by default.

torchao/_models/llama/generate.py

Lines changed: 111 additions & 1 deletion
@@ -19,6 +19,7 @@
 import torchao
 from torchao.quantization.quant_primitives import MappingType
 from torchao.utils import get_model_size_in_bytes, TORCH_VERSION_AT_LEAST_2_5
+from torchao.utils import unwrap_tensor_subclass
 
 torch.sparse.SparseSemiStructuredTensor._FORCE_CUTLASS = False
 
@@ -171,7 +172,7 @@ def decode_n_tokens(
             )
             next_token, next_prob = next_token.clone(), next_prob.clone()
             input_pos += 1
-            new_tokens.append(next_token)
+            new_tokens.append(next_token.clone())
             callback(new_tokens[-1])
             new_probs.append(next_prob)
             cur_token = next_token
@@ -368,6 +369,7 @@ def ffn_or_attn_only(mod, fqn):
             int8_weight_only,
             quantize_,
             uintx_weight_only,
+            gemlite_uintx_weight_only,
         )
 
         from torchao.quantization.granularity import PerRow, PerTensor
@@ -377,6 +379,113 @@ def ffn_or_attn_only(mod, fqn):
             from torchao.prototype.spinquant import apply_spinquant
 
             apply_spinquant(model)
+        if "gemsub" in quantization:
+            import os, pwd
+            import gemlite
+            from gemlite.core import GemLiteLinearTriton, set_autotune
+            _quant_args = quantization.split("-")
+            bit_width = int(_quant_args[-2])
+            group_size = None if _quant_args[-1] == 'None' else int(_quant_args[-1])  # TODO is 'None' working?
+            try:
+                packing_bitwidth = int(_quant_args[-3])
+            except:
+                packing_bitwidth = 8
+
+            quantize_(model, gemlite_uintx_weight_only(group_size, bit_width, packing_bitwidth))
+
+            # try to load gemlite kernel config
+            try:
+                GemLiteLinearTriton.load_config(f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json")
+            except:
+                pass
+            print("running calibration")
+            generate(
+                model,
+                encode_tokens(tokenizer, prompt, bos=True, device=device),
+                max_new_tokens,
+                batch_size,
+                interactive=False,
+                temperature=temperature,
+                top_k=top_k,
+            )
+
+            GemLiteLinearTriton.cache_config(f"/tmp/{pwd.getpwuid(os.getuid()).pw_gecos}_gemlite.json")
+        if "gemlite" in quantization:
+            import gemlite
+            import hqq
+            from gemlite.core import GemLiteLinearTriton, DType, set_autotune
+            from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter, _is_linear
+            from hqq.core.quantize import HQQLinear, BaseQuantizeConfig
+            _quant_args = quantization.split("-")
+
+            W_nbits = int(_quant_args[-2])
+            group_size = None if _quant_args[-1] == 'None' else int(_quant_args[-1])  # None is channel-wise
+
+            assert W_nbits in [1, 2, 4, 8], f"W_nbits needs to be in [1, 2, 4, 8], got {W_nbits} for gemlite-<W_nbits>-<group_size>"
+            assert group_size in [32, 64, 128, 256, 512, 1024, None], f"group_size needs to be in [32, 64, 128, 256, 512, 1024, None], got {group_size} for gemlite-<W_nbits>-<group_size>"
+            assert precision == torch.float16, f"gemlite only supports float16 precision, got {precision}"
+
+            quant_config = BaseQuantizeConfig(nbits=W_nbits, group_size=group_size, quant_zero=False, quant_scale=False, axis=1)
+            quant_config['weight_quant_params']['optimize'] = False
+
+            set_autotune({'GEMV_REVSPLITK': True, 'GEMV': True, 'GEMM_SPLITK': True, 'GEMM': True}, exhaustive=False, use_cuda_graph=False)
+
+            def replace_fn(mod):
+                if not isinstance(mod, torch.nn.Linear):
+                    return mod
+
+                in_features = mod.in_features
+                out_features = mod.out_features
+
+                compute_dtype = mod.weight.dtype
+                input_dtype, output_dtype = DType.FP16, DType.FP16
+
+                hqq_layer = HQQLinear(mod, quant_config=quant_config, compute_dtype=compute_dtype, device=device, del_orig=False)
+                if hqq_layer.meta["group_size"] is None:
+                    hqq_layer.meta["group_size"] = hqq_layer.in_features
+
+                gemlite_linear = GemLiteLinearTriton(
+                    hqq_layer.meta["nbits"],
+                    group_size=hqq_layer.meta["group_size"],
+                    in_features=hqq_layer.in_features,
+                    out_features=hqq_layer.out_features,
+                    input_dtype=DType.FP16,
+                    output_dtype=DType.FP16,
+                )
+                orig_shape = hqq_layer.meta['shape']
+                W_q = hqq_layer.unpack(dtype=torch.uint8).view(orig_shape)  # expects uint8 for Wn quantization!
+                scales = hqq_layer.meta['scale'].clone()
+                zeros = hqq_layer.meta['zero'].clone()
+                bias = hqq_layer.bias.clone() if (hqq_layer.bias is not None) else None
+                gemlite_linear.pack(W_q, scales, zeros, bias=bias, fma_mode=False, packing_bitwidth=32, contiguous=False)
+
+                del hqq_layer.W_q
+                del hqq_layer.meta
+                del hqq_layer
+                torch.cuda.empty_cache()
+
+                return gemlite_linear
+
+            _replace_with_custom_fn_if_matches_filter(model, replace_fn, _is_linear)
+            import gc
+            gc.collect()
+
+            generate(
+                model,
+                encode_tokens(tokenizer, prompt, bos=True, device=device),
+                max_new_tokens,
+                batch_size,
+                interactive=False,
+                temperature=temperature,
+                top_k=top_k,
+            )
+
         if "int8wo" in quantization:
             quantize_(model, int8_weight_only())
         if "int8dq" in quantization:
@@ -1053,6 +1162,7 @@ def callback(x):
     )
 
     args = parser.parse_args()
+    print(args)
     main(
         args.prefill_size,
         args.prompt,
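For reference, a minimal sketch of how the gemsub/gemlite branches above appear to parse the --quantization flag; parse_gemlite_args is a hypothetical helper written for illustration, not code from this commit:

def parse_gemlite_args(quantization: str):
    # "gemsub-32-4-64"  -> packing_bitwidth=32, bit_width=4, group_size=64
    # "gemlite-4-None"  -> bit_width=4, group_size=None (channel-wise)
    args = quantization.split("-")
    bit_width = int(args[-2])
    group_size = None if args[-1] == "None" else int(args[-1])
    try:
        packing_bitwidth = int(args[-3])  # optional leading field
    except ValueError:  # args[-3] is the scheme name, e.g. "gemlite"
        packing_bitwidth = 8  # default used by the gemsub branch above
    return packing_bitwidth, bit_width, group_size

print(parse_gemlite_args("gemsub-32-4-64"))  # (32, 4, 64)
print(parse_gemlite_args("gemlite-4-None"))  # (8, 4, None)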

torchao/_models/llama/model.py

Lines changed: 9 additions & 3 deletions
@@ -170,7 +170,9 @@ def setup_caches(self, max_batch_size, max_seq_length, training: bool=False, kv_
         max_seq_length = find_multiple(max_seq_length, 8)
         self.max_seq_length = max_seq_length
         self.max_batch_size = max_batch_size
-        dtype = self.output.weight.dtype
+        dtype = None
+        if hasattr(self.output, "weight"):
+            dtype = self.output.weight.dtype
         # For quantized layers, dtype is encoded in scales
         if hasattr(self.output, "scales"):
             dtype = self.output.scales.dtype
@@ -243,7 +245,11 @@ def forward(self, idx: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
         x = self.tok_embeddings(idx)
 
         for i, layer in enumerate(self.layers):
-            x = layer(x, input_pos, freqs_cis, mask)
+            x_new = layer(x, input_pos, freqs_cis, mask)
+            # if torch.isnan(x_new).sum() > 0:
+            #     import fbvscode; fbvscode.set_trace()
+            x = x_new
+
         x = self.norm(x)
         logits = self.output(x)
         return logits
@@ -311,7 +317,7 @@ def forward(self, x: Tensor, freqs_cis: Tensor, mask: Optional[Tensor], input_po
 
         k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
         v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
-        if mask is not None:
+        if mask is not None:
             y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
         else:
             y = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0, is_causal=True)
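For reference, the setup_caches hunk guards against output layers that lack a plain weight attribute (e.g. after the gemlite linear replacement in generate.py). A hedged sketch of the resulting lookup order follows; resolve_cache_dtype and its default argument are illustrative additions, since the commit itself simply leaves dtype as None when neither attribute exists:

import torch

def resolve_cache_dtype(output_layer, default=torch.float16):
    # mirrors the guarded lookup in setup_caches (sketch, not the commit's code)
    dtype = None
    if hasattr(output_layer, "weight"):   # plain nn.Linear
        dtype = output_layer.weight.dtype
    if hasattr(output_layer, "scales"):   # quantized layers encode dtype in scales
        dtype = output_layer.scales.dtype
    return dtype if dtype is not None else default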

torchao/dtypes/affine_quantized_tensor.py

Lines changed: 7 additions & 1 deletion
@@ -225,6 +225,7 @@ def from_hp_to_intx(
                 else input_float.dtype
             )
             device = input_float.device
+            from torchao.dtypes.uintx import TensorCoreTiledLayout
             data, scale, zero_point, _ = choose_qparams_and_quantize_affine_hqq(
                 input_float,
                 nbits=nbits,
@@ -233,7 +234,12 @@ def from_hp_to_intx(
                 compute_dtype=compute_dtype,
                 device=device,
                 verbose=False,
-                raw_output=False,
+                raw_output=not isinstance(_layout, TensorCoreTiledLayout),
+                # raw_output=False is basically the 'convert to TensorCoreTiledLayout zero_point version' option (add scale*midpoint)
+                # note: in choose_qparams_affine, preserve_zero=False does this same thing while also controlling whether
+                # zero is preserved.
+                # TODO: uncouple preserve_zero and conversion of zero_point to the TensorCoreTiledLayout version
+                # TODO: move the conversion of zero_point out of quant_primitives and into TensorCoreTiledLayout.from_plain
             )
             data = data.to(target_dtype)
         else:
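For reference, a numeric illustration of the two zero_point conventions the inline comment contrasts. With an integer zero_point, dequantization is (q - zp) * scale; the TensorCoreTiledLayout-style convention stores a floating-point zero_point with the midpoint folded in ("add scale*midpoint"). The values below are made up; this is a sketch of the arithmetic, not the kernel code:

import torch

scale, zp_int, nbits = 0.05, 3, 4
mid = 2 ** (nbits - 1)           # midpoint: 8 for 4-bit
q = torch.tensor([0, 3, 7, 15])  # raw quantized values

# integer zero_point convention (roughly the raw_output=True path)
dq_int = (q - zp_int) * scale

# float zero_point convention with scale*midpoint folded in
zp_float = (mid - zp_int) * scale
dq_float = (q - mid) * scale + zp_float

assert torch.allclose(dq_int, dq_float)  # same dequantized values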

torchao/dtypes/affine_quantized_tensor_ops.py

Lines changed: 8 additions & 0 deletions
@@ -43,6 +43,10 @@
     _linear_bf16_act_uint4_weight_check,
     _linear_bf16_act_uint4_weight_impl,
 )
+from torchao.dtypes.uintx.gemlite_layout import (
+    _linear_fp_act_int4_weight_gemlite_check,
+    _linear_fp_act_int4_weight_gemlite_impl,
+)
 from torchao.quantization.quant_primitives import dequantize_affine
 from torchao.utils import (
     fill_defaults,
@@ -135,6 +139,10 @@ def _register_aqt_quantized_linear_dispatches():
             _linear_int8_act_int4_weight_marlin_qqq_check,
             _linear_int8_act_int4_weight_marlin_qqq_impl,
         ),
+        (
+            _linear_fp_act_int4_weight_gemlite_check,
+            _linear_fp_act_int4_weight_gemlite_impl,
+        )
     ]:
         register_aqt_quantized_linear_dispatch(dispatch_condition, impl)
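For reference, each entry registered above pairs a check predicate with an implementation; the dispatcher routes a quantized linear call to the first pair whose check accepts the (activation, weight, bias) triple. The real gemlite pair lives in torchao/dtypes/uintx/gemlite_layout.py; the simplified pair below only sketches the expected shape and is not the commit's code:

import torch

def _example_gemlite_check(input_tensor, weight_tensor, bias):
    # accept only the combinations the kernel can handle (illustrative)
    return input_tensor.dtype == torch.float16 and hasattr(weight_tensor, "tensor_impl")

def _example_gemlite_impl(input_tensor, weight_tensor, bias):
    # stand-in for the triton kernel call: dequantize and fall back to F.linear
    return torch.nn.functional.linear(input_tensor, weight_tensor.dequantize(), bias)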
