
Commit 04a25e7

Benchmarking (#1353)

Summary: Add benchmarks for experimental torchao kernels.

Differential Revision: D66512859
1 parent 5eb6339 commit 04a25e7

File tree

3 files changed (+76, -1 lines)


torchao/_models/llama/generate.py

Lines changed: 24 additions & 1 deletion
```diff
@@ -217,7 +217,6 @@ def main(
             float8_weight_only,
             float8_dynamic_activation_float8_weight,
         )
-        from torchao.prototype.quantization.autoquant_v2 import autoquant_v2
         from torchao.utils import unwrap_tensor_subclass
 
         from torchao.quantization.granularity import PerTensor, PerRow
@@ -297,6 +296,29 @@ def main(
             dtype = _NBITS_TO_DTYPE[nbits]
             group_size = int(_quant_args[2])
             quantize_(model, uintx_weight_only(dtype, group_size, use_hqq=use_hqq))
+        elif "int8_dynamic_activation_intx_weight" in quantization:
+            from torchao.experimental.quant_api import int8_dynamic_activation_intx_weight
+            assert precision == torch.float32, "int8_dynamic_activation_intx_weight requires fp32 precision"
+
+            # Build kernels in temp location, and load them in torch
+            # This requires an ARM CPU
+            from torchao.experimental.temp_build import temp_build_and_load_torchao_ops
+            temp_build_and_load_torchao_ops(cmake_lists_path=os.path.dirname(os.path.realpath(__file__)) + "/../../experimental")
+
+            # Quantize model
+            _quant_args = quantization.split("-")
+            nbit = int(_quant_args[1])
+            assert nbit >= 1 and nbit <= 8, "nbits must be 1 to 8"
+            group_size = int(_quant_args[2])
+            has_weight_zeros = bool(_quant_args[3])
+            quantize_(
+                model,
+                int8_dynamic_activation_intx_weight(
+                    group_size=group_size,
+                    nbit=nbit,
+                    has_weight_zeros=has_weight_zeros,
+                ),
+            )
         elif "float8wo" in quantization:
             quantize_(model, float8_weight_only())
         elif "float8dq" in quantization:
@@ -309,6 +331,7 @@ def main(
                 granularity = PerTensor()
             quantize_(model, float8_dynamic_activation_float8_weight(granularity=granularity))
         elif "autoquant_v2" in quantization:
+            from torchao.prototype.quantization.autoquant_v2 import autoquant_v2
             from torchao._models._eval import InputRecorder
             from torchao._models.llama.model import prepare_inputs_for_model
 
```
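For reference, the new branch is driven by a flag of the form `int8_dynamic_activation_intx_weight-<nbit>-<group_size>-<has_weight_zeros>`. Below is a minimal standalone sketch of that parsing with an illustrative flag value; note that `bool(_quant_args[3])` in the committed branch is truthy for any non-empty string (including `"false"`), so the sketch uses an explicit comparison instead.

```python
# Standalone sketch of the flag parsing in the new generate.py branch.
quantization = "int8_dynamic_activation_intx_weight-4-256-false"  # illustrative flag

_quant_args = quantization.split("-")
nbit = int(_quant_args[1])        # 4 -> weights quantized to 4 bits
assert 1 <= nbit <= 8, "nbits must be 1 to 8"
group_size = int(_quant_args[2])  # 256 -> one scale per group of 256 weights

# bool("false") is True in Python, so parse the boolean field explicitly.
has_weight_zeros = _quant_args[3].lower() == "true"

print(nbit, group_size, has_weight_zeros)  # 4 256 False
```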
torchao/experimental/temp_build.py

Lines changed: 43 additions & 0 deletions
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import glob
import subprocess
import tempfile

import torch


def cmake_build_torchao_ops(cmake_lists_path, temp_build_dir):
    from distutils.sysconfig import get_python_lib

    print("Building torchao ops for ATen target")
    cmake_prefix_path = get_python_lib()
    subprocess.run(
        [
            "cmake",
            "-DCMAKE_PREFIX_PATH=" + cmake_prefix_path,
            "-DCMAKE_INSTALL_PREFIX=" + temp_build_dir.name,
            "-S " + cmake_lists_path,
            "-B " + temp_build_dir.name,
        ]
    )
    subprocess.run(
        [
            "cmake",
            "--build",
            temp_build_dir.name,
            "-j 16",
            "--target install",
            "--config Release",
        ]
    )


def temp_build_and_load_torchao_ops(cmake_lists_path):
    temp_build_dir = tempfile.TemporaryDirectory()
    cmake_build_torchao_ops(cmake_lists_path, temp_build_dir)
    libs = glob.glob(f"{temp_build_dir.name}/lib/libtorchao_ops_aten.*")
    libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs))
    assert len(libs) == 1
    torch.ops.load_library(libs[0])
    print(f"TorchAO ops are loaded from {libs[0]}")
```
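A usage sketch for the helper above, mirroring the call added to `generate.py`; the relative path assumes the calling script sits two directories below the repo root, and an ARM CPU is required.

```python
import os

from torchao.experimental.temp_build import temp_build_and_load_torchao_ops

# Build the experimental ATen ops into a temporary directory and load the
# resulting shared library into the current torch process.
cmake_lists_path = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "..", "..", "experimental"
)
temp_build_and_load_torchao_ops(cmake_lists_path=cmake_lists_path)
```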

torchao/quantization/README.md

Lines changed: 9 additions & 0 deletions

You can try out these APIs with the `quantize_` API as above alongside the constructor `uintx_weight_only`; an example can be found in `torchao/_models/llama/generate.py`.
### int8_dynamic_activation_intx_weight Quantization

We have kernels that do 8-bit dynamic quantization of activations and uintx groupwise quantization of weights. These kernels are experimental and can only be run on devices with an ARM CPU (e.g., a Mac with Apple silicon). The benchmarks below were run on an M1 Mac Pro with 8 performance cores, 2 efficiency cores, and 32 GB of RAM. In all cases, torch.compile was used.

| Model        | Technique                                        | Tokens/Second | Memory Bandwidth (GB/s) | Peak Memory (GB) | Model Size (GB) |
| ------------ | ------------------------------------------------ | ------------- | ----------------------- | ---------------- | --------------- |
| Llama-3.1-8B | Base (bfloat16)                                  | 1.24          | 18.62                   | NA               | 15.01           |
|              | int8_dynamic_activation_intx_weight-4-256-false  | 16.03         | 65.81                   | NA               | 4.11            |
|              | int8_dynamic_activation_intx_weight-3-256-false  | 18.94         | 59.97                   | NA               | 3.17            |

You can try out these APIs with the `quantize_` API as above alongside the constructor `int8_dynamic_activation_intx_weight`. An example can be found in `torchao/_models/llama/generate.py`.
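A minimal sketch of that flow, assuming the experimental ops have already been built and loaded (see `temp_build.py` above). The single-layer fp32 model is a placeholder; the arguments mirror the `int8_dynamic_activation_intx_weight-4-256-false` row, whose ~4.11 GB model size is roughly what 8B parameters at 4 bits plus groupwise scales works out to.

```python
import torch

from torchao.experimental.quant_api import int8_dynamic_activation_intx_weight
from torchao.quantization import quantize_

# Placeholder fp32 model; the experimental kernels require fp32 precision.
model = torch.nn.Sequential(torch.nn.Linear(4096, 4096)).to(torch.float32)

# 4-bit weights, one scale per group of 256 weights, no weight zero points.
quantize_(
    model,
    int8_dynamic_activation_intx_weight(
        group_size=256,
        nbit=4,
        has_weight_zeros=False,
    ),
)
```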

### Automatic Inductor Configuration
The `quantize_` and `autoquant` APIs now automatically use our recommended inductor configuration settings. You can replicate the same settings for your own experiments with `torchao.quantization.utils.recommended_inductor_config_setter`. Alternatively, you can disable the recommended settings by passing the keyword argument `set_inductor_config=False` to `quantize_` or `autoquant`. You can also overwrite individual settings after they are assigned, as long as you do so before passing any inputs to the compiled model. This means that older flows which manually set a variety of inductor configurations are now outdated, though continuing to set those same configurations is unlikely to cause any issues.
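A short sketch of the options described above; the quantized model and the individual override are illustrative.

```python
import torch

from torchao.quantization import int8_weight_only, quantize_
from torchao.quantization.utils import recommended_inductor_config_setter

model = torch.nn.Sequential(torch.nn.Linear(1024, 1024))

# Default: quantize_ applies the recommended inductor settings automatically.
quantize_(model, int8_weight_only())

# Opt out of the recommended settings instead:
# quantize_(model, int8_weight_only(), set_inductor_config=False)

# Or replicate the recommended settings yourself, then overwrite individual
# ones, but only before the compiled model receives its first inputs.
recommended_inductor_config_setter()
torch._inductor.config.coordinate_descent_tuning = False  # illustrative override
compiled_model = torch.compile(model)
```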
