
Commit 0802465

[SW-225078] [INC][DynamicQuant] Reenable testing dynamic quantization… (#214)
* [SW-225078] [INC][DynamicQuant] Reenable testing dynamic quantization scales on hpu graphs and torch.compile
* CR fixes
* tiny fix
* cr fix
* don't support running _quant_only_scale_methods with dynamic quantization
* string check fix
* fix test_matmul runs and atol in HW_ALIGNED_SINGLE_SCALE
* string fixes
1 parent c3c64f1 commit 0802465
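For context: in this stack, dynamic quantization means the input scale is computed from each incoming tensor at runtime instead of being taken from offline-measured statistics, which is why the tests below assert that the scale changes from input to input. A conceptual sketch of that idea (illustrative only, not the fp8_quant implementation; the max-abs rule and FP8_MAX constant are assumptions):

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0; the real HW limit may differ

def dynamic_quant(x: torch.Tensor):
    # The scale is recomputed from the live tensor on every call,
    # so no calibration/measurement phase is needed.
    scale = x.abs().amax().clamp(min=1e-12) / FP8_MAX
    q = torch.clamp(x / scale, -FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return q, scale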

File tree

6 files changed: +87 -51 lines

neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py

Lines changed: 4 additions & 4 deletions
@@ -266,17 +266,17 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg:
             # TODO: "Linear only" in types still causes issues as llama7b quantizes also self_attn,
             # which should be blocked for some reason. We might then want to set measured_global_config["allowlist"]["types"] = supported_dynamic_ops
             # TODO [SW-222725]: support HW aligned rounding in dynamic quantization
-            if scale_method in _hw_aligned_scale_methods:
+            if scale_method in _hw_aligned_scale_methods or scale_method in _quant_only_scale_methods:
                 raise ValueError(
-                    f"Unsupported config: scale method {scale_method} is not supported in dynamic quantization"
+                    f"Unsupported config: scale_method {scale_method} is not supported in dynamic quantization"
                 )
             #TODO [SW-224403]: enable dynamic quantization in row parallel allreduce
             if measured_global_config["row_parallel_linear_allreduce_quantization"]:
                 raise ValueError(f"Dynamic quantization is not supported when using row_parallel_linear_allreduce_quantization")
         else:
             if scale_method == ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW:
                 raise ValueError(
-                    f"Unsupported config: scale method {scale_method} is supported only in dynamic quantization"
+                    f"Unsupported config: scale_method {scale_method} is supported only in dynamic quantization"
                 )

         if scale_method in _quant_only_scale_methods or dynamic_quantization:

@@ -353,7 +353,7 @@ def set_gaudi_device_for_scales(custom_config, measured_global_config):
     # Currently, only maxabs_hw is supported for a different device scales configuration
     if measured_global_config["scale_method"] != ScaleMethod.MAXABS_HW:
         raise ValueError(
-            f"Unsupported config: scale_method: {measured_global_config['scale_method']} "
+            f"Unsupported config: scale_method {measured_global_config['scale_method']} "
             f"for scale device overriding: {measured_global_config['device_for_scales']}"
         )
     if not (
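The rule above can be read in isolation: under dynamic quantization, HW-aligned and quant-only scale methods are rejected, while the dynamic-only method is rejected everywhere else. A standalone sketch of that validation logic (the enum members and sets are illustrative stand-ins, not the real quant_config definitions):

from enum import Enum, auto

class ScaleMethod(Enum):
    MAXABS_HW = auto()      # stand-in for an HW-aligned method
    UNIT_SCALE = auto()     # stand-in for a quant-only method
    ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW = auto()  # the dynamic-only method

_hw_aligned_scale_methods = {ScaleMethod.MAXABS_HW}
_quant_only_scale_methods = {ScaleMethod.UNIT_SCALE}

def validate(scale_method: ScaleMethod, dynamic_quantization: bool) -> None:
    if dynamic_quantization:
        if scale_method in _hw_aligned_scale_methods or scale_method in _quant_only_scale_methods:
            raise ValueError(
                f"Unsupported config: scale_method {scale_method} is not supported in dynamic quantization"
            )
    elif scale_method == ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW:
        raise ValueError(
            f"Unsupported config: scale_method {scale_method} is supported only in dynamic quantization"
        )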

test/3x/torch/algorithms/fp8_quant/unit_tests/test_functions/test_config_json.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ def run_predefined_config():
     if scale_method in SCALE_METHODS_KEY_ERROR and quant_mode == QuantMode.QUANTIZE:
         run_with_raised_exception(run_predefined_config, KeyError, "(<ScaleMethod.")
     elif scale_method == ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW:
-        return run_with_raised_exception(run_predefined_config, ValueError, "Unsupported config: scale method ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW")
+        return run_with_raised_exception(run_predefined_config, ValueError, "Unsupported config: scale_method")
     # This is an expected exception, quant only methods support only quantization
     elif scale_method in SCALE_METHODS_QUANT_ONLY and quant_mode not in [QuantMode.QUANTIZE, QuantMode.LOAD]:
         run_with_raised_exception(run_predefined_config, ValueError, "Unexpected behavior. This scale method doesn't require measurements.")
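run_with_raised_exception comes from the shared test utilities and is not part of this diff; a plausible minimal equivalent (an assumption, shown only to make the substring-matching behavior concrete) is:

import pytest

def run_with_raised_exception(test_fn, exception_type, message_part):
    # Run test_fn, require exception_type, and require message_part in its message.
    with pytest.raises(exception_type) as excinfo:
        test_fn()
    assert message_part in str(excinfo.value)

Matching only the stable prefix "Unsupported config: scale_method" rather than the full message is what lets the error strings change, as they do in this commit, without breaking every caller.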

test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_conv2d.py

Lines changed: 2 additions & 2 deletions
@@ -61,10 +61,10 @@ def run():
             device_type=device_type,
         )
     if get_device_type() != device_type_id[device_type] and scale_method != ScaleMethod.MAXABS_HW:
-        return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method: ")
+        return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
     elif device_type_id[device_type] != get_device_type():
         if not (device_type_id[device_type] == get_gaudi2_type() and is_gaudi3()):
             return run_with_raised_exception(run, ValueError, "Unsupported config: device_for_scales=")
     elif scale_method == ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW:
-        return run_with_raised_exception(run, ValueError, "Unsupported config: scale method ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW")
+        return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW")
     return run()

test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_linear.py

Lines changed: 50 additions & 28 deletions
@@ -3,13 +3,27 @@
 import pytest
 import torch

-from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleMethod, ScaleFormat, _hw_aligned_scale_methods
+from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleMethod, ScaleFormat, _hw_aligned_scale_methods, _quant_only_scale_methods
 from neural_compressor.torch.algorithms.fp8_quant._core.scale_handler import scale_to_scalar
+from neural_compressor.torch.algorithms.fp8_quant._core.quant_dequant import QuantDynamicInput

 from ...test_hpu_utils import *
 from ...tester import *


+SUPPORTED_DYNAMIC_SCALES = [ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW]
+# Test class that restores the scale calculated at runtime with dynamic quantization so its correctness can be tested.
+# This is a workaround to avoid saving the scale in the original QuantDynamicInput class, as scale saving may cause unwanted graph breaks in torch.compile or issues with hpu_graph.
+class TestQuantDynamicInput(QuantDynamicInput):
+    def __init__(self, input_scales_creator, lp_dtype, hp_dtype, *args, **kwargs):
+        super(TestQuantDynamicInput, self).__init__(input_scales_creator, lp_dtype, hp_dtype, *args, **kwargs)
+        self.input_scale = None
+    def forward(self, x):
+        ret, scale = super().forward(x)
+        # Save the scale calculated during this forward pass so its correctness can be tested.
+        self.input_scale = scale
+        return ret, scale
+
 def get_test_vectors(*, dtype: torch.dtype, N: int, D_in: int, atol: float = 0.02, rtol: float = 0.01) -> typing.Iterable[TestVector]:
     yield TestVector(
         inputs=[torch.ones(N, D_in, dtype=dtype, device="hpu", requires_grad=False)],
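The subclass above captures the runtime scale without adding state to the production QuantDynamicInput, whose forward must stay side-effect free for torch.compile and hpu_graph. A self-contained sketch of the same capture-by-subclassing pattern (generic stand-in names, not the fp8_quant API):

import torch

class Quantizer(torch.nn.Module):
    # production-style module: computes a dynamic scale, stores nothing on self
    def forward(self, x):
        scale = x.abs().amax().clamp(min=1e-12)
        return x / scale, scale

class CapturingQuantizer(Quantizer):
    # test double: records the last scale; the side effect lives only in the test
    def __init__(self):
        super().__init__()
        self.last_scale = None
    def forward(self, x):
        out, scale = super().forward(x)
        self.last_scale = scale
        return out, scale

q = CapturingQuantizer()
_, s = q(torch.randn(4))
assert torch.equal(q.last_scale, s)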
@@ -24,14 +38,16 @@ def get_test_vectors(*, dtype: torch.dtype, N: int, D_in: int, atol: float = 0.0
         rtol=rtol,
     )

-def check_tests_to_skip(scale_method, scale_format, dynamic_quantization):
+def check_tests_to_skip(scale_method, scale_format, dynamic_quantization, device_type=None):
     if scale_method in SCALE_METHODS_KEY_ERROR:
         pytest.xfail("KeyError")
     # TODO [SW-215692]: Fix segfault
     if scale_format == ScaleFormat.CONST or dynamic_quantization:
         if scale_method in [ScaleMethod.MAXABS_HW_OPT_WEIGHT, ScaleMethod.MAXABS_POW2_OPT_WEIGHT]:
             pytest.xfail("Segfault")
-
+    # TODO [SW-225900] HW_ALIGNED_SINGLE_SCALE on gaudi3 fails in test_linear unit test
+    if scale_method == ScaleMethod.HW_ALIGNED_SINGLE_SCALE and device_type == GAUDI3:
+        pytest.xfail("NoAccuracy")

 @pytest.mark.parametrize("hp_dtype", [torch.bfloat16, torch.float32], ids=["bf16", "fp32"])
 @pytest.mark.parametrize("lp_dtype", [torch.float8_e4m3fn], ids=["fp8_e4m3fn"])
@@ -49,12 +65,13 @@ def test_linear_accuracy(
     use_hpu_graphs: bool,
     dynamic_quantization: bool
 ):
-    check_tests_to_skip(scale_method, scale_format, dynamic_quantization)
+    check_tests_to_skip(scale_method, scale_format, dynamic_quantization, device_type)
     quant_modes = QUANT_MODES_DEFAULT
     atol = 0.022
     rtol = 0.175
     if scale_method == ScaleMethod.MAXABS_ARBITRARY:
         atol = 0.078
+        rtol = 0.3
     if scale_method in SCALE_METHODS_QUANT_ONLY or dynamic_quantization:
         quant_modes = QUANT_MODES_QUANT_ONLY
     if scale_method == ScaleMethod.HW_ALIGNED_SINGLE_SCALE:
@@ -84,21 +101,27 @@ def run():
             use_hpu_graphs=use_hpu_graphs,
             dynamic_quantization=dynamic_quantization
         )
-    if get_device_type() != device_type_id[device_type] and scale_method != ScaleMethod.MAXABS_HW:
-        return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method: ")
-    elif device_type_id[device_type] != get_device_type():
-        if not (device_type_id[device_type] == get_gaudi2_type() and is_gaudi3()):
+
+    if scale_method == ScaleMethod.MAXABS_HW:
+        if device_type_id[device_type] == get_gaudi3_type() and is_gaudi2():
+            # Gaudi3 scales are not supported on Gaudi2, so "device_for_scales: Gaudi3" fails on a Gaudi2 run
             return run_with_raised_exception(run, ValueError, "Unsupported config: device_for_scales=")
-    elif scale_method == ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW and not dynamic_quantization:
-        return run_with_raised_exception(run, ValueError, "Unsupported config: scale method ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW")
-    # TODO [SW-222725]: support HW aligned rounding in dynamic quantization
-    elif dynamic_quantization and scale_method in _hw_aligned_scale_methods:
-        return run_with_raised_exception(run, ValueError, "is not supported in dynamic quantization")
+    else:
+        if get_device_type() != device_type_id[device_type]:
+            # Scale methods other than MAXABS_HW don't support device_for_scales, so this scale_method config fails
+            return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
+
+    if dynamic_quantization:
+        if scale_method in _hw_aligned_scale_methods or scale_method in _quant_only_scale_methods:
+            # Dynamic quantization supports neither HW aligned scale methods nor unit scale
+            return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
+    else:
+        if scale_method in SUPPORTED_DYNAMIC_SCALES:
+            # Static quantization doesn't support dynamic-only scale methods
+            return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
     return run()


-#TODO [SW-225078]: Reeanable test, find a way to test scales in dynamic quantization
-@pytest.mark.skip("[SW-225078] Find a way to test scales in dynamic quantization")
 @pytest.mark.parametrize("hp_dtype", [torch.bfloat16, torch.float32], ids=["bf16", "fp32"])
 @pytest.mark.parametrize("lp_dtype", [torch.float8_e4m3fn], ids=["fp8_e4m3fn"])
 @pytest.mark.parametrize("scale_method", ScaleMethod)
@@ -126,8 +149,6 @@ def test_linear_dynamic_quantization(
     }
     def run():
         test_vectors=get_test_vectors(dtype=hp_dtype, N=N, D_in=D_in)
-        import neural_compressor.torch.algorithms.fp8_quant.prepare_quant.prepare_model as prepare_model
-
         dynamic_quantized_model = WrapModel(module_class, None, **module_kwargs)
         dynamic_quantized_model = setup_quantization(
@@ -141,25 +162,26 @@ def run():
             **module_kwargs,
         )
         previous_input_dynamic_scale = 0
+        test_quant_dynamic_input = TestQuantDynamicInput(dynamic_quantized_model.inner.quant_input.input_scales_creator,
+                                                         dynamic_quantized_model.inner.quant_input.lp_dtype,
+                                                         dynamic_quantized_model.inner.quant_input.hp_dtype)
+        dynamic_quantized_model.inner.quant_input = test_quant_dynamic_input

         for vector in test_vectors:
             dynamic_quantized_output = dynamic_quantized_model(*(input.clone() for input in vector.inputs)).to(float)
+            # Save the scale calculated while dynamic_quantized_model ran on the current input;
+            # the next iteration will store a new scale in the class.
+            current_input_dynamic_scale = dynamic_quantized_model.inner.quant_input.input_scale

-            current_input_dynamic_scale = dynamic_quantized_model.inner.scale_input
             if isinstance(current_input_dynamic_scale, torch.Tensor):
                 current_input_dynamic_scale = scale_to_scalar(current_input_dynamic_scale)
             if scale_method not in SCALE_METHODS_QUANT_ONLY:
                 assert previous_input_dynamic_scale != current_input_dynamic_scale, f"input scales in dynamic quantization should differ in different tensors {previous_input_dynamic_scale=} {current_input_dynamic_scale=}"
             previous_input_dynamic_scale = current_input_dynamic_scale

-        prepare_model.finish_measurements(dynamic_quantized_model)
+    if (device_type_id[device_type] == get_gaudi3_type() and is_gaudi2() and scale_method == ScaleMethod.MAXABS_HW):
+        return run_with_raised_exception(run, ValueError, "Unsupported config: device_for_scales=")
+    if (get_device_type() != device_type_id[device_type]) or scale_method in _hw_aligned_scale_methods or scale_method in _quant_only_scale_methods:
+        return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")

-    if get_device_type() != device_type_id[device_type] and scale_method != ScaleMethod.MAXABS_HW:
-        return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method: ")
-    elif device_type_id[device_type] != get_device_type():
-        if not (device_type_id[device_type] == get_gaudi2_type() and is_gaudi3()):
-            return run_with_raised_exception(run, ValueError, "Unsupported config: device_for_scales=")
-    # TODO [SW-222725]: support HW aligned rounding in dynamic quantization
-    elif scale_method in _hw_aligned_scale_methods:
-        return run_with_raised_exception(run, ValueError, "is not supported in dynamic quantization")
-    return run()
+    return run()
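The loop above relies on a property of dynamic scaling that is easy to state in isolation: because the scale is derived from the tensor itself, two inputs with different ranges should produce different scales. A toy demonstration (the max-abs rule and fp8_max value are assumptions, not fp8_quant code):

import torch

def maxabs_scale(t: torch.Tensor, fp8_max: float = 448.0) -> float:
    # a dynamic scale tracks the input's own range
    return (t.abs().amax() / fp8_max).item()

a = torch.full((4, 4), 2.0)
b = torch.full((4, 4), 8.0)
assert maxabs_scale(a) != maxabs_scale(b)  # different inputs -> different scales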

test/3x/torch/algorithms/fp8_quant/unit_tests/test_layers/test_matmul.py

Lines changed: 28 additions & 14 deletions
@@ -3,33 +3,35 @@
 import pytest
 import torch

-from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleMethod, _hw_aligned_scale_methods
+from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleMethod, _hw_aligned_scale_methods, _quant_only_scale_methods

 from ...test_hpu_utils import *
 from ...tester import *

+SUPPORTED_DYNAMIC_SCALES = [ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW]

-def get_test_vectors(*, dtype: torch.dtype) -> typing.Iterable[TestVector]:
+
+def get_test_vectors(*, dtype: torch.dtype, atol) -> typing.Iterable[TestVector]:
     yield TestVector(
         inputs=[
             torch.eye(2, dtype=dtype, device="hpu"),
             torch.eye(2, dtype=dtype, device="hpu"),
         ],
-        atol=0.2,
+        atol=atol,
     )
     yield TestVector(
         inputs=[
             torch.randn((2, 2), dtype=dtype, device="hpu"),
             torch.randn((2, 2), dtype=dtype, device="hpu"),
         ],
-        atol=0.2,
+        atol=atol,
     )
     yield TestVector(
         inputs=[
             torch.eye(2, dtype=dtype, device="hpu"),
             torch.randn((2, 2), dtype=dtype, device="hpu"),
         ],
-        atol=0.2,
+        atol=atol,
     )

@@ -57,25 +59,37 @@ def test_matmul_accuracy(hp_dtype: torch.dtype, lp_dtype: torch.dtype, scale_met
     if scale_method in SCALE_METHODS_KEY_ERROR:
         pytest.xfail("KeyError")
     quant_modes = QUANT_MODES_DEFAULT
+    atol = 0.2
     if scale_method in SCALE_METHODS_QUANT_ONLY or dynamic_quantization:
         quant_modes = QUANT_MODES_QUANT_ONLY
+    if scale_method == ScaleMethod.HW_ALIGNED_SINGLE_SCALE:
+        atol = 1.0
     def run():
         run_accuracy_test(
             module_class=Matmul,
             lp_dtype=lp_dtype,
             scale_method=scale_method,
-            test_vectors=get_test_vectors(dtype=hp_dtype),
+            test_vectors=get_test_vectors(dtype=hp_dtype, atol=atol),
             quant_modes=quant_modes,
             device_type=device_type,
             dynamic_quantization=dynamic_quantization,
         )
-    if get_device_type() != device_type_id[device_type] and scale_method != ScaleMethod.MAXABS_HW:
-        return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method: ")
-    elif device_type_id[device_type] != get_device_type():
-        if not (device_type_id[device_type] == get_gaudi2_type() and is_gaudi3()):
+
+    if scale_method == ScaleMethod.MAXABS_HW:
+        if device_type_id[device_type] == get_gaudi3_type() and is_gaudi2():
+            # Gaudi3 scales are not supported on Gaudi2, so "device_for_scales: Gaudi3" fails on a Gaudi2 run
             return run_with_raised_exception(run, ValueError, "Unsupported config: device_for_scales=")
-    elif scale_method == ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW and not dynamic_quantization:
-        return run_with_raised_exception(run, ValueError, "Unsupported config: scale method ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW")
-    elif dynamic_quantization and scale_method in _hw_aligned_scale_methods:
-        return run_with_raised_exception(run, ValueError, "is not supported in dynamic quantization")
+    else:
+        if get_device_type() != device_type_id[device_type]:
+            # Scale methods other than MAXABS_HW don't support device_for_scales, so this scale_method config fails
+            return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
+
+    if dynamic_quantization:
+        if scale_method in _hw_aligned_scale_methods or scale_method in _quant_only_scale_methods:
+            # Dynamic quantization supports neither HW aligned scale methods nor unit scale
+            return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
+    else:
+        if scale_method in SUPPORTED_DYNAMIC_SCALES:
+            # Static quantization doesn't support dynamic-only scale methods
+            return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
     return run()
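The looser atol = 1.0 for HW_ALIGNED_SINGLE_SCALE is plausible because a single fixed scale cannot adapt to each tensor's range, so out-of-range values get clipped. A hypothetical back-of-envelope illustration (not fp8_quant code; clamping before the cast keeps out-of-range values at the fp8 max instead of relying on the cast's overflow behavior):

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0

def qdq(t: torch.Tensor, scale: float) -> torch.Tensor:
    # quantize-dequantize round trip at a given scale
    q = torch.clamp(t / scale, -FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return q.float() * scale

x = torch.tensor([0.5, 2.0, 300.0, 500.0])
print((x - qdq(x, scale=1.0)).abs())                       # fixed scale: 500.0 is clipped to 448
print((x - qdq(x, x.abs().max().item() / FP8_MAX)).abs())  # adaptive scale: no clipping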

test/3x/torch/algorithms/fp8_quant/unit_tests/test_runtime_scale_patching.py

Lines changed: 2 additions & 2 deletions
@@ -52,7 +52,7 @@ def temp_directory():
 @pytest.mark.parametrize("scale_format", ["SCALAR", "CONST"])
 @pytest.mark.parametrize("dynamic_scale_patching", [True, False])
 def test_no_assert(scale_method, scale_format, dynamic_scale_patching, temp_directory):
-    if scale_method in SCALE_METHODS_KEY_ERROR:
+    if scale_method in SCALE_METHODS_KEY_ERROR :
         pytest.xfail("KeyError")
     model = TinyModel()
     model.eval()

@@ -90,7 +90,7 @@ def run_convert():
     finalize_calibration(model)

     if scale_method == ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW:
-        return run_with_raised_exception(run_convert, ValueError, "Unsupported config: scale method ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW")
+        return run_with_raised_exception(run_convert, ValueError, "Unsupported config: scale_method")
     if dynamic_scale_patching:
         os.environ["RUNTIME_SCALE_PATCHING"] = "1"
     if not scale_method in RUNTIME_SCALE_PATCHING_SUPPORTED_METHODS_LIST:
