 import pytest
 import torch
 
-from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleMethod, ScaleFormat, _hw_aligned_scale_methods
+from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import ScaleMethod, ScaleFormat, _hw_aligned_scale_methods, _quant_only_scale_methods
 from neural_compressor.torch.algorithms.fp8_quant._core.scale_handler import scale_to_scalar
+from neural_compressor.torch.algorithms.fp8_quant._core.quant_dequant import QuantDynamicInput
 
 from ...test_hpu_utils import *
 from ...tester import *
 
 
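+# Scale methods that this test treats as supported only when dynamic quantization is enabled.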
+SUPPORTED_DYNAMIC_SCALES = [ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW]
+# Test class that exposes the scale calculated at runtime during dynamic quantization so its correctness can be tested.
+# This is a workaround to avoid saving the scale in the original QuantDynamicInput class, since saving the scale may cause unwanted graph breaks in torch.compile or issues with hpu_graph.
+class TestQuantDynamicInput(QuantDynamicInput):
+    def __init__(self, input_scales_creator, lp_dtype, hp_dtype, *args, **kwargs):
+        super(TestQuantDynamicInput, self).__init__(input_scales_creator, lp_dtype, hp_dtype, *args, **kwargs)
+        self.input_scale = None
+    def forward(self, x):
+        ret, scale = super().forward(x)
+        # Save the scale calculated during this forward pass so its correctness can be tested.
+        self.input_scale = scale
+        return ret, scale
+
 def get_test_vectors(*, dtype: torch.dtype, N: int, D_in: int, atol: float = 0.02, rtol: float = 0.01) -> typing.Iterable[TestVector]:
     yield TestVector(
         inputs=[torch.ones(N, D_in, dtype=dtype, device="hpu", requires_grad=False)],
@@ -24,14 +38,16 @@ def get_test_vectors(*, dtype: torch.dtype, N: int, D_in: int, atol: float = 0.0
         rtol=rtol,
     )
 
-def check_tests_to_skip(scale_method, scale_format, dynamic_quantization):
+def check_tests_to_skip(scale_method, scale_format, dynamic_quantization, device_type=None):
     if scale_method in SCALE_METHODS_KEY_ERROR:
         pytest.xfail("KeyError")
     # TODO [SW-215692]: Fix segfault
     if scale_format == ScaleFormat.CONST or dynamic_quantization:
         if scale_method in [ScaleMethod.MAXABS_HW_OPT_WEIGHT, ScaleMethod.MAXABS_POW2_OPT_WEIGHT]:
             pytest.xfail("Segfault")
-
+    # TODO [SW-225900]: HW_ALIGNED_SINGLE_SCALE on Gaudi3 fails in the test_linear unit test
+    if scale_method == ScaleMethod.HW_ALIGNED_SINGLE_SCALE and device_type == GAUDI3:
+        pytest.xfail("NoAccuracy")
 
 @pytest.mark.parametrize("hp_dtype", [torch.bfloat16, torch.float32], ids=["bf16", "fp32"])
 @pytest.mark.parametrize("lp_dtype", [torch.float8_e4m3fn], ids=["fp8_e4m3fn"])
@@ -49,12 +65,13 @@ def test_linear_accuracy(
     use_hpu_graphs: bool,
     dynamic_quantization: bool
 ):
-    check_tests_to_skip(scale_method, scale_format, dynamic_quantization)
+    check_tests_to_skip(scale_method, scale_format, dynamic_quantization, device_type)
     quant_modes = QUANT_MODES_DEFAULT
     atol = 0.022
     rtol = 0.175
     if scale_method == ScaleMethod.MAXABS_ARBITRARY:
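+        # MAXABS_ARBITRARY needs looser tolerances than the defaults above.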
         atol = 0.078
+        rtol = 0.3
     if scale_method in SCALE_METHODS_QUANT_ONLY or dynamic_quantization:
         quant_modes = QUANT_MODES_QUANT_ONLY
     if scale_method == ScaleMethod.HW_ALIGNED_SINGLE_SCALE:
@@ -84,21 +101,27 @@ def run():
         use_hpu_graphs=use_hpu_graphs,
         dynamic_quantization=dynamic_quantization
     )
-    if get_device_type() != device_type_id[device_type] and scale_method != ScaleMethod.MAXABS_HW:
-        return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method: ")
-    elif device_type_id[device_type] != get_device_type():
-        if not (device_type_id[device_type] == get_gaudi2_type() and is_gaudi3()):
+
+    if scale_method == ScaleMethod.MAXABS_HW:
+        if device_type_id[device_type] == get_gaudi3_type() and is_gaudi2():
+            # Gaudi3 scales are not supported on Gaudi2, so "device_for_scales: Gaudi3" is not supported on a Gaudi2 run
             return run_with_raised_exception(run, ValueError, "Unsupported config: device_for_scales=")
-    elif scale_method == ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW and not dynamic_quantization:
-        return run_with_raised_exception(run, ValueError, "Unsupported config: scale method ScaleMethod.ACT_MAXABS_PCS_POW2_WEIGHT_MAXABS_PTS_POW2_HW")
-    # TODO [SW-222725]: support HW aligned rounding in dynamic quantization
-    elif dynamic_quantization and scale_method in _hw_aligned_scale_methods:
-        return run_with_raised_exception(run, ValueError, "is not supported in dynamic quantization")
+    else:
+        if get_device_type() != device_type_id[device_type]:
+            # For scale methods other than MAXABS_HW, device_for_scales is not supported, so this scale_method config fails
+            return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
+
+    if dynamic_quantization:
+        if scale_method in _hw_aligned_scale_methods or scale_method in _quant_only_scale_methods:
+            # Dynamic quantization does not support HW-aligned scale methods or quant-only (unit-scale) methods
+            return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
+    else:
+        if scale_method in SUPPORTED_DYNAMIC_SCALES:
+            # Static quantization does not support dynamic scale methods
+            return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
     return run()
 
 
-#TODO [SW-225078]: Reeanable test, find a way to test scales in dynamic quantization
-@pytest.mark.skip("[SW-225078] Find a way to test scales in dynamic quantization")
 @pytest.mark.parametrize("hp_dtype", [torch.bfloat16, torch.float32], ids=["bf16", "fp32"])
 @pytest.mark.parametrize("lp_dtype", [torch.float8_e4m3fn], ids=["fp8_e4m3fn"])
 @pytest.mark.parametrize("scale_method", ScaleMethod)
@@ -126,8 +149,6 @@ def test_linear_dynamic_quantization(
     }
     def run():
         test_vectors = get_test_vectors(dtype=hp_dtype, N=N, D_in=D_in)
-        import neural_compressor.torch.algorithms.fp8_quant.prepare_quant.prepare_model as prepare_model
-
         dynamic_quantized_model = WrapModel(module_class, None, **module_kwargs)
         dynamic_quantized_model = setup_quantization(
            dynamic_quantized_model,
@@ -141,25 +162,26 @@ def run():
             **module_kwargs,
         )
         previous_input_dynamic_scale = 0
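+        # Swap in the test wrapper so the input scale computed at runtime is exposed for verification.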
+        test_quant_dynamic_input = TestQuantDynamicInput(dynamic_quantized_model.inner.quant_input.input_scales_creator,
+                                                         dynamic_quantized_model.inner.quant_input.lp_dtype,
+                                                         dynamic_quantized_model.inner.quant_input.hp_dtype)
+        dynamic_quantized_model.inner.quant_input = test_quant_dynamic_input
 
         for vector in test_vectors:
             dynamic_quantized_output = dynamic_quantized_model(*(input.clone() for input in vector.inputs)).to(float)
+            # Read back the scale that dynamic_quantized_model calculated for the current input.
+            # On the next iteration the test wrapper will hold a newly calculated scale.
+            current_input_dynamic_scale = dynamic_quantized_model.inner.quant_input.input_scale
 
-            current_input_dynamic_scale = dynamic_quantized_model.inner.scale_input
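+            # The scale may be a tensor; convert it to a scalar before comparing consecutive scales.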
             if isinstance(current_input_dynamic_scale, torch.Tensor):
                 current_input_dynamic_scale = scale_to_scalar(current_input_dynamic_scale)
             if scale_method not in SCALE_METHODS_QUANT_ONLY:
                 assert previous_input_dynamic_scale != current_input_dynamic_scale, f"input scales in dynamic quantization should differ in different tensors {previous_input_dynamic_scale=} {current_input_dynamic_scale=}"
             previous_input_dynamic_scale = current_input_dynamic_scale
 
-        prepare_model.finish_measurements(dynamic_quantized_model)
+    if (device_type_id[device_type] == get_gaudi3_type() and is_gaudi2() and scale_method == ScaleMethod.MAXABS_HW):
+        return run_with_raised_exception(run, ValueError, "Unsupported config: device_for_scales=")
+    if (get_device_type() != device_type_id[device_type]) or scale_method in _hw_aligned_scale_methods or scale_method in _quant_only_scale_methods:
+        return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method")
 
-    if get_device_type() != device_type_id[device_type] and scale_method != ScaleMethod.MAXABS_HW:
-        return run_with_raised_exception(run, ValueError, "Unsupported config: scale_method: ")
-    elif device_type_id[device_type] != get_device_type():
-        if not (device_type_id[device_type] == get_gaudi2_type() and is_gaudi3()):
-            return run_with_raised_exception(run, ValueError, "Unsupported config: device_for_scales=")
-    # TODO [SW-222725]: support HW aligned rounding in dynamic quantization
-    elif scale_method in _hw_aligned_scale_methods:
-        return run_with_raised_exception(run, ValueError, "is not supported in dynamic quantization")
-    return run()
+    return run()