diff --git a/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt index 14dad3d64e1..82f6986ad51 100644 --- a/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt +++ b/.azure-pipelines/scripts/codeScan/pyspelling/inc_dict.txt @@ -554,6 +554,7 @@ entrypoint enum env environ +ep eq erf Erf diff --git a/docs/source/mixed_precision.md b/docs/source/mixed_precision.md index b10e73f853e..a011633bd1e 100644 --- a/docs/source/mixed_precision.md +++ b/docs/source/mixed_precision.md @@ -20,38 +20,63 @@ The recently launched 3rd Gen Intel® Xeon® Scalable processor (codenamed Coope ## Mixed Precision Support Matrix -|Framework |BF16 | -|--------------|:-----------:| -|TensorFlow |✔ | -|PyTorch |✔ | -|ONNX |plan to support in the future | -|MXNet |✔ | +|Framework |BF16 |FP16 | +|--------------|:-----------:|:-----------:| +|TensorFlow |✔ |:x: | +|PyTorch |✔ |:x: | +|ONNX Runtime |✔ |✔ | +|MXNet |✔ |:x: | -> **During quantization, BF16 conversion is default enabled. Please refer to this [document](./quantization_mixed_precision.md) for its workflow.** +> **During quantization, BF16 conversion is enabled by default; FP16 conversion is executed only when the 'device' of the config is 'gpu'. Please refer to this [document](./quantization_mixed_precision.md) for its workflow.** ## Get Started with Mixed Precision API -To get a bf16 model, users can use the Mixed Precision API as follows. +To get a bf16/fp16 model, users can use the Mixed Precision API as follows. +Supported precisions for mixed precision include bf16 and fp16. To get a pure fp16 or bf16 model, users should add the other precision to excluded_precisions. + +- BF16: + ```python from neural_compressor import mix_precision from neural_compressor.config import MixedPrecisionConfig -conf = MixedPrecisionConfig() +conf = MixedPrecisionConfig(excluded_precisions=['fp16']) +converted_model = mix_precision.fit(model, config=conf) +converted_model.save('./path/to/save/') +``` + +- FP16: +```python +from neural_compressor import mix_precision +from neural_compressor.config import MixedPrecisionConfig + +conf = MixedPrecisionConfig( + backend='onnxrt_cuda_ep', + device='gpu', + excluded_precisions=['bf16']) converted_model = mix_precision.fit(model, config=conf) converted_model.save('./path/to/save/') ``` -> **BF16 conversion may lead to accuracy drop. Intel® Neural Compressor provides an accuracy-aware tuning function to reduce accuracy loss, which will fallback converted ops to FP32 automatically to get better accuracy. To enable this function, users only need to provide an evaluation function (or dataloader + metric).** +> **BF16/FP16 conversion may lead to an accuracy drop. Intel® Neural Compressor provides an accuracy-aware tuning function to reduce accuracy loss, which automatically falls back converted ops to FP32 to get better accuracy. To enable this function, users only need to provide an evaluation function (or dataloader + metric).** ## Examples -There are 2 pre-requirements to run BF16 mixed precision examples: +- BF16: + + There are two prerequisites for running BF16 mixed precision examples: + + 1. Hardware: CPU supports `avx512_bf16` instruction set. + 2. Software: intel-tensorflow >= [2.3.0](https://pypi.org/project/intel-tensorflow/2.3.0/) or torch >= [1.11.0](https://download.pytorch.org/whl/torch_stable.html). + + If either prerequisite is not met, the program will exit. -- Hardware: CPU supports `avx512_bf16` instruction set.
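The accuracy-aware tuning mentioned in the note above only needs a user-provided evaluation function (or dataloader + metric). Below is a minimal sketch, assuming `model` is an already-loaded framework model and that the placeholder validation loop is replaced with a real one:

```python
from neural_compressor import mix_precision
from neural_compressor.config import MixedPrecisionConfig

def eval_func(model):
    # Placeholder: run a real validation loop here and return
    # accuracy as a float (higher is better).
    return 0.9

conf = MixedPrecisionConfig(excluded_precisions=['fp16'])
# With eval_func provided, ops whose conversion hurts accuracy too much
# are automatically fallen back to FP32 during tuning.
converted_model = mix_precision.fit(model, config=conf, eval_func=eval_func)
converted_model.save('./path/to/save/')
```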
-- Software: intel-tensorflow >= [2.3.0](https://pypi.org/project/intel-tensorflow/2.3.0/) or torch >= [1.11.0](https://download.pytorch.org/whl/torch_stable.html). +- FP16 -If either pre-requirement can't be met, the program would exit consequently. + Currently Intel® Neural Compressor only support FP16 mixed precision for ONNX models. + + To run FP16 mixed precision examples, users need to set 'device' of config to 'gpu' and 'backend' to 'onnxrt_cuda_ep'. diff --git a/neural_compressor/adaptor/onnxrt.py b/neural_compressor/adaptor/onnxrt.py index 36966227b28..924da83bd6e 100644 --- a/neural_compressor/adaptor/onnxrt.py +++ b/neural_compressor/adaptor/onnxrt.py @@ -86,12 +86,13 @@ def __init__(self, framework_specific_info): logger.warning("Dynamic approach doesn't support QDQ format.") # get quantization config file according to backend + config_file = None if self.backend == 'CPUExecutionProvider': config_file = 'onnxrt.yaml' elif self.backend == 'TensorrtExecutionProvider': config_file = 'onnxrt_trt.yaml' elif self.backend == 'CUDAExecutionProvider': - config_file == 'onnxrt_cuda.yaml' + config_file = 'onnxrt_cuda.yaml' else: # pragma: no cover assert False, "{} provider is not supported in current environment, " \ "supported providers: {}".format(self.backend, @@ -128,6 +129,8 @@ def __init__(self, framework_specific_info): for precision in self.query_handler.get_precisions(): if precision != 'fp32': + if self.device == 'cpu' and precision == 'fp16': + continue self.quantizable_op_types += \ self.query_handler.get_op_types_by_precision(precision=precision) @@ -930,6 +933,8 @@ def query_fw_capability(self, model): precisions = query.get_precisions() for precision in precisions: + if precision == 'fp16' and self.device == 'cpu': + continue # get supported optype for target precision optypes = query.get_op_types_by_precision(precision) if \ query.get_op_types_by_precision(precision) != ['*'] else \ @@ -1046,7 +1051,7 @@ def query_fw_capability(self, model): else: # pragma: no cover op_wise.update( {(node.name, node.op_type): copy.deepcopy(optype_wise[node.op_type])}) - + return {'optypewise': optype_wise, 'opwise': op_wise} def _optypewise_filter_for_qdq(self, optype_wise): @@ -1411,12 +1416,17 @@ def _compare(version1, version2): config['capabilities'] = {} # generate other config content including precisions and ops - precisions = [key for key in config['capabilities'].keys()] + precisions = list(version_config.keys() - {'version', 'recipes'}) if 'fp32' not in precisions: precisions.append('fp32') config['precisions'] = {'names': ','.join(precisions)} op_types = {} + for precision in precisions: + if precision in config['capabilities']: + op_types[precision] = [op_type for op_type in config['capabilities'][precision].keys()] + elif precision in version_config: + op_types[precision] = version_config[precision] for precision, precision_config in config['capabilities'].items(): op_types[precision] = [op_type for op_type in precision_config.keys()] if 'fp32' not in op_types: @@ -1485,4 +1495,4 @@ def get_fallback_list(self): def get_specific_cfg_version(self): """Get version of the specific config.""" - return self.config_version \ No newline at end of file + return self.config_version diff --git a/neural_compressor/adaptor/onnxrt_cuda.yaml b/neural_compressor/adaptor/onnxrt_cuda.yaml index d411c0530ec..c58ac012920 100644 --- a/neural_compressor/adaptor/onnxrt_cuda.yaml +++ b/neural_compressor/adaptor/onnxrt_cuda.yaml @@ -97,6 +97,11 @@ 'LSTM': *default_dynamic, } } + fp16: &common_fp16 
['Concat', 'Gather', 'Reshape', 'Squeeze', 'Transpose', 'Unsqueeze', + 'EmbedLayerNormalization', 'Attention', 'Split', 'Sigmoid', 'Relu', 'Mul', 'Pad', 'MaxPool', + 'MatMul', 'LeakyRelu', 'GlobalAveragePool', 'Gemm', 'Conv', 'AveragePool', 'Add', 'Clip'] + bf16: &common_bf16 ['Concat', 'Gather', 'Reshape', 'Squeeze', 'Transpose', 'Unsqueeze', + 'Split', 'Sigmoid', 'Relu', 'Mul', 'MatMul', 'Gemm', 'Add'] recipes: &default_optimization graph_optimization: # from onnxruntime graph_optimization_level level: ['DISABLE_ALL', 'ENABLE_BASIC', 'ENABLE_EXTENDED', 'ENABLE_ALL'] @@ -137,6 +142,8 @@ }, 'dynamic': *ref_1_6_dynamic } + fp16: *common_fp16 + bf16: *common_bf16 recipes: <<: *default_optimization @@ -204,6 +211,8 @@ 'LSTM': *default_dynamic, } } + fp16: *common_fp16 + bf16: *common_bf16 recipes: <<: *default_optimization @@ -278,6 +287,8 @@ 'LSTM': *default_dynamic, } } + fp16: *common_fp16 + bf16: *common_bf16 recipes: <<: *default_optimization @@ -332,6 +343,8 @@ }, 'dynamic': *ref_1_9_dynamic } + fp16: *common_fp16 + bf16: *common_bf16 recipes: <<: *default_optimization @@ -393,6 +406,8 @@ }, 'dynamic': *ref_1_9_dynamic } + fp16: *common_fp16 + bf16: *common_bf16 recipes: <<: *default_optimization @@ -400,6 +415,8 @@ version: name: '1.12.0' int8: *ref_1_11 + fp16: *common_fp16 + bf16: *common_bf16 recipes: <<: *default_optimization @@ -407,5 +424,7 @@ version: name: 'default' int8: *ref_1_6 + fp16: *common_fp16 + bf16: *common_bf16 recipes: - <<: *default_optimization \ No newline at end of file + <<: *default_optimization diff --git a/neural_compressor/adaptor/ox_utils/calibration.py b/neural_compressor/adaptor/ox_utils/calibration.py index 0e6654c6cd2..4b9e302fd02 100644 --- a/neural_compressor/adaptor/ox_utils/calibration.py +++ b/neural_compressor/adaptor/ox_utils/calibration.py @@ -426,7 +426,8 @@ def calculate_quantization_params(self, q_config, quantization_thresholds): qType = 2 # uint8 if tensor_name in output_name_to_nodes: parent = output_name_to_nodes[tensor_name] - if parent and parent.name in q_config and q_config[parent.name] not in ['fp32']: + if parent and parent.name in q_config and \ + q_config[parent.name] not in ['fp32', 'fp16']: scheme = q_config[parent.name]['activation']['scheme'] qType = q_config[parent.name]['activation']['dtype'] elif self.backend in ['TensorrtExecutionProvider']: diff --git a/neural_compressor/adaptor/ox_utils/operators/direct_q8.py b/neural_compressor/adaptor/ox_utils/operators/direct_q8.py index f11c019d66a..9e9a9144f7b 100644 --- a/neural_compressor/adaptor/ox_utils/operators/direct_q8.py +++ b/neural_compressor/adaptor/ox_utils/operators/direct_q8.py @@ -81,25 +81,10 @@ def cast(self): # pragma: no cover return self.quantizer.dtype_cast(self.node, self.dtype) -@op_registry(op_types="Shape, Loop, Slice") -class DirectCastOperator(Operator): # pragma: no cover - """Direct8bit Operator Cast.""" - - def __init__(self, onnx_quantizer, onnx_node): - """Initialization.""" - super(DirectCastOperator, self).__init__(onnx_quantizer, onnx_node) - - def cast(self): - """Cast node.""" - node = self.node - if node.input[0] not in [i.tensor_name for i in self.quantizer.new_value_info.values()]: - return - self.quantizer.dtype_cast(self.node, self.dtype) - @qop_registry(op_types="Reshape, Transpose, Squeeze, Unsqueeze") class QDirectOperator(QOperator): """QDirect Operator.""" def __init__(self, onnx_node, children, initializers): """Initialization.""" - super().__init__(onnx_node, children, initializers) \ No newline at end of file + 
super().__init__(onnx_node, children, initializers) diff --git a/neural_compressor/adaptor/ox_utils/operators/ops.py b/neural_compressor/adaptor/ox_utils/operators/ops.py index 2cbb1046d5b..29d350ed071 100644 --- a/neural_compressor/adaptor/ox_utils/operators/ops.py +++ b/neural_compressor/adaptor/ox_utils/operators/ops.py @@ -70,7 +70,7 @@ def __init__(self, onnx_quantizer, onnx_node): self.activation_dtype = None self.activation_scheme = 'asym' if self.node.name in self.quantizer.config: - if self.quantizer.config[self.node.name] != 'fp32': + if self.quantizer.config[self.node.name] not in self.quantizer.fallback_list: if 'weight' in self.quantizer.config[self.node.name].keys(): self.per_channel = self.quantizer.config[self.node.name]\ ['weight']['granularity'] == 'per_channel' @@ -162,4 +162,4 @@ def convert(self): node.op_type, inputs, outputs, node.name + '_convert', **kwargs) add_nodes.append(new_node) - return True, add_nodes, inits \ No newline at end of file + return True, add_nodes, inits diff --git a/neural_compressor/adaptor/ox_utils/quantizer.py b/neural_compressor/adaptor/ox_utils/quantizer.py index 085b9c636cf..fbc7c57d0be 100644 --- a/neural_compressor/adaptor/ox_utils/quantizer.py +++ b/neural_compressor/adaptor/ox_utils/quantizer.py @@ -320,7 +320,7 @@ def dfs(match_nodes, node, pattern): if len(outs) > 0: output_dtype = str(self.new_value_info[outs[0]].new_dtype) break - if len(outs) == 0 or all([not self.should_convert(i) for i in children]): + if len(outs) == 0 or all([not self.should_cast(i) for i in children]): return if input_dtype == str(match_nodes[1].attribute[0].i) and \ output_dtype == str(match_nodes[0].attribute[0].i) and \ @@ -355,17 +355,13 @@ def dfs(match_nodes, node, pattern): def dtype_cast(self, node, cfg, keep_io_types=True): # pragma: no cover """Cast node dtype.""" - min_positive_val = 1e-7 - max_finite_val = 1e4 for idx, tensor_name in enumerate(node.input): initializer = find_by_name(tensor_name, self.model.initializer()) if initializer is not None: if initializer.data_type != onnx_proto.TensorProto.FLOAT: continue - new_tensor = cast_tensor(initializer, cfg) - if new_tensor: - self.model.remove_initializer(initializer) - self.model.add_initializer(new_tensor) + do_cast = cast_tensor(initializer, cfg) + if do_cast: self.new_value_info[tensor_name] = ValueInfo(tensor_name, TensorProto.FLOAT, dtype_mapping[cfg]) else: diff --git a/neural_compressor/adaptor/ox_utils/util.py b/neural_compressor/adaptor/ox_utils/util.py index 06a4e8b924a..d99746a5b76 100644 --- a/neural_compressor/adaptor/ox_utils/util.py +++ b/neural_compressor/adaptor/ox_utils/util.py @@ -33,10 +33,16 @@ ms_domain = "com.microsoft" support_pair = { + 'float32 bfloat16': True, + '1 16': True, + 'bfloat16 float32': True, + '16 1': True, 'uint8 uint8': True, '2 2': True, 'float16 float16': True, '10 10': True, + 'bfloat16 bfloat16': True, + '16 16': True, 'float32 float16': True, '1 10': True, 'float16 float32': True, @@ -59,6 +65,7 @@ 'uint64': 13, 'complex64': 14, 'complex128': 15, + 'bf16': 16 } PROVIDERS = { @@ -135,6 +142,26 @@ def split_shared_bias(model): node.input[2] = new_input_name return model +def float_to_float16(tensor): + """Convert float to float16.""" + min_val = 5.96e-08 + max_val = 65504.0 + tensor[(tensor > max_val) & (tensor < float('inf'))] = max_val + tensor[(tensor < min_val) & (tensor > 0)] = min_val + tensor[(tensor > -min_val) & (tensor < 0)] = -min_val + tensor[(tensor < -max_val) & (tensor > float('-inf'))] = -max_val + return np.float16(tensor) + +def 
float_to_bfloat16(tensor): + """Convert float to bfloat16.""" + min_val = 9.2e-41 + max_val = 3.38953139e38 + tensor[(tensor > max_val) & (tensor < float('inf'))] = max_val + tensor[(tensor < min_val) & (tensor > 0)] = min_val + tensor[(tensor > -min_val) & (tensor < 0)] = -min_val + tensor[(tensor < -max_val) & (tensor > float('-inf'))] = -max_val + return tensor + def cast_tensor(tensor, dtype): # pragma: no cover """Convert tensor float to target dtype. @@ -146,14 +173,19 @@ def cast_tensor(tensor, dtype): # pragma: no cover raise ValueError('Expected input type is an ONNX TensorProto but got %s' % type(tensor)) if tensor.data_type == onnx_proto.TensorProto.FLOAT: - new_tensor = helper.make_tensor( - name=tensor.name, - data_type=dtype_mapping[dtype], - dims=numpy_helper.to_array(tensor).shape, - vals=numpy_helper.to_array(tensor) - ) - return new_tensor - return None + val = numpy_helper.to_array(tensor).copy() + if dtype == 'fp16': + new_val = float_to_float16(val) + elif dtype == 'bf16': + new_val = float_to_bfloat16(val) + else: + raise ValueError('Expect fp16 or bf16 but get {}.'.format(dtype)) + tensor.float_data[:] = [] + tensor.int32_data[:] = [] + tensor.raw_data = new_val.tostring() + tensor.data_type = dtype_mapping[dtype] + return True + return False def remove_init_from_model_input(model): """Remove initializer from model input.""" diff --git a/neural_compressor/conf/config.py b/neural_compressor/conf/config.py index aad406fc37a..8600ebbf883 100644 --- a/neural_compressor/conf/config.py +++ b/neural_compressor/conf/config.py @@ -252,7 +252,7 @@ def percent_to_float(data): lambda s: all(i in ['asym', 'sym', 'asym_float'] for i in s)), Optional('dtype'): And( list, - lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16'] for i in s)), + lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16'] for i in s)), Optional('algorithm'): And( list, lambda s: all(i in ['minmax'] for i in s)), @@ -270,7 +270,7 @@ def percent_to_float(data): lambda s: all(i in ['asym', 'sym'] for i in s)), Optional('dtype'): And( list, - lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16'] for i in s)), + lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16'] for i in s)), # compute_dtypeis only for PyTorch framework Optional('compute_dtype', default=['uint8']): And( list, @@ -294,13 +294,13 @@ def percent_to_float(data): Optional('dtype', default=None): And( Or(str, list), Use(input_to_list), - lambda s: all(i in ['fp32', 'bf16'] for i in s)), + lambda s: all(i in ['fp32', 'bf16', 'fp16'] for i in s)), }, Optional('activation', default=None): { Optional('dtype', default=None): And( Or(str, list), Use(input_to_list), - lambda s: all(i in ['fp32', 'bf16'] for i in s)), + lambda s: all(i in ['fp32', 'bf16', 'fp16'] for i in s)), } } }) @@ -310,20 +310,20 @@ def percent_to_float(data): Optional('precisions', default={'precisions': ['fp32']}): And( Or(str, list), Use(input_to_list), - lambda s: all(i in [ 'fp32', 'bf16'] for i in s)), + lambda s: all(i in [ 'fp32', 'bf16', 'fp16'] for i in s)), Optional('op_wise', default={'weight': {}, 'activation': {}}): { Optional('weight', default=None): { Optional('dtype', default=None): And( Or(str, list), Use(input_to_list), - lambda s: all(i in ['fp32', 'bf16'] for i in s)), + lambda s: all(i in ['fp32', 'bf16', 'fp16'] for i in s)), }, Optional('activation', default=None): { Optional('dtype', default=None): And( Or(str, list), Use(input_to_list), - lambda s: all(i in ['fp32', 'bf16'] for i in s)), + lambda s: all(i in ['fp32', 'bf16', 'fp16'] for i in s)), } 
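# Illustrative, standalone sketch of the saturation behaviour implemented by the
# float_to_float16 helper added to ox_utils/util.py above: float32 values outside
# the finite fp16 range are clamped before the cast, so representable data never
# overflows to inf or underflows to 0. The sample values below are made up.
import numpy as np
from neural_compressor.adaptor.ox_utils.util import float_to_float16

vals = np.array([1e-9, 0.5, 7.0e4], dtype=np.float32)
print(float_to_float16(vals))  # roughly [5.96e-08, 0.5, 65504.0] as float16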
} }) @@ -858,7 +858,7 @@ def percent_to_float(data): Optional('dtype', default=None): And( Or(str, list), Use(input_to_list), - lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16'] for i in s)), + lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16'] for i in s)), Optional('algorithm', default=None): And( Or(str, list), Use(input_to_list), @@ -881,7 +881,7 @@ def percent_to_float(data): Optional('dtype', default=None): And( Or(str, list), Use(input_to_list), - lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16'] for i in s)), + lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16'] for i in s)), # compute_dtypeis only for PyTorch framework Optional('compute_dtype', default=['uint8']): And( Or(str, list), diff --git a/neural_compressor/conf/pythonic_config.py b/neural_compressor/conf/pythonic_config.py index cf1bcc72074..8945cbe57ed 100644 --- a/neural_compressor/conf/pythonic_config.py +++ b/neural_compressor/conf/pythonic_config.py @@ -164,7 +164,7 @@ def precisions(self): def precisions(self, precisions): if not isinstance(precisions, list): precisions = [precisions] - if check_value('precisions', precisions, str, ['int8', 'uint8', 'fp32', 'bf16']): + if check_value('precisions', precisions, str, ['int8', 'uint8', 'fp32', 'bf16', 'fp16']): self._precisions = precisions class ONNX(MXNet): diff --git a/neural_compressor/config.py b/neural_compressor/config.py index 33d0f72a342..f52ece7dbdf 100644 --- a/neural_compressor/config.py +++ b/neural_compressor/config.py @@ -43,7 +43,7 @@ lambda s: all(i in ['asym', 'sym', 'asym_float'] for i in s)), Optional('dtype'): And( list, - lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16'] for i in s)), + lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16'] for i in s)), Optional('algorithm'): And( list, lambda s: all(i in ['minmax'] for i in s))}, @@ -56,7 +56,7 @@ lambda s: all(i in ['asym', 'sym'] for i in s)), Optional('dtype'): And( list, - lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'None'] for i in s)), + lambda s: all(i in ['int8', 'uint8', 'fp32', 'bf16', 'fp16', 'None'] for i in s)), Optional('algorithm'): And( list, lambda s: all(i in ['minmax', 'kl', 'placeholder'] for i in s))}}) @@ -591,7 +591,7 @@ def excluded_precisions(self): @excluded_precisions.setter def excluded_precisions(self, excluded_precisions): - if check_value("excluded_precisions", excluded_precisions, str, ["bf16"]): + if check_value("excluded_precisions", excluded_precisions, str, ["bf16", "fp16"]): self._excluded_precisions = excluded_precisions self._use_bf16 = "bf16" not in excluded_precisions diff --git a/neural_compressor/contrib/strategy/sigopt.py b/neural_compressor/contrib/strategy/sigopt.py index fde298f27f1..500db31d64e 100644 --- a/neural_compressor/contrib/strategy/sigopt.py +++ b/neural_compressor/contrib/strategy/sigopt.py @@ -224,7 +224,7 @@ def create_exp(self, acc_target): calib_sampling_size_lst = tuning_space.root_item.get_option_by_name('calib_sampling_size').options # step1. 
collect the ops that support static and dynamic quant_mode_wise_items = OrderedDict() - query_order = ['static', 'dynamic', 'bf16', 'fp32'] + query_order = ['static', 'dynamic', 'bf16', 'fp16', 'fp32'] pre_items = set() for quant_mode in query_order: items = tuning_space.query_items_by_quant_mode(quant_mode) diff --git a/neural_compressor/contrib/strategy/tpe.py b/neural_compressor/contrib/strategy/tpe.py index 8d8a14fcbc2..25bde710ed9 100644 --- a/neural_compressor/contrib/strategy/tpe.py +++ b/neural_compressor/contrib/strategy/tpe.py @@ -197,7 +197,7 @@ def traverse(self): calib_sampling_size_lst = tuning_space.root_item.get_option_by_name('calib_sampling_size').options # step1. collect the ops that support static and dynamic quant_mode_wise_items = OrderedDict() - query_order = ['static', 'dynamic', 'bf16', 'fp32'] + query_order = ['static', 'dynamic', 'bf16', 'fp16', 'fp32'] pre_items = set() for quant_mode in query_order: items = tuning_space.query_items_by_quant_mode(quant_mode) diff --git a/neural_compressor/data/datasets/dataset.py b/neural_compressor/data/datasets/dataset.py index d7e66064840..cc4851f3297 100644 --- a/neural_compressor/data/datasets/dataset.py +++ b/neural_compressor/data/datasets/dataset.py @@ -143,7 +143,7 @@ def __getitem__(self, index): "pytorch_fx": PyTorchDatasets, "onnxrt_qdq": ONNXRTQLDatasets, "onnxrt_qlinearops": ONNXRTQLDatasets, - "onnxrt_qoperator": ONNXRTQLDatasets, + "onnxruntime": ONNXRTQLDatasets, "onnxrt_integerops": ONNXRTITDatasets, } @@ -164,14 +164,14 @@ class Datasets(object): # pragma: no cover Args: framework (str): framework name, like:"tensorflow", "tensorflow_itex", "mxnet", "onnxrt_qdq", "onnxrt_qlinearops", "onnxrt_integerops", - "pytorch", "pytorch_ipex", "pytorch_fx", "onnxrt_qoperator". + "pytorch", "pytorch_ipex", "pytorch_fx", "onnxruntime". 
""" def __init__(self, framework): """Initialize the attributes of class.""" assert framework in ["tensorflow", "tensorflow_itex", \ "mxnet", "onnxrt_qdq", "onnxrt_qlinearops", "onnxrt_integerops", \ - "pytorch", "pytorch_ipex", "pytorch_fx", "onnxrt_qoperator"], \ + "pytorch", "pytorch_ipex", "pytorch_fx", "onnxruntime"], \ "framework support tensorflow pytorch mxnet onnxrt" self.datasets = framework_datasets[framework]().datasets @@ -203,7 +203,7 @@ def __getitem__(self, dataset_type): "pytorch_fx": PYTORCHFX_DATASETS, "onnxrt_integerops": ONNXRTIT_DATASETS, "onnxrt_qdq": ONNXRTQL_DATASETS, - "onnxrt_qoperator": ONNXRTQL_DATASETS, + "onnxruntime": ONNXRTQL_DATASETS, "onnxrt_qlinearops": ONNXRTQL_DATASETS, } @@ -232,7 +232,7 @@ def decorator_dataset(cls): "onnxrt_qlinearops", "onnxrt_integerops", "onnxrt_qdq", - "onnxrt_qoperator", + "onnxruntime", ], "The framework support tensorflow mxnet pytorch onnxrt" dataset_name = dataset_type + dataset_format if dataset_name in registry_datasets[single_framework].keys(): diff --git a/neural_compressor/data/filters/filter.py b/neural_compressor/data/filters/filter.py index 7abda00e054..137baa667d7 100644 --- a/neural_compressor/data/filters/filter.py +++ b/neural_compressor/data/filters/filter.py @@ -86,7 +86,7 @@ def __init__(self): "mxnet": MXNetFilters, "onnxrt_qlinearops": ONNXRTQLFilters, "onnxrt_qdq": ONNXRTQLFilters, - "onnxrt_qoperator": ONNXRTQLFilters, + "onnxruntime": ONNXRTQLFilters, "onnxrt_integerops": ONNXRTITFilters, } @@ -98,7 +98,7 @@ def __init__(self): "mxnet": MXNET_FILTERS, "onnxrt_integerops": ONNXRT_IT_FILTERS, "onnxrt_qdq": ONNXRT_QL_FILTERS, - "onnxrt_qoperator": ONNXRT_QL_FILTERS, + "onnxruntime": ONNXRT_QL_FILTERS, "onnxrt_qlinearops": ONNXRT_QL_FILTERS} @@ -109,14 +109,14 @@ class FILTERS(object): # pragma: no cover framework (str): frameworks in ["tensorflow", "tensorflow_itex", "mxnet", "onnxrt_qdq", "pytorch", "pytorch_ipex", "pytorch_fx", "onnxrt_integerops", - "onnxrt_qlinearops", "onnxrt_qoperator"]. + "onnxrt_qlinearops", "onnxruntime"]. 
""" def __init__(self, framework): """Initialize the attribute of class.""" assert framework in ["tensorflow", "tensorflow_itex", "mxnet", "onnxrt_qdq", "pytorch", "pytorch_ipex", "pytorch_fx", - "onnxrt_integerops", "onnxrt_qlinearops", "onnxrt_qoperator"], \ + "onnxrt_integerops", "onnxrt_qlinearops", "onnxruntime"], \ "framework support tensorflow pytorch mxnet onnxrt" self.filters = framework_filters[framework]().filters self.framework = framework @@ -155,7 +155,7 @@ def decorator_transform(cls): "onnxrt_integerops", "onnxrt_qdq", "onnxrt_qlinearops", - "onnxrt_qoperator" + "onnxruntime" ], "The framework support tensorflow mxnet pytorch onnxrt" if filter_type in registry_filters[single_framework].keys(): raise ValueError('Cannot have two transforms with the same name') diff --git a/neural_compressor/data/transforms/transform.py b/neural_compressor/data/transforms/transform.py index 5df77fa690a..972839e9e9a 100644 --- a/neural_compressor/data/transforms/transform.py +++ b/neural_compressor/data/transforms/transform.py @@ -279,7 +279,7 @@ def _get_general(self): "pytorch_fx": PyTorchTransforms, "onnxrt_qlinearops": ONNXRTQLTransforms, "onnxrt_integerops": ONNXRTITTransforms, - "onnxrt_qoperator": ONNXRTQLTransforms, + "onnxruntime": ONNXRTQLTransforms, "onnxrt_qdq": ONNXRTQLTransforms} # transform registry will register transforms into these dicts @@ -298,7 +298,7 @@ def _get_general(self): "pytorch_fx": PYTORCH_TRANSFORMS, "onnxrt_qlinearops": ONNXRT_QL_TRANSFORMS, "onnxrt_qdq": ONNXRT_QL_TRANSFORMS, - "onnxrt_qoperator": ONNXRT_QL_TRANSFORMS, + "onnxruntime": ONNXRT_QL_TRANSFORMS, "onnxrt_integerops": ONNXRT_IT_TRANSFORMS, } @@ -316,7 +316,7 @@ def __init__(self, framework, process): framework (str): different framework type like tensorflow, pytorch and so on process (str): process type, the value can be preprocess, postprocess or general """ - assert framework in ("tensorflow", "tensorflow_itex", "onnxrt_qoperator", \ + assert framework in ("tensorflow", "tensorflow_itex", "onnxruntime", \ "pytorch", "pytorch_ipex", "pytorch_fx", "onnxrt_qdq", \ "onnxrt_qlinearops", "onnxrt_integerops", "mxnet"), \ "framework support tensorflow pytorch mxnet onnxrt" @@ -375,7 +375,7 @@ def decorator_transform(cls): "onnxrt_qlinearops", "onnxrt_qdq", "onnxrt_integerops", - "onnxrt_qoperator", + "onnxruntime", ], "The framework support tensorflow mxnet pytorch onnxrt" if transform_type in registry_transforms[single_framework][process].keys(): raise ValueError('Cannot have two transforms with the same name') diff --git a/neural_compressor/experimental/data/dataloaders/dataloader.py b/neural_compressor/experimental/data/dataloaders/dataloader.py index c879b0b45d1..5ae5424b9d7 100644 --- a/neural_compressor/experimental/data/dataloaders/dataloader.py +++ b/neural_compressor/experimental/data/dataloaders/dataloader.py @@ -33,6 +33,6 @@ "onnxrt_qlinearops": ONNXRTDataLoader, "onnxrt_integerops": ONNXRTDataLoader, "onnxrt_qdq": ONNXRTDataLoader, - "onnxrt_qoperator": ONNXRTDataLoader, + "onnxruntime": ONNXRTDataLoader, } diff --git a/neural_compressor/experimental/data/datasets/dataset.py b/neural_compressor/experimental/data/datasets/dataset.py index b591ebb074f..589f5ee2b67 100644 --- a/neural_compressor/experimental/data/datasets/dataset.py +++ b/neural_compressor/experimental/data/datasets/dataset.py @@ -143,7 +143,7 @@ def __getitem__(self, index): "pytorch_fx": PyTorchDatasets, "onnxrt_qdq": ONNXRTQLDatasets, "onnxrt_qlinearops": ONNXRTQLDatasets, - "onnxrt_qoperator": ONNXRTQLDatasets, + 
"onnxruntime": ONNXRTQLDatasets, "onnxrt_integerops": ONNXRTITDatasets, } @@ -164,14 +164,14 @@ class Datasets(object): Args: framework (str): framework name, like:"tensorflow", "tensorflow_itex", "mxnet", "onnxrt_qdq", "onnxrt_qlinearops", "onnxrt_integerops", - "pytorch", "pytorch_ipex", "pytorch_fx", "onnxrt_qoperator". + "pytorch", "pytorch_ipex", "pytorch_fx", "onnxruntime". """ def __init__(self, framework): """Initialize the attributes of class.""" assert framework in ["tensorflow", "tensorflow_itex", \ "mxnet", "onnxrt_qdq", "onnxrt_qlinearops", "onnxrt_integerops", \ - "pytorch", "pytorch_ipex", "pytorch_fx", "onnxrt_qoperator"], \ + "pytorch", "pytorch_ipex", "pytorch_fx", "onnxruntime"], \ "framework support tensorflow pytorch mxnet onnxrt" self.datasets = framework_datasets[framework]().datasets @@ -203,7 +203,7 @@ def __getitem__(self, dataset_type): "pytorch_fx": PYTORCHFX_DATASETS, "onnxrt_integerops": ONNXRTIT_DATASETS, "onnxrt_qdq": ONNXRTQL_DATASETS, - "onnxrt_qoperator": ONNXRTQL_DATASETS, + "onnxruntime": ONNXRTQL_DATASETS, "onnxrt_qlinearops": ONNXRTQL_DATASETS, } @@ -232,7 +232,7 @@ def decorator_dataset(cls): "onnxrt_qlinearops", "onnxrt_integerops", "onnxrt_qdq", - "onnxrt_qoperator", + "onnxruntime", ], "The framework support tensorflow mxnet pytorch onnxrt" dataset_name = dataset_type + dataset_format if dataset_name in registry_datasets[single_framework].keys(): diff --git a/neural_compressor/experimental/data/filters/filter.py b/neural_compressor/experimental/data/filters/filter.py index b2e0ad25225..0f30839c394 100644 --- a/neural_compressor/experimental/data/filters/filter.py +++ b/neural_compressor/experimental/data/filters/filter.py @@ -86,7 +86,7 @@ def __init__(self): "mxnet": MXNetFilters, "onnxrt_qlinearops": ONNXRTQLFilters, "onnxrt_qdq": ONNXRTQLFilters, - "onnxrt_qoperator": ONNXRTQLFilters, + "onnxruntime": ONNXRTQLFilters, "onnxrt_integerops": ONNXRTITFilters, } @@ -98,7 +98,7 @@ def __init__(self): "mxnet": MXNET_FILTERS, "onnxrt_integerops": ONNXRT_IT_FILTERS, "onnxrt_qdq": ONNXRT_QL_FILTERS, - "onnxrt_qoperator": ONNXRT_QL_FILTERS, + "onnxruntime": ONNXRT_QL_FILTERS, "onnxrt_qlinearops": ONNXRT_QL_FILTERS} @@ -109,14 +109,14 @@ class FILTERS(object): framework (str): frameworks in ["tensorflow", "tensorflow_itex", "mxnet", "onnxrt_qdq", "pytorch", "pytorch_ipex", "pytorch_fx", "onnxrt_integerops", - "onnxrt_qlinearops", "onnxrt_qoperator"]. + "onnxrt_qlinearops", "onnxruntime"]. 
""" def __init__(self, framework): """Initialize the attribute of class.""" assert framework in ["tensorflow", "tensorflow_itex", "mxnet", "onnxrt_qdq", "pytorch", "pytorch_ipex", "pytorch_fx", - "onnxrt_integerops", "onnxrt_qlinearops", "onnxrt_qoperator"], \ + "onnxrt_integerops", "onnxrt_qlinearops", "onnxruntime"], \ "framework support tensorflow pytorch mxnet onnxrt" self.filters = framework_filters[framework]().filters self.framework = framework @@ -155,7 +155,7 @@ def decorator_transform(cls): "onnxrt_integerops", "onnxrt_qdq", "onnxrt_qlinearops", - "onnxrt_qoperator" + "onnxruntime" ], "The framework support tensorflow mxnet pytorch onnxrt" if filter_type in registry_filters[single_framework].keys(): raise ValueError('Cannot have two transforms with the same name') diff --git a/neural_compressor/experimental/data/transforms/transform.py b/neural_compressor/experimental/data/transforms/transform.py index 5df77fa690a..972839e9e9a 100644 --- a/neural_compressor/experimental/data/transforms/transform.py +++ b/neural_compressor/experimental/data/transforms/transform.py @@ -279,7 +279,7 @@ def _get_general(self): "pytorch_fx": PyTorchTransforms, "onnxrt_qlinearops": ONNXRTQLTransforms, "onnxrt_integerops": ONNXRTITTransforms, - "onnxrt_qoperator": ONNXRTQLTransforms, + "onnxruntime": ONNXRTQLTransforms, "onnxrt_qdq": ONNXRTQLTransforms} # transform registry will register transforms into these dicts @@ -298,7 +298,7 @@ def _get_general(self): "pytorch_fx": PYTORCH_TRANSFORMS, "onnxrt_qlinearops": ONNXRT_QL_TRANSFORMS, "onnxrt_qdq": ONNXRT_QL_TRANSFORMS, - "onnxrt_qoperator": ONNXRT_QL_TRANSFORMS, + "onnxruntime": ONNXRT_QL_TRANSFORMS, "onnxrt_integerops": ONNXRT_IT_TRANSFORMS, } @@ -316,7 +316,7 @@ def __init__(self, framework, process): framework (str): different framework type like tensorflow, pytorch and so on process (str): process type, the value can be preprocess, postprocess or general """ - assert framework in ("tensorflow", "tensorflow_itex", "onnxrt_qoperator", \ + assert framework in ("tensorflow", "tensorflow_itex", "onnxruntime", \ "pytorch", "pytorch_ipex", "pytorch_fx", "onnxrt_qdq", \ "onnxrt_qlinearops", "onnxrt_integerops", "mxnet"), \ "framework support tensorflow pytorch mxnet onnxrt" @@ -375,7 +375,7 @@ def decorator_transform(cls): "onnxrt_qlinearops", "onnxrt_qdq", "onnxrt_integerops", - "onnxrt_qoperator", + "onnxruntime", ], "The framework support tensorflow mxnet pytorch onnxrt" if transform_type in registry_transforms[single_framework][process].keys(): raise ValueError('Cannot have two transforms with the same name') diff --git a/neural_compressor/experimental/metric/metric.py b/neural_compressor/experimental/metric/metric.py index 31f0550b071..7915daddf63 100644 --- a/neural_compressor/experimental/metric/metric.py +++ b/neural_compressor/experimental/metric/metric.py @@ -121,7 +121,7 @@ def __init__(self) -> None: "onnxrt_qlinearops": ONNXRTQLMetrics, "onnxrt_integerops": ONNXRTITMetrics, "onnxrt_qdq": ONNXRTQLMetrics, - "onnxrt_qoperator": ONNXRTQLMetrics} + "onnxruntime": ONNXRTQLMetrics} # user/model specific metrics will be registered here TENSORFLOW_METRICS = {} @@ -141,7 +141,7 @@ def __init__(self) -> None: "onnxrt_qlinearops": ONNXRT_QL_METRICS, "onnxrt_qdq": ONNXRT_QL_METRICS, "onnxrt_integerops": ONNXRT_IT_METRICS, - "onnxrt_qoperator": ONNXRT_QL_METRICS, + "onnxruntime": ONNXRT_QL_METRICS, } @@ -161,7 +161,7 @@ def __init__(self, framework: str): assert framework in ("tensorflow", "tensorflow_itex","keras", "pytorch", "pytorch_ipex", "pytorch_fx", 
"onnxrt_qdq", "onnxrt_qlinearops", "onnxrt_integerops", "mxnet", - "onnxrt_qoperator"), \ + "onnxruntime"), \ "framework support tensorflow pytorch mxnet onnxrt" self.metrics = framework_metrics[framework]().metrics @@ -212,7 +212,7 @@ def decorator_metric(cls): "onnxrt_qlinearops", "onnxrt_integerops", "onnxrt_qdq", - "onnxrt_qoperator", + "onnxruntime", "pytorch", "pytorch_ipex", "pytorch_fx", diff --git a/neural_compressor/experimental/mixed_precision.py b/neural_compressor/experimental/mixed_precision.py index e3abe923d5e..438f2e749bb 100644 --- a/neural_compressor/experimental/mixed_precision.py +++ b/neural_compressor/experimental/mixed_precision.py @@ -102,7 +102,7 @@ def __call__(self): assert isinstance(self._model, BaseModel), 'need set your Model for mixed precision....' if 'onnx' in self.framework and 'bf16' in self._precisions: logger.warning("Mixed precision doesn't support bf16 for ONNX models.") - sys.exit(0) + self._precisions.remove('bf16') if 'bf16' in self._precisions and not CpuInfo().bf16: # pragma: no cover if os.getenv('FORCE_BF16') == '1': @@ -111,7 +111,19 @@ def __call__(self): else: logger.warning("Mixed precision exits due to the hardware " \ "doesn't support bf16 instruction.") - sys.exit(0) + self._precisions.remove('bf16') + + if 'fp16' in self._precisions and 'gpu' not in self.conf.usr_cfg.device: + if os.getenv('FORCE_FP16') == '1': + logger.warning("Mixed precision will generate fp16 graph although " \ + "the hardware doesn't support fp16 instruction.") + else: + logger.warning("Mixed precision exits due to the hardware " \ + "doesn't support fp16 instruction.") + self._precisions.remove('fp16') + + if self._precisions == ['fp32'] or len(self._precisions) == 0: + sys.exit(0) cfg = self.conf.usr_cfg if self.framework == 'tensorflow': diff --git a/neural_compressor/metric/metric.py b/neural_compressor/metric/metric.py index a4786e63ab8..1eeb604714a 100644 --- a/neural_compressor/metric/metric.py +++ b/neural_compressor/metric/metric.py @@ -140,7 +140,7 @@ def __init__(self) -> None: "onnxrt_qlinearops": ONNXRTQLMetrics, "onnxrt_integerops": ONNXRTITMetrics, "onnxrt_qdq": ONNXRTQLMetrics, - "onnxrt_qoperator": ONNXRTQLMetrics} + "onnxruntime": ONNXRTQLMetrics} # user/model specific metrics will be registered here TENSORFLOW_METRICS = {} @@ -159,7 +159,7 @@ def __init__(self) -> None: "onnxrt_qlinearops": ONNXRT_QL_METRICS, "onnxrt_qdq": ONNXRT_QL_METRICS, "onnxrt_integerops": ONNXRT_IT_METRICS, - "onnxrt_qoperator": ONNXRT_QL_METRICS, + "onnxruntime": ONNXRT_QL_METRICS, } @@ -179,7 +179,7 @@ def __init__(self, framework: str): assert framework in ("tensorflow", "tensorflow_itex", "pytorch", "pytorch_ipex", "pytorch_fx", "onnxrt_qdq", "onnxrt_qlinearops", "onnxrt_integerops", "mxnet", - "onnxrt_qoperator"), \ + "onnxruntime"), \ "framework support tensorflow pytorch mxnet onnxrt" self.metrics = framework_metrics[framework]().metrics @@ -230,7 +230,7 @@ def decorator_metric(cls): "onnxrt_qlinearops", "onnxrt_integerops", "onnxrt_qdq", - "onnxrt_qoperator", + "onnxruntime", "pytorch", "pytorch_ipex", "pytorch_fx", diff --git a/neural_compressor/mix_precision.py b/neural_compressor/mix_precision.py index 7b1c4da56fa..2b99359f990 100644 --- a/neural_compressor/mix_precision.py +++ b/neural_compressor/mix_precision.py @@ -70,9 +70,9 @@ def fit(model, config=None, eval_func=None, eval_dataloader=None, eval_metric=No converted_model = mix_precision.fit(model, config=conf) """ assert isinstance(config, MixedPrecisionConfig), "Please provide MixedPrecisionConfig!" 
- conf = Config(quantization=config) + conf = Config(quantization=config, benchmark=None, pruning=None, distillation=None, nas=None) converter = MixedPrecision(conf) - precisions = ["bf16", "fp32"] + precisions = ["bf16", "fp16", "fp32"] precisions = list(set(precisions) - set(config.excluded_precisions)) converter.precisions = precisions converter.model = model diff --git a/neural_compressor/strategy/auto_mixed_precision.py b/neural_compressor/strategy/auto_mixed_precision.py index ca0903ad71c..b654a015fe3 100644 --- a/neural_compressor/strategy/auto_mixed_precision.py +++ b/neural_compressor/strategy/auto_mixed_precision.py @@ -46,9 +46,9 @@ def next_tune_cfg(self): # filter quantization dtype # TODO align with the old mixed-precison - target_dtype = self.cfg.graph_optimization.precisions if self.cfg.graph_optimization \ + target_dtypes = self.cfg.graph_optimization.precisions if self.cfg.graph_optimization \ else self.cfg.mixed_precision.precisions - + target_dtypes = list(set(target_dtypes) - set(['fp32'])) tuning_space = self.tuning_space initial_op_tuning_cfg = {} for item in tuning_space.root_item.options: @@ -56,19 +56,24 @@ def next_tune_cfg(self): op_name, op_type = item.name initial_op_tuning_cfg[item.name] = OpTuningConfig(op_name, op_type, 'fp32', tuning_space) + if not target_dtypes: + target_dtypes = ['bf16'] # step1. target_dtype AMAP, collect the ops that support target_dtype - if not target_dtype: - target_dtype = 'bf16' - else: - target_dtype = target_dtype[0] - bf16_items = tuning_space.query_items_by_quant_mode(target_dtype) - bf16_items_name = [item.name for item in bf16_items] - op_tuning_cfg = deepcopy(initial_op_tuning_cfg) - for op_name_type in bf16_items_name: - op_tuning_cfg[op_name_type] = OpTuningConfig(op_name_type[0], op_name_type[1], target_dtype, tuning_space) - calib_sampling_size = 1 - op_tuning_cfg['calib_sampling_size'] = calib_sampling_size - yield op_tuning_cfg + bf16_items_name = [] + op_tuning_cfg = {} + for idx, target_dtype in enumerate(target_dtypes): + bf16_items = tuning_space.query_items_by_quant_mode(target_dtype) + if len(bf16_items) == 0 and \ + not (idx == len(target_dtypes) - 1 and len(bf16_items_name) == 0): + continue + bf16_items_name = [item.name for item in bf16_items] + op_tuning_cfg = deepcopy(initial_op_tuning_cfg) + for op_name_type in bf16_items_name: + op_tuning_cfg[op_name_type] = \ + OpTuningConfig(op_name_type[0], op_name_type[1], target_dtype, tuning_space) + calib_sampling_size = 1 + op_tuning_cfg['calib_sampling_size'] = calib_sampling_size + yield op_tuning_cfg # step2. 
fallback target_dtype = 'fp32' diff --git a/neural_compressor/utils/utility.py b/neural_compressor/utils/utility.py index 408e85a64d7..996756d90eb 100644 --- a/neural_compressor/utils/utility.py +++ b/neural_compressor/utils/utility.py @@ -50,7 +50,7 @@ 'pytorch_ipex': ['torch', 'intel_extension_for_pytorch'], 'onnxrt_qlinearops': ['onnx', 'onnxruntime'], 'onnxrt_integerops': ['onnx', 'onnxruntime'], - 'onnxrt_qoperator': ['onnx', 'onnxruntime'], + 'onnxruntime': ['onnx', 'onnxruntime'], 'mxnet': ['mxnet'], } diff --git a/test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py b/test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py index 65b1b133348..23d3fa21ded 100644 --- a/test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py +++ b/test/adaptor/onnxrt_adaptor/test_adaptor_onnxrt.py @@ -913,6 +913,20 @@ def test_adaptor(self): adaptor.quantize(tune_cfg, common.Model(self.gather_model), self.gather_dataloader) self.assertTrue(len(adaptor.quantizable_ops), 2) + framework_specific_info['device'] = 'gpu' + framework_specific_info['backend'] = 'onnxrt_cuda_ep' + + tune_cfg = {'calib_iteration': 1, + 'op': {('Matmul', 'MatMul'): {'activation': {'dtype': ['uint8'], 'quant_mode': 'static'}, + 'weight': {'dtype': ['int8']}}, + ('add', 'Add'): {'activation': {'dtype': 'fp16', 'quant_mode': 'static'}, + 'weight': {'dtype': 'fp16'}}, + ('add2', 'Add'): {'activation': {'dtype': 'fp16', 'quant_mode': 'static'}, + 'weight': {'dtype': 'fp16'}}}} + adaptor = FRAMEWORKS[framework](framework_specific_info) + model = adaptor.quantize(tune_cfg, common.Model(self.matmul_model), self.matmul_dataloader) + self.assertEqual(len([i for i in model.model.graph.node if i.op_type == 'Cast']), 2) + for fake_yaml in ["gather.yaml"]: quantizer = Quantization(fake_yaml) quantizer.model = self.gather_model diff --git a/test/mixed_precision/test_mixed_precision.py b/test/mixed_precision/test_mixed_precision.py index 26255e5a4ef..0054003a900 100644 --- a/test/mixed_precision/test_mixed_precision.py +++ b/test/mixed_precision/test_mixed_precision.py @@ -200,6 +200,32 @@ def build_pt_model(): resnet18 = LazyImport("torchvision.models.resnet18") return resnet18() + +def build_yaml(): + fake_yaml = """ + device: gpu + model: + name: test + framework: onnxrt_qlinearops + + mixed_precision: + precisions: fp16 + + evaluation: + accuracy: + metric: + MSE: + compare_label: False + dataloader: + dataset: + dummy: + shape: [[5,1,5,5], [5,1,5,1]] + label: True + """ + with open("test.yaml", "w", encoding="utf-8") as f: + f.write(fake_yaml) + + class MatmulDataset: def __init__(self): self.data = [] @@ -249,6 +275,12 @@ def test_on_non_enabled_dtype(self): output_model = mix_precision.fit(self.onnx_model, conf) self.assertEqual(cm.exception.code, 0) + conf = MixedPrecisionConfig(excluded_precisions=["fp16"]) + with self.assertRaises(SystemExit) as cm: + output_model = mix_precision.fit(self.tf_model, conf) + self.assertEqual(cm.exception.code, 0) + + class TestMixedPrecision(unittest.TestCase): @classmethod def setUpClass(self): @@ -258,6 +290,7 @@ def setUpClass(self): self.matmul_dataset = MatmulDataset() self.tf_model = build_tf_graph() self.pt_model = build_pt_model() + build_yaml() @classmethod def tearDownClass(self): @@ -265,6 +298,31 @@ def tearDownClass(self): del os.environ['FORCE_BF16'] shutil.rmtree("./saved", ignore_errors=True) shutil.rmtree("./nc_workspace", ignore_errors=True) + os.remove("test.yaml") + + def test_mixed_precision_with_evaluation(self): + from neural_compressor.experimental import common + from 
neural_compressor.experimental.metric.metric import ONNXRT_QL_METRICS + # test onnx + conf = MixedPrecisionConfig(device='gpu', backend='onnxrt_cuda_ep') + + #output_model = mix_precision.fit(self.onnx_model, conf) + #self.assertTrue(any([i.op_type == 'Cast' for i in output_model.nodes()])) + + tuning_criterion = TuningCriterion(max_trials=3, timeout=1000000) + conf = MixedPrecisionConfig(device='gpu', tuning_criterion=tuning_criterion, backend='onnxrt_cuda_ep') + output_model = mix_precision.fit(self.onnx_model, + conf, + eval_dataloader=common.DataLoader(self.matmul_dataset), + eval_metric=ONNXRT_QL_METRICS["MSE"]()) + self.assertTrue(any([i.op_type == 'Cast' for i in output_model.nodes()])) + + from neural_compressor.conf.config import MixedPrecision_Conf + from neural_compressor.experimental import MixedPrecision + converter = MixedPrecision(MixedPrecision_Conf('test.yaml')) + converter.model = self.onnx_model + output_model = converter.fit() + self.assertTrue(any([i.op_type != 'Cast' for i in output_model.nodes()])) def test_mixed_precision_with_eval_func(self): def eval(model): diff --git a/test/quantization/test_quantization.py b/test/quantization/test_quantization.py index f8723f3bc87..029a0bad469 100644 --- a/test/quantization/test_quantization.py +++ b/test/quantization/test_quantization.py @@ -234,7 +234,7 @@ def build_fake_strategy(): " for calib_sampling_size in calib_sampling_size_lst: \n", " # step1. collect the ops that support static and dynamic \n", " quant_mode_wise_items = OrderedDict() \n", - " query_order = ['static', 'dynamic', 'bf16', 'fp32'] \n", + " query_order = ['static', 'dynamic', 'bf16', 'fp16', 'fp32'] \n", " pre_items = set() \n", " for quant_mode in query_order: \n", " items = tuning_space.query_items_by_quant_mode(quant_mode) \n",