import logging
import tempfile
from pathlib import Path

from .calibrate import CalibrationDataReader, CalibrationMethod, TensorsData, create_calibrator
from .onnx_quantizer import ONNXQuantizer
from .qdq_quantizer import QDQQuantizer
from .quant_utils import (
    QuantFormat,
    QuantizationMode,
    QuantType,
    load_model_with_shape_infer,
    model_has_pre_process_metadata,
)
from .registry import IntegerOpsRegistry, QDQRegistry, QLinearOpsRegistry


class QuantConfig:
    def __init__(
        self,
        activation_type=QuantType.QUInt8,
        weight_type=QuantType.QInt8,
        op_types_to_quantize=None,
        nodes_to_quantize=None,
        nodes_to_exclude=None,
        per_channel=False,
        reduce_range=False,
        use_external_data_format=False,
    ):
        """
        This is the Base class for both Static and Dynamic Quantize Configuration
        Args:
            activation_type:
                quantization data type of activation. Please refer to
                https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
            weight_type:
                quantization data type of weight. Please refer to
                https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
            op_types_to_quantize:
                specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
                It quantizes all supported operators by default.
            nodes_to_quantize:
                List of node names to quantize. When this list is not None, only the nodes in this list
                are quantized.
                example:
                [
                    'Conv__224',
                    'Conv__252'
                ]
            nodes_to_exclude:
                List of node names to exclude. The nodes in this list will be excluded from quantization
                when it is not None.
            per_channel: quantize weights per channel
            reduce_range:
                quantize weights with 7 bits. It may improve the accuracy for some models running on non-VNNI machines,
                especially for per-channel mode
            use_external_data_format: option used for large size (>2GB) model. Set to False by default.
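
        Example (illustrative only; configs are normally created through the derived
        StaticQuantConfig / DynamicQuantConfig classes, and the node name below is a
        hypothetical placeholder):

            config = DynamicQuantConfig(
                weight_type=QuantType.QInt8,
                op_types_to_quantize=["MatMul"],
                nodes_to_exclude=["MatMul_10"],
                per_channel=True,
            )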
        """
        nodes_to_quantize = nodes_to_quantize or []
        nodes_to_exclude = nodes_to_exclude or []
        op_types_to_quantize = op_types_to_quantize or []

        self.op_types_to_quantize = op_types_to_quantize
        self.per_channel = per_channel
        self.reduce_range = reduce_range
        self.weight_type = weight_type
        self.activation_type = activation_type
        self.nodes_to_quantize = nodes_to_quantize
        self.nodes_to_exclude = nodes_to_exclude
        self.use_external_data_format = use_external_data_format


class StaticQuantConfig(QuantConfig):
    def __init__(
        self,
        calibration_data_reader: CalibrationDataReader,
        calibrate_method=CalibrationMethod.MinMax,
        quant_format=QuantFormat.QDQ,
        activation_type=QuantType.QInt8,
        weight_type=QuantType.QInt8,
        op_types_to_quantize=None,
        nodes_to_quantize=None,
        nodes_to_exclude=None,
        per_channel=False,
        reduce_range=False,
        use_external_data_format=False,
        extra_options=None,
    ):
        """
        This is the derived class for static Quantize Configuration

        Args:
            calibration_data_reader:
                a calibration data reader. It enumerates calibration data and generates inputs for the original model.
            calibrate_method:
                Current calibration methods supported are MinMax, Entropy and Percentile.
            quant_format: QuantFormat{QOperator, QDQ}.
                QOperator format quantizes the model with quantized operators directly.
                QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
            extra_options:
                key-value pair dictionary for various options in different cases. Currently used:
                    extra.Sigmoid.nnapi = True/False  (Default is False)
                    ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
                    WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
                    EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized.
                                                  Dynamic mode currently is supported. Will support more in the future.
                    ForceQuantizeNoInputCheck = True/False :
                        By default, some latent operators like maxpool, transpose, do not quantize if their input is not
                        quantized already. Set to True to force such operators to always quantize their input and so generate
                        quantized output. Also, the True behavior can be disabled per node using nodes_to_exclude.
                    MatMulConstBOnly = True/False:
                        Default is False for static mode. If enabled, only MatMul with const B will be quantized.
                    AddQDQPairToWeight = True/False :
                        Default is False, which quantizes the floating-point weight and feeds it to a solely inserted
                        DeQuantizeLinear node. If True, the weight stays in floating point and both
                        QuantizeLinear/DeQuantizeLinear nodes are inserted for it.
                    OpTypesToExcludeOutputQuantization = list of op type :
                        Default is []. If any op type is specified, it won't quantize the output of ops with this
                        specific op types.
                    DedicatedQDQPair = True/False :
                        Default is False. When inserting QDQ pair, multiple nodes can share a single QDQ pair as their
                        inputs. If True, an identical and dedicated QDQ pair is created for each node.
                    QDQOpTypePerChannelSupportToAxis = dictionary :
                        Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1}, and it's
                        effective only when per channel quantization is supported and per_channel is True. If specific
                        op type supports per channel quantization but not explicitly specified with channel axis,
                        default channel axis will be used.
                    CalibTensorRangeSymmetric = True/False :
                        Default is False. If enabled, the final range of tensor during calibration will be explicitly
                        set to symmetric to central point "0".
                    CalibMovingAverage = True/False :
                        Default is False. If enabled, the moving average of the minimum and maximum values will be
                        computed when the calibration method selected is MinMax.
                    CalibMovingAverageConstant = float :
                        Default is 0.01. Constant smoothing factor to use when computing the moving average of the
                        minimum and maximum values. Effective only when the calibration method selected is MinMax and
                        when CalibMovingAverage is set to True.
                    QuantizeBias = True/False :
                        Default is True, which quantizes the floating-point bias and solely inserts
                        a DeQuantizeLinear node. If False, the bias stays in floating point and no
                        quantization nodes are inserted for it.
                        This extra option is only effective when quant_format is QuantFormat.QDQ.
                    SmoothQuant = True/False :
                        Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
                        fake input channel quantization.
                    SmoothQuantAlpha = float :
                        Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
                        and activation quantization. A larger alpha value could be used on models with more significant
                        activation outliers to migrate more quantization difficulty to weights.
                    SmoothQuantFolding = True/False :
                        Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
                        SmoothQuant will be folded into the previous op if the previous op is foldable.
            execution_provider : An enum indicating the Execution Provider, such as CPU, TRT, NNAPI, SNE, etc.
        Raises:
            ValueError: Raise ValueError if execution provider is unknown
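
        Example (a minimal sketch; my_reader stands for any user-provided
        CalibrationDataReader implementation, and the file paths are placeholders):

            config = StaticQuantConfig(
                calibration_data_reader=my_reader,
                quant_format=QuantFormat.QDQ,
                activation_type=QuantType.QInt8,
                weight_type=QuantType.QInt8,
                extra_options={"WeightSymmetric": True},
            )
            quantize("model.onnx", "model.quant.onnx", config)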
        """
        super().__init__(
            activation_type=activation_type,
            weight_type=weight_type,
            op_types_to_quantize=op_types_to_quantize,
            nodes_to_quantize=nodes_to_quantize,
            nodes_to_exclude=nodes_to_exclude,
            per_channel=per_channel,
            reduce_range=reduce_range,
            use_external_data_format=use_external_data_format,
        )
        self.calibration_data_reader = calibration_data_reader
        self.calibrate_method = calibrate_method
        self.quant_format = quant_format
        self.extra_options = extra_options or {}


class DynamicQuantConfig(QuantConfig):
    def __init__(
        self,
        weight_type=QuantType.QInt8,
        op_types_to_quantize=None,
        nodes_to_quantize=None,
        nodes_to_exclude=None,
        per_channel=False,
        reduce_range=False,
        use_external_data_format=False,
        extra_options=None,
    ):
        """
        This is a class for dynamic Quant Configuration

        Args:
            extra_options: key-value pair dictionary for various options in different cases. Currently used:
                extra.Sigmoid.nnapi = True/False  (Default is False)
                ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
                WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
                EnableSubgraph = True/False :
                    Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will
                    support more in the future.
                ForceQuantizeNoInputCheck = True/False :
                    By default, some latent operators like maxpool, transpose, do not quantize if their input is not
                    quantized already. Set to True to force such operators to always quantize their input and so generate
                    quantized output. Also, the True behavior can be disabled per node using nodes_to_exclude.
                MatMulConstBOnly = True/False:
                    Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
            execution_provider : An enum indicating the Execution Provider, such as CPU, TRT, NNAPI, SNE, etc.

        Raises:
            ValueError: Raise ValueError if execution provider is unknown
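
        Example (a minimal sketch; the file paths are placeholders):

            config = DynamicQuantConfig(weight_type=QuantType.QInt8, per_channel=False)
            quantize("model.onnx", "model.quant.onnx", config)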
        """
        super().__init__(
            op_types_to_quantize=op_types_to_quantize,
            per_channel=per_channel,
            reduce_range=reduce_range,
            weight_type=weight_type,
            nodes_to_quantize=nodes_to_quantize,
            nodes_to_exclude=nodes_to_exclude,
            use_external_data_format=use_external_data_format,
        )
        self.extra_options = extra_options or {}


def check_static_quant_arguments(quant_format: QuantFormat, activation_type: QuantType, weight_type: QuantType):
    if activation_type == QuantType.QInt8 and weight_type == QuantType.QUInt8:
        raise ValueError(
            "ONNXRuntime quantization doesn't support data format:"
            "activation_type=QuantType.QInt8, weight_type=QuantType.QUInt8"
        )
    if activation_type != QuantType.QFLOAT8E4M3FN and weight_type == QuantType.QFLOAT8E4M3FN:
        raise ValueError(
            f"ONNXRuntime quantization doesn't support data format: activation_type={activation_type} "
            "!=QuantType.QFLOAT8E4M3FN, weight_type=QuantType.QFLOAT8E4M3FN."
        )
    if activation_type == QuantType.QFLOAT8E4M3FN and weight_type != QuantType.QFLOAT8E4M3FN:
        raise ValueError(
            "ONNXRuntime quantization doesn't support data format: "
            f"activation_type=QuantType.QFLOAT8E4M3FN, weight_type={weight_type}!=QuantType.QFLOAT8E4M3FN"
        )
    if activation_type == QuantType.QInt8 and weight_type == QuantType.QInt8 and quant_format != QuantFormat.QDQ:
        logging.warning(
            "Please use QuantFormat.QDQ for activation type QInt8 and weight type QInt8. "
            "Or it will lead to bad performance on x64."
        )


def quantize_static(
    model_input,
    model_output,
    calibration_data_reader: CalibrationDataReader,
    quant_format=QuantFormat.QDQ,
    op_types_to_quantize=None,
    per_channel=False,
    reduce_range=False,
    activation_type=QuantType.QInt8,
    weight_type=QuantType.QInt8,
    nodes_to_quantize=None,
    nodes_to_exclude=None,
    use_external_data_format=False,
    calibrate_method=CalibrationMethod.MinMax,
    extra_options=None,
):
    """
    Given an onnx model and calibration data reader, create a quantized onnx model and save it into a file.
    It is recommended to use QuantFormat.QDQ format from 1.11 with activation_type = QuantType.QInt8 and weight_type
    = QuantType.QInt8. If model is targeted to GPU/TRT, symmetric activation and weight are required. If model is
    targeted to CPU, asymmetric activation and symmetric weight are recommended for balance of performance and
    accuracy.

    Args:

        model_input: file path of model to quantize
        model_output: file path of quantized model
        calibration_data_reader: a calibration data reader. It
            enumerates calibration data and generates inputs for the
            original model.
        quant_format: QuantFormat{QOperator, QDQ}.
            QOperator format quantizes the model with quantized operators directly.
            QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
        activation_type:
            quantization data type of activation. Please refer to
            https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
        calibrate_method:
            Current calibration methods supported are MinMax and Entropy.
                Please use CalibrationMethod.MinMax or CalibrationMethod.Entropy as options.
        op_types_to_quantize:
                specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
                It quantizes all supported operators by default.
        per_channel: quantize weights per channel
        reduce_range:
            quantize weights with 7 bits. It may improve the accuracy for some models running on non-VNNI machines,
            especially for per-channel mode
        weight_type:
            quantization data type of weight. Please refer to
            https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
        nodes_to_quantize:
            List of node names to quantize. When this list is not None, only the nodes in this list
            are quantized.
            example:
            [
                'Conv__224',
                'Conv__252'
            ]
        nodes_to_exclude:
            List of node names to exclude. The nodes in this list will be excluded from quantization
            when it is not None.
        use_external_data_format: option used for large size (>2GB) model. Set to False by default.
        extra_options:
            key-value pair dictionary for various options in different cases. Currently used:
                extra.Sigmoid.nnapi = True/False  (Default is False)
                ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
                WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
                EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized.
                                              Dynamic mode currently is supported. Will support more in the future.
                ForceQuantizeNoInputCheck = True/False :
                    By default, some latent operators like maxpool, transpose, do not quantize if their input is not
                    quantized already. Set to True to force such operators to always quantize their input and so generate
                    quantized output. Also, the True behavior can be disabled per node using nodes_to_exclude.
                MatMulConstBOnly = True/False:
                    Default is False for static mode. If enabled, only MatMul with const B will be quantized.
                AddQDQPairToWeight = True/False :
                    Default is False, which quantizes the floating-point weight and feeds it to a solely inserted
                    DeQuantizeLinear node. If True, the weight stays in floating point and both
                    QuantizeLinear/DeQuantizeLinear nodes are inserted for it.
                OpTypesToExcludeOutputQuantization = list of op type :
                    Default is []. If any op type is specified, it won't quantize the output of ops with this
                    specific op types.
                DedicatedQDQPair = True/False :
                    Default is False. When inserting QDQ pair, multiple nodes can share a single QDQ pair as their
                    inputs. If True, an identical and dedicated QDQ pair is created for each node.
                QDQOpTypePerChannelSupportToAxis = dictionary :
                    Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1}, and it's
                    effective only when per channel quantization is supported and per_channel is True. If specific
                    op type supports per channel quantization but not explicitly specified with channel axis,
                    default channel axis will be used.
                CalibTensorRangeSymmetric = True/False :
                    Default is False. If enabled, the final range of tensor during calibration will be explicitly
                    set to symmetric to central point "0".
                CalibMovingAverage = True/False :
                    Default is False. If enabled, the moving average of the minimum and maximum values will be
                    computed when the calibration method selected is MinMax.
                CalibMovingAverageConstant = float :
                    Default is 0.01. Constant smoothing factor to use when computing the moving average of the
                    minimum and maximum values. Effective only when the calibration method selected is MinMax and
                    when CalibMovingAverage is set to True.
                SmoothQuant = True/False :
                    Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
                    fake input channel quantization.
                SmoothQuantAlpha = float :
                    Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
                    and activation quantization. A larger alpha value could be used on models with more significant
                    activation outliers to migrate more quantization difficulty to weights.
                SmoothQuantFolding = True/False :
                    Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
                    SmoothQuant will be folded into the previous op if the previous op is foldable.
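
    Example (a minimal sketch; the reader below feeds a single random batch and assumes
    one model input named "input" with shape (1, 3, 224, 224); both are placeholders
    for real, representative calibration data):

        import numpy as np

        class RandomCalibrationReader(CalibrationDataReader):
            def __init__(self):
                # A single random sample; real calibration should iterate representative data.
                self._data = iter([{"input": np.random.rand(1, 3, 224, 224).astype(np.float32)}])

            def get_next(self):
                return next(self._data, None)

        quantize_static(
            "model.onnx",
            "model.quant.onnx",
            RandomCalibrationReader(),
            quant_format=QuantFormat.QDQ,
            activation_type=QuantType.QInt8,
            weight_type=QuantType.QInt8,
        )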
    """
    if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN:
        if calibrate_method != CalibrationMethod.Distribution:
            raise ValueError("Only Distribution calibration method is supported for float quantization.")

    extra_options = extra_options or {}
    nodes_to_exclude = nodes_to_exclude or []
    nodes_to_quantize = nodes_to_quantize or []
    op_types_to_quantize = op_types_to_quantize or []
    mode = QuantizationMode.QLinearOps

    if not op_types_to_quantize or len(op_types_to_quantize) == 0:
        q_linear_ops = list(QLinearOpsRegistry.keys())
        qdq_ops = list(QDQRegistry.keys())
        op_types_to_quantize = list(set(q_linear_ops + qdq_ops))

    model = load_model_with_shape_infer(Path(model_input))

    pre_processed: bool = model_has_pre_process_metadata(model)
    if not pre_processed:
        logging.warning(
            "Please consider to run pre-processing before quantization. Refer to example: "
            "https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification/cpu/ReadMe.md "
        )

    calib_extra_options_keys = [
        ("CalibTensorRangeSymmetric", "symmetric"),
        ("CalibMovingAverage", "moving_average"),
        ("CalibMovingAverageConstant", "averaging_constant"),
    ]
    calib_extra_options = {
        key: extra_options.get(name) for (name, key) in calib_extra_options_keys if name in extra_options
    }

    if extra_options.get("SmoothQuant", False):
        import importlib

        try:
            importlib.import_module("neural_compressor.adaptor.ox_utils.smooth_quant")
        except Exception as e:
            logging.error(f"{e}.")
            raise RuntimeError("neural-compressor is not correctly installed. Please check your environment.") from e

        import copy

        import onnx
        from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant

        def inc_dataloader():
            data_reader = copy.deepcopy(calibration_data_reader)
            for data in data_reader:
                yield data, None

        orig_nodes = [i.name for i in model.graph.node]
        dataloader = inc_dataloader()
        sq = ORTSmoothQuant(model_input, dataloader, reduce_range)
        del dataloader
        model = sq.transform(
            extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True)
        ).model
        nodes_to_exclude.extend([i.name for i in model.graph.node if i.name not in orig_nodes])
        sq_path = tempfile.TemporaryDirectory(prefix="ort.quant.")
        model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix()
        onnx.save_model(model, model_input, save_as_external_data=use_external_data_format)
        model = load_model_with_shape_infer(Path(model_input))  # use the smooth-quantized model for calibration

    with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
        calibrator = create_calibrator(
            Path(model_input),
            op_types_to_quantize,
            augmented_model_path=Path(quant_tmp_dir).joinpath("augmented_model.onnx").as_posix(),
            calibrate_method=calibrate_method,
            use_external_data_format=use_external_data_format,
            extra_options=calib_extra_options,
        )

        calibrator.collect_data(calibration_data_reader)
        tensors_range = calibrator.compute_data()
        if not isinstance(tensors_range, TensorsData):
            raise TypeError(
                f"Unexpected type {type(tensors_range)} for tensors_range and calibrator={type(calibrator)}."
            )
        del calibrator

    check_static_quant_arguments(quant_format, activation_type, weight_type)

    if quant_format is QuantFormat.QOperator:
        quantizer = ONNXQuantizer(
            model,
            per_channel,
            reduce_range,
            mode,
            True,  # static
            weight_type,
            activation_type,
            tensors_range,
            nodes_to_quantize,
            nodes_to_exclude,
            op_types_to_quantize,
            extra_options,
        )
    else:
        quantizer = QDQQuantizer(
            model,
            per_channel,
            reduce_range,
            mode,
            True,  # static
            weight_type,
            activation_type,
            tensors_range,
            nodes_to_quantize,
            nodes_to_exclude,
            op_types_to_quantize,
            extra_options,
        )

    quantizer.quantize_model()
    quantizer.model.save_model_to_file(model_output, use_external_data_format)
    if not pre_processed:
        logging.warning(
            "Please consider pre-processing before quantization. See "
            "https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification/cpu/ReadMe.md "
        )

    if extra_options.get("SmoothQuant", False):
        sq_path.cleanup()


def quantize_dynamic(
    model_input: Path,
    model_output: Path,
    op_types_to_quantize=None,
    per_channel=False,
    reduce_range=False,
    weight_type=QuantType.QInt8,
    nodes_to_quantize=None,
    nodes_to_exclude=None,
    use_external_data_format=False,
    extra_options=None,
):
    """Given an onnx model, create a quantized onnx model and save it into a file.

    Args:
        model_input: file path of model to quantize
        model_output: file path of quantized model
        op_types_to_quantize:
            specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
            It quantizes all supported operators by default.
        per_channel: quantize weights per channel
        reduce_range:
            quantize weights with 7 bits. It may improve the accuracy for some models running on non-VNNI machines,
            especially for per-channel mode
        weight_type:
            quantization data type of weight. Please refer to
            https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
        nodes_to_quantize:
            List of node names to quantize. When this list is not None, only the nodes in this list
            are quantized.
            example:
            [
                'Conv__224',
                'Conv__252'
            ]
        nodes_to_exclude:
            List of node names to exclude. The nodes in this list will be excluded from quantization
            when it is not None.
        use_external_data_format: option used for large size (>2GB) model. Set to False by default.
        extra_options:
            key-value pair dictionary for various options in different cases. Currently used:
                extra.Sigmoid.nnapi = True/False  (Default is False)
                ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
                WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
                EnableSubgraph = True/False :
                    Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will
                    support more in the future.
                ForceQuantizeNoInputCheck = True/False :
                    By default, some latent operators like maxpool, transpose, do not quantize if their input is not
                    quantized already. Set to True to force such operators to always quantize their input and so generate
                    quantized output. Also, the True behavior can be disabled per node using nodes_to_exclude.
                MatMulConstBOnly = True/False:
                    Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
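
    Example (a minimal sketch; the file paths are placeholders):

        quantize_dynamic("model.onnx", "model.quant.onnx", weight_type=QuantType.QInt8)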
    """
    extra_options = extra_options or {}
    nodes_to_exclude = nodes_to_exclude or []
    nodes_to_quantize = nodes_to_quantize or []
    op_types_to_quantize = op_types_to_quantize or []
    mode = QuantizationMode.IntegerOps

    if not op_types_to_quantize or len(op_types_to_quantize) == 0:
        op_types_to_quantize = list(IntegerOpsRegistry.keys())

    model = load_model_with_shape_infer(Path(model_input))

    pre_processed: bool = model_has_pre_process_metadata(model)
    if not pre_processed:
        logging.warning(
            "Please consider to run pre-processing before quantization. Refer to example: "
            "https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification/cpu/ReadMe.md "
        )

    if "MatMulConstBOnly" not in extra_options:
        extra_options["MatMulConstBOnly"] = True

    quantizer = ONNXQuantizer(
        model,
        per_channel,
        reduce_range,
        mode,
        False,  # static
        weight_type,
        QuantType.QUInt8,  # dynamic activation only supports uint8
        None,
        nodes_to_quantize,
        nodes_to_exclude,
        op_types_to_quantize,
        extra_options,
    )

    quantizer.quantize_model()
    quantizer.model.save_model_to_file(model_output, use_external_data_format)


def quantize(
    model_input: Path,
    model_output: Path,
    quant_config: QuantConfig,
):
    """Quantize a model with QuantConfig.

    Args:
        model_input (Path): Path to the model to quantize.
        model_output (Path): Path to save the quantized model.
        quant_config (QuantConfig): Quantization Configuration.
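
    Example (a minimal sketch; my_reader is a user-provided CalibrationDataReader
    and the file paths are placeholders):

        quantize("model.onnx", "model.static.onnx", StaticQuantConfig(my_reader))
        quantize("model.onnx", "model.dynamic.onnx", DynamicQuantConfig())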
    """
    if isinstance(quant_config, StaticQuantConfig):
        quantize_static(
            model_input,
            model_output,
            quant_config.calibration_data_reader,
            calibrate_method=quant_config.calibrate_method,
            quant_format=quant_config.quant_format,
            activation_type=quant_config.activation_type,
            weight_type=quant_config.weight_type,
            op_types_to_quantize=quant_config.op_types_to_quantize,
            nodes_to_quantize=quant_config.nodes_to_quantize,
            nodes_to_exclude=quant_config.nodes_to_exclude,
            per_channel=quant_config.per_channel,
            reduce_range=quant_config.reduce_range,
            use_external_data_format=quant_config.use_external_data_format,
            extra_options=quant_config.extra_options,
        )
    elif isinstance(quant_config, DynamicQuantConfig):
        quantize_dynamic(
            model_input,
            model_output,
            weight_type=quant_config.weight_type,
            op_types_to_quantize=quant_config.op_types_to_quantize,
            nodes_to_quantize=quant_config.nodes_to_quantize,
            nodes_to_exclude=quant_config.nodes_to_exclude,
            per_channel=quant_config.per_channel,
            reduce_range=quant_config.reduce_range,
            use_external_data_format=quant_config.use_external_data_format,
            extra_options=quant_config.extra_options,
        )
    else:
        raise TypeError(
            "Invalid quantization config type, it must be either StaticQuantConfig or DynamicQuantConfig."
        )