
    hx                       S SK Jr  S SKrS SKrS SKrS SKJr  S SKJr  S SK	J
r
  S SKrSSKJrJrJrJr  SSKJr  SS	KJr  SS
KJrJrJrJrJrJrJrJr  SSKJrJ r J!r!  SSK"J#r#   " S S5      r$ " S S\$5      r%\RL                  S\RN                  \RP                  SSSSSSSSSSS4                             SS jjr) " S S\$5      r*SS jr+\RX                  SSS\RP                  \RP                  SSS\RL                  SS4     SS jjr-SSS\RP                  SSSS4   SS jjr.      SS jr/g)    )annotationsN)Callable)Path)Any   )CalibrationDataReaderCalibrationMethodTensorsDatacreate_calibrator)ONNXQuantizer)QDQQuantizer)MODEL_SIZE_THRESHOLDQuantFormatQuantizationMode	QuantTypeload_model_with_shape_infermodel_has_pre_process_metadata&save_and_reload_model_with_shape_inferupdate_opset_version)IntegerOpsRegistryQDQRegistryQLinearOpsRegistry)TensorQuantOverridesHelperc                  V    \ rS rSr\R
                  \R                  SSSSSS4S jrSrg)QuantConfig"   NFc	                    U=(       d    / nU=(       d    / nU=(       d    / nX0l         X`l        Xpl        X l        Xl        X@l        XPl        Xl        g)a  
This is the Base class for both Static and Dynamic Quantize Configuration
Args:
    activation_type:
        quantization data type of activation. Please refer to
        https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
    weight_type:
        quantization data type of weight. Please refer to
        https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
    op_types_to_quantize:
        specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
        It quantizes all supported operators by default.
    nodes_to_quantize:
        List of nodes names to quantize. When this list is not None only the nodes in this list
        are quantized.
        example:
        [
            'Conv__224',
            'Conv__252'
        ]
    nodes_to_exclude:
        List of nodes names to exclude. The nodes in this list will be excluded from quantization
        when it is not None.
    per_channel: quantize weights per channel
    reduce_range:
        quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
        especially for per-channel mode
    use_external_data_format: option used for large size (>2GB) model. Set to False by default.
N)op_types_to_quantizeper_channelreduce_rangeweight_typeactivation_typenodes_to_quantizenodes_to_excludeuse_external_data_format)	selfr"   r!   r   r#   r$   r   r    r%   s	            [/var/www/fran/franai/venv/lib/python3.13/site-packages/onnxruntime/quantization/quantize.py__init__QuantConfig.__init__#   sV    R ,1r-339r$8!&(&.!2 0(@%    )r"   r$   r#   r   r   r    r%   r!   )	__name__
__module____qualname____firstlineno__r   QUInt8QInt8r(   __static_attributes__ r*   r'   r   r   "   s,     "((OO!!&3Ar*   r   c                     ^  \ rS rSr\R
                  \R                  \R                  \R                  SSSSSSSS4 SU 4S jjjr
SrU =r$ )StaticQuantConfigY   NFc                ~   > [         TU ]  UUUUUU	U
US9  Xl        X l        X0l        Xl        U=(       d    0 U l        g)a   
This is the derived class for static Quantize Configuration

Args:
    calibration_data_reader:
        a calibration data reader. It enumerates calibration data and generates inputs for the original model.
    calibrate_method:
        Current calibration methods supported are MinMax, Entropy and Percentile.
    quant_format: QuantFormat{QOperator, QDQ}.
        QOperator format quantizes the model with quantized operators directly.
        QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
    calibration_providers: Execution providers to run the session during calibration. Default is None which uses
        [ "CPUExecutionProvider" ].
    extra_options:
        key value pair dictionary for various options in different case. Current used:
            extra.Sigmoid.nnapi = True/False  (Default is False)
            ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
            WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
            EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized.
                                          Dyanmic mode currently is supported. Will support more in future.
            ForceQuantizeNoInputCheck = True/False :
                By default, some latent operators like maxpool, transpose, do not quantize if their input is not
                quantized already. Setting to True to force such operator always quantize input and so generate
                quantized output. Also the True behavior could be disabled per node using the nodes_to_exclude.
            MatMulConstBOnly = True/False:
                Default is False for static mode. If enabled, only MatMul with const B will be quantized.
            AddQDQPairToWeight = True/False :
                Default is False which quantizes floating-point weight and feeds it to solely inserted
                DeQuantizeLinear node. If True, it remains floating-point weight and inserts both
                QuantizeLinear/DeQuantizeLinear nodes to weight.
            OpTypesToExcludeOutputQuantization = list of op type :
                Default is []. If any op type is specified, it won't quantize the output of ops with this
                specific op types.
            DedicatedQDQPair = True/False :
                Default is False. When inserting QDQ pair, multiple nodes can share a single QDQ pair as their
                inputs. If True, it will create identical and dedicated QDQ pair for each node.
            QDQOpTypePerChannelSupportToAxis = dictionary :
                Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1}, and it's
                effective only when per channel quantization is supported and per_channel is True. If specific
                op type supports per channel quantization but not explicitly specified with channel axis,
                default channel axis will be used.
            CalibTensorRangeSymmetric = True/False :
                Default is False. If enabled, the final range of tensor during calibration will be explicitly
                set to symmetric to central point "0".
            CalibMovingAverage = True/False :
                Default is False. If enabled, the moving average of the minimum and maximum values will be
                computed when the calibration method selected is MinMax.
            CalibMovingAverageConstant = float :
                Default is 0.01. Constant smoothing factor to use when computing the moving average of the
                minimum and maximum values. Effective only when the calibration method selected is MinMax and
                when CalibMovingAverage is set to True.
            QuantizeBias = True/False :
                Default is True which quantizes floating-point biases and it solely inserts
                a DeQuantizeLinear node. If False, it remains floating-point bias and does not insert
                any quantization nodes associated with biases.
                This extra option is only effective when quant_format is QuantFormat.QDQ.
            SmoothQuant = True/False :
                Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
                fake input channel quantization.
            SmoothQuantAlpha = float :
                Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
                and activation quantization. A larger alpha value could be used on models with more significant
                activation outliers to migrate more quantization difficulty to weights.
            SmoothQuantFolding = True/False :
                Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
                SmoothQuant will be folded into the previous op if the previous op is foldable.
            UseQDQContribOps = True/False :
                Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have the
                `com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear
                contrib op implementations. The contrib op implementations may support features not standardized
                into the ONNX specification (e.g., 16-bit quantization types).
            MinimumRealRange = float|None :
                Default is None. If set to a floating-point value, the calculation of the quantization parameters
                (i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax-rmin)
                is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is
                necessary for EPs like QNN that require a minimum floating-point range when determining
                quantization parameters.
            TensorQuantOverrides = dictionary :
                Default is {}. Set tensor quantization overrides. The key is a tensor name and the value is a
                list of dictionaries. For per-tensor quantization, the list contains a single dictionary. For
                per-channel quantization, the list contains a dictionary for each channel in the tensor.
                Each dictionary contains optional overrides with the following keys and values.
                    'quant_type' = QuantType : The tensor's quantization data type.
                    'scale' =  Float         : The scale value to use. Must also specify `zero_point` if set.
                    'zero_point' = Int       : The zero-point value to use. Must also specify `scale` is set.
                    'symmetric' = Bool       : If the tensor should use symmetric quantization. Invalid if also
                                               set `scale` or `zero_point`.
                    'reduce_range' = Bool    : If the quantization range should be reduced. Invalid if also
                                               set `scale` or `zero_point`.
                    'rmax' = Float           : Override the maximum real tensor value in calibration data.
                                               Invalid if also set `scale` or `zero_point`.
                    'rmin' = Float           : Override the minimum real tensor value in calibration data.
                                               Invalid if also set `scale` or `zero_point`.
            QDQKeepRemovableActivations = True/False:
                Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
                will be explicitly represented in the QDQ model. If false, these activations are automatically
                removed if activations are asymmetrically quantized. Keeping these activations is necessary if
                optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
                operators from the model.
            QDQDisableWeightAdjustForInt32Bias = True/False:
                Default is False. If true, QDQ quantizer will not adjust the weight's scale when the bias
                has a scale (input_scale * weight_scale) that is too small.
    execution_provider : A enum indicates the Execution Provider such as: CPU, TRT, NNAPI, SNE, etc.
Raises:
    ValueError: Raise ValueError if execution provider is unknown
)r"   r!   r   r#   r$   r   r    r%   N)superr(   calibration_data_readercalibrate_methodquant_formatcalibration_providersextra_options)r&   r8   r9   r:   r"   r!   r   r#   r$   r   r    r%   r;   r<   	__class__s                 r'   r(   StaticQuantConfig.__init__Z   sW    v 	+#!5/-#%%= 	 		
 (?$ 0(%:"*0br*   )r9   r8   r;   r<   r:   )r8   r   )r+   r,   r-   r.   r	   MinMaxr   QDQr   r0   r(   r1   __classcell__r=   s   @r'   r4   r4   Y   sP     +11 __!OO!!&"I1!6I1 I1r*   r4   Fc                f  ^# [         R                  [         R                  1n[         R                  [         R                  1n1 Skn[        U [        R                  5      (       a  U O[        R                  " U SS9n[        5       nSn[        U(       a  [        R                  " U5      O0 5      nUR                  R                   H+  n[        R                  R!                  U5      (       d  M)  SnM-     U(       a  [        U5      OSn[        U["        5      (       a  [        U5      O	[        5       nUR                  R$                   H  nU(       a  UR&                  U;  a  M  UR(                  U;   a  M.  [+        U5      (       a+  U" UU5      (       a  UR-                  UR(                  5        Mi  UR-                  UR&                  5        M     UU
UUSUR/                  5       S.nU(       aD  / SQnU VVs0 s H   u  nnUU;   d  M  UUR1                  U5      _M"     n nnUR3                  U 5        [5        S UR6                   5       5      n!U!R8                  S	:  aL  UR;                  U5      m#[=        U#4S
 jUR?                  5        5       5      n"UT#;   d  UT#;   d  U"(       a  SUS'   U(       a  UR3                  U5        [A        UU[B        RD                  UUU(       a  UO[#        URG                  U5      5      [#        U5      UU	U=(       d    URI                  5       [J        :  UUS9$ s  snnf )aw  
Returns a configuration suitable that quantizes the entire model to integer precision.

Params:
    model_input: Path to the input model file or ModelProto.
    calibration_data_reader: Calibration data reader.
    calibrate_methode: The calibration method. Defaults to MinMax.
    activation_type: The default activation quantization type. Defaults to QUInt8.
    weight_type: The default weight quantization type. Defaults to QInt8.
    activation_symmetric: True if activations should be quantized symmetrically (i.e, rmax == -rmin) by default.
        Defaults to false. For int8 and int16, this results in zero-point values of 0. For uint8 and uint16,
        the zero-point values are 127 and 32,767, respectively.
    weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
        Defaults to None. If set to None, weight_symmetric is assumed true if a weight's quant type is a signed int.
    per_channel: Global option that determines if a fixed set of operator types should be quantized per-channel.
        Defaults to false. Alternatively, use the tensor-level `tensor_quant_overrides` to select individual operators
        and their quantization axes.
    reduce_range: quantize weights with 1 less bit of precision (e.g., 7 bits for QInt8). Defaults to false.
        May improve the accuracy for some models running on non-VNNI machine, especially for per-channel mode.
    keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
                    be removed, and will be explicitly represented in the QDQ model. If false, these activations
                    are automatically removed if activations are asymmetrically quantized. Keeping these activations
                    is necessary if optimizations or EP transformations will later remove
                    QuantizeLinear/DequantizeLinear operators from the model.
    min_real_range: Default is None. If set to a floating-point value, the calculation of the quantization parameters
        (i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax - rmin)
        is less than the specified minimum range, rmax will be set to rmin + min_real_range.
    tensor_quant_overrides: tensor-level quantization overrides. Defaults to None.
        The key is a tensor name and the value is a list of dictionaries. For per-tensor quantization, the list
        contains a single dictionary. For per-channel quantization, the list contains either a dictionary for
        each channel in the tensor or a single dictionary that is assumed to apply to all channels. An 'axis'
        key must be present in the first dictionary for per-channel quantization.

        Each dictionary contains optional overrides with the following keys and values.
            'quant_type' = QuantType : The tensor's quantization data type.
            'axis' = Int             : The per-channel axis. Must be present for per-channel weights.
            'scale' =  Float         : The scale value to use. Must also specify `zero_point` if set.
            'zero_point' = Int       : The zero-point value to use. Must also specify `scale` is set.
            'symmetric' = Bool       : If the tensor should use symmetric quantization. Invalid if also
                                        set `scale` or `zero_point`.
            'reduce_range' = Bool    : If the quantization range should be reduced. Invalid if also
                                        set `scale` or `zero_point`. Only valid for initializers.
            'rmax' = Float           : Override the maximum real tensor value in calibration data.
                                        Invalid if also set `scale` or `zero_point`.
            'rmin' = Float           : Override the minimum real tensor value in calibration data.
                                        Invalid if also set `scale` or `zero_point`.
            'convert' = Dict         : A nested dictionary with the same keys for an activation
                                       tensor that should be converted to another quantization type.
            'convert["recv_nodes"] = Set : Set of node names that consume the converted activation,
                                           other nodes get the original type. If not specified,
                                           assume all consumer nodes get the converted type.
    calibration_providers: Execution providers to run the session during calibration. Default is None which uses
        [ "CPUExecutionProvider" ].
    op_types_to_quantize: List of operator types to quantize. If None, all operators other than Cast, DequantizeLinear,
        and QuantizeLinear are quantized.
    nodes_to_exclude: List of nodes names to exclude from quantization. Alternatively, can provide a function that
        accepts an onnx.ModelProto and onnx.NodeProto as arguments and returns true if the give onnx.NodeProto
        should be excluded from quantization.
    extra_options: Additional options specified as string key/value pairs. Refer to the documentation for
        `quantize_static` for valid keys and values.

Returns:
    A StaticQuantConfig object
>   CastQuantizeLinearDequantizeLinearF)load_external_dataTN)MinimumRealRangeQDQKeepRemovableActivationsActivationSymmetricWeightSymmetricForceQuantizeNoInputCheckTensorQuantOverrides))	symmetricCalibTensorRangeSymmetric)moving_averageCalibMovingAverage)averaging_constantCalibMovingAverageConstant)max_intermediate_outputsCalibMaxIntermediateOutputs)
percentileCalibPercentilec              3  h   #    U  H(  oR                   S :X  d  UR                   S:X  d  M$  Uv   M*     g7f) zai.onnxN)domain).0xs     r'   	<genexpr>!get_qdq_config.<locals>.<genexpr>v  s(     _!3Axx2~U^I^aa!3s   #2	2   c              3  ,   >#    U  H	  oT;   v   M     g 7fNr2   )r[   topset21_typess     r'   r]   r^   y  s     *jGi!+=Gis   UseQDQContribOps)r9   r:   r"   r!   r   r$   r   r    r%   r;   r<   )&r   QInt16QUInt16QInt4QUInt4
isinstanceonnx
ModelProto
load_modelsetr   copydeepcopygraphinitializerexternal_data_helperuses_external_datalistnodeop_typenamecallableaddget_dictgetupdatenextopset_importversionunionanyget_quant_typesr4   r   r@   
differenceByteSizer   )$model_inputr8   r9   calibrate_argsr"   r!   activation_symmetricweight_symmetricr   r    keep_removable_activationsmin_real_rangetensor_quant_overridesr;   r   r$   r<   	q16_typesq4_typesop_types_to_excludemodelop_typesmodel_has_external_dataoverrides_helperrq   op_types_to_quantize_setnodes_to_exclude_setru   final_extra_optionscalib_extra_options_keysrw   keycalib_extra_options
onnx_opsetoverrides_have_opset21_typesrc   s$                                      @r'   get_qdq_configr      s   f !!9#4#45I!1!12HH k4??33 	__[UC 
 uH#11G,-R
 {{..$$77DD&*# / =Qs#78VZ4>?OQU4V4V3/0\_\a   #<T(T99,,$%%*:5$*G*G $$TYY/LL& ! +'A3+%) 0 9 9 ; $
  >V
=UktSY]aoYo)C##D))=U 	 
 	""#67 _!3!3__JB!1'**jGWGgGgGi*j'j$m+{m/KOk6: 23 ""=1) __'$8 d8CVCVWjCk>l23!"9"eU^^=MQe=e3) %
s   8L-	L-c                  P   ^  \ rS rSr\R
                  SSSSSSS4U 4S jjrSrU =r$ )DynamicQuantConfigi  NFc	           
     L   > [         T	U ]  UUUUUUUS9  U=(       d    0 U l        g)a  
This is a class for dynamic Quant Configuration

Args:
    extra_options: key value pair dictionary for various options in different case. Current used:
        extra.Sigmoid.nnapi = True/False  (Default is False)
        ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
        WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
        EnableSubgraph = True/False :
            Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will
            support more in the future.
        ForceQuantizeNoInputCheck = True/False :
            By default, some latent operators like maxpool, transpose, do not quantize if their input is not
            quantized already. Setting to True to force such operator always quantize input and so generate
            quantized output. Also the True behavior could be disabled per node using the nodes_to_exclude.
        MatMulConstBOnly = True/False:
            Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
    execution_provider : A enum indicates the Execution Provider such as: CPU, TRT, NNAPI, SNE, etc.

Raises:
    ValueError: Raise ValueError if execution provider is unknown
)r   r   r    r!   r#   r$   r%   N)r7   r(   r<   )
r&   r!   r   r#   r$   r   r    r%   r<   r=   s
            r'   r(   DynamicQuantConfig.__init__  s<    B 	!5#%#/-%= 	 	
 +0br*   )r<   )	r+   r,   r-   r.   r   r0   r(   r1   rA   rB   s   @r'   r   r     s+     OO!!&*1 *1r*   r   c                   U[         R                  :X  a  U[         R                  :X  a  [        S5      eU[         R                  :w  a#  U[         R                  :X  a  [        SU S35      eU[         R                  :X  a#  U[         R                  :w  a  [        SU S35      e[         R
                  [         R                  /nX;   d  X#;   a  U [        R                  :w  a  [        S5      eU[         R                  :X  aA  U[         R                  :X  a,  U [        R                  :w  a  [        R                  " S5        g g g g )NzrONNXRuntime quantization doesn't support data format:activation_type=QuantType.QInt8, weight_type=QuantType.QUInt8zFONNXRuntime quantization doesn't support data format: activation_type=z@ !=QuantType.QFLOAT8E4M3FN, weight_type=QuantType.QFLOAT8E4M3FN.zkONNXRuntime quantization doesn't support data format: activation_type=QuantType.QFLOAT8E4M3FN, weight_type=z!=QuantType.QFLOAT8E4M3FNz8Only QuantFormat.QDQ supports 16-bit quantization types.zvPlease use QuantFormat.QDQ for activation type QInt8 and weight type QInt8. Or it will lead to bad performance on x64.)r   r0   r/   
ValueErrorQFLOAT8E4M3FNre   rf   r   r@   loggingwarning)r:   r"   r!   r   s       r'   check_static_quant_argumentsr     s,   )//)kY=M=M.ML
 	
 )111kYE\E\6\TUdTe fN N
 	

 )111kYE\E\6\&-'@B
 	

 !!9#4#45I$(@lVaVeVeFeSTT)//)kY__.LQ]alapapQp9	
 Rq.L)r*   c                  ^ U[         R                  :X  d  U[         R                  :X  a  U[        R                  :w  a  [	        S5      eU=(       d    0 nU
=(       d    / n
U	=(       d    / n	U=(       d    / n[
        R                  nU(       a  [        U5      S:X  aS  [        [        R                  " 5       5      n[        [        R                  " 5       5      n[        [        UU-   5      5      n[        U [        R                  5      (       a  [!        U 5      O[#        [%        U 5      5      n['        U5      nU(       d  [(        R*                  " S5        / SQnU VVs0 s H   u  nnUU;   d  M  UUR-                  U5      _M"     nnnUR-                  SS5      (       Ga]  SSKn UR1                  S5        SSKJn  U4S jnUR<                  R>                   Vs/ s H  nUR@                  PM     nnU" 5       nU" U UU5      nAURC                  UR-                  SS5      UR-                  SS5      5      n[D        RF                  " SS9n [%        U R@                  5      RI                  S5      RK                  5       n URM                  U 5        U
RO                  URP                  R<                  R>                   Vs/ s H!  nUR@                  U;  d  M  UR@                  PM#     sn5        [#        [%        U 5      5      n[S        UU5      n!U!ULn"U"(       a  U!n[D        RF                  " SS9 n#U"(       a  [T        RV                  " U5      n [        U [        R                  5      (       a@  [%        U#5      RI                  S5      RK                  5       n$[        RX                  " U U$SS9  U$n [[        [%        U 5      U[%        U#5      RI                  S5      RK                  5       UUUUS9n%UR-                  SS5      n&U&(       aa  [        T5      n'U'U&-  S:w  a  [	        SU' SU& S35      e[]        SU'U&5       H)  n(U(U&-   n)TR_                  U(U)S9  U%Ra                  T5        M+     OU%Ra                  T5        U%Rc                  5       n*[        U*[d        5      (       d$  [g        S[i        U*5       S[i        U%5       S	35      eA%SSS5        [k        X7U5        U[l        Rn                  L a  [q        UUUUSUUW*U	U
UU5      n+O[s        UUUUUW*U	U
UU5
      n+U+Ru                  5         U+RP                  Rw                  X5        U(       d  [(        R*                  " S5        UR-                  SS5      (       a  W Ry                  5         ggs  snnf ! [2         a*  n[(        R4                  " U S	35        [7        S
5      UeSnAff = fs  snf s  snf ! , (       d  f       GN= f) aV(  
Given an onnx model and calibration data reader, create a quantized onnx model and save it into a file
It is recommended to use QuantFormat.QDQ format from 1.11 with activation_type = QuantType.QInt8 and weight_type
= QuantType.QInt8. If model is targeted to GPU/TRT, symmetric activation and weight are required. If model is
targeted to CPU, asymmetric activation and symmetric weight are recommended for balance of performance and
accuracy.

Args:

    model_input: file path of model or ModelProto to quantize
    model_output: file path of quantized model
    calibration_data_reader: a calibration data reader. It
        enumerates calibration data and generates inputs for the
        original model.
    quant_format: QuantFormat{QOperator, QDQ}.
        QOperator format quantizes the model with quantized operators directly.
        QDQ format quantize the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor.
    activation_type:
        quantization data type of activation. Please refer to
        https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
    calibrate_method:
        Current calibration methods supported are MinMax and Entropy.
            Please use CalibrationMethod.MinMax or CalibrationMethod.Entropy as options.
    op_types_to_quantize:
            specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
            It quantizes all supported operators by default.
    per_channel: quantize weights per channel
    reduce_range:
        quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
        especially for per-channel mode
    weight_type:
        quantization data type of weight. Please refer to
        https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
    nodes_to_quantize:
        List of nodes names to quantize. When this list is not None only the nodes in this list
        are quantized.
        example:
        [
            'Conv__224',
            'Conv__252'
        ]
    nodes_to_exclude:
        List of nodes names to exclude. The nodes in this list will be excluded from quantization
        when it is not None.
    use_external_data_format: option used for large size (>2GB) model. Set to False by default.
    calibration_providers: Execution providers to run the session during calibration. Default is None which uses
        [ "CPUExecutionProvider" ]
    extra_options:
        key value pair dictionary for various options in different case. Current used:
            extra.Sigmoid.nnapi = True/False  (Default is False)
            ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
            WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
            EnableSubgraph = True/False : Default is False. If enabled, subgraph will be quantized.
                                          Dyanmic mode currently is supported. Will support more in the future.
            ForceQuantizeNoInputCheck = True/False :
                By default, some latent operators like maxpool, transpose, do not quantize if their input is not
                quantized already. Setting to True to force such operator always quantize input and so generate
                quantized output. Also, the True behavior could be disabled per node using the nodes_to_exclude.
            MatMulConstBOnly = True/False:
                Default is False for static mode. If enabled, only MatMul with const B will be quantized.
            AddQDQPairToWeight = True/False :
                Default is False which quantizes floating-point weight and feeds it to solely inserted
                DeQuantizeLinear node. If True, it remains floating-point weight and inserts both
                QuantizeLinear/DeQuantizeLinear nodes to weight.
            OpTypesToExcludeOutputQuantization = list of op type :
                Default is []. If any op type is specified, it won't quantize the output of ops with this
                specific op types.
            DedicatedQDQPair = True/False :
                Default is False. When inserting QDQ pair, multiple nodes can share a single QDQ pair as their
                inputs. If True, it will create identical and dedicated QDQ pair for each node.
            QDQOpTypePerChannelSupportToAxis = dictionary :
                Default is {}. Set channel axis for specific op type, for example: {'MatMul': 1}, and it's
                effective only when per channel quantization is supported and per_channel is True. If specific
                op type supports per channel quantization but not explicitly specified with channel axis,
                default channel axis will be used.
            CalibTensorRangeSymmetric = True/False :
                Default is False. If enabled, the final range of tensor during calibration will be explicitly
                set to symmetric to central point "0".
            CalibStridedMinMax = Optional[int] :
                Default is None. If set to an integer, during calculation of the min-max, only stride amount of
                data will be used and then all results will be merged in the end.
            CalibMovingAverage = True/False :
                Default is False. If enabled, the moving average of the minimum and maximum values will be
                computed when the calibration method selected is MinMax.
            CalibMovingAverageConstant = float :
                Default is 0.01. Constant smoothing factor to use when computing the moving average of the
                minimum and maximum values. Effective only when the calibration method selected is MinMax and
                when CalibMovingAverage is set to True.
            CalibMaxIntermediateOutputs = Optional[int] :
                Default is None. If set to an integer, during calculation of the min-max range of the tensors
                it will load at max value number of outputs before computing and merging the range. This will
                produce the same result as all computing with None, but is more memory efficient.
            SmoothQuant = True/False :
                Default is False. If enabled, SmoothQuant algorithm will be applied before quantization to do
                fake input channel quantization.
            SmoothQuantAlpha = float :
                Default is 0.5. It only works if SmoothQuant is True. It controls the difficulty of weight
                and activation quantization. A larger alpha value could be used on models with more significant
                activation outliers to migrate more quantization difficulty to weights.
            SmoothQuantFolding = True/False :
                Default is True. It only works if SmoothQuant is True. If enabled, inserted Mul ops during
                SmoothQuant will be folded into the previous op if the previous op is foldable.
            UseQDQContribOps = True/False :
                Default is False. If enabled, the inserted QuantizeLinear and DequantizeLinear ops will have the
                `com.microsoft` domain, which forces use of ONNX Runtime's QuantizeLinear and DequantizeLinear
                contrib op implementations. The contrib op implementations may support features not standardized
                into the ONNX specification (e.g., 16-bit quantization types).
            MinimumRealRange = float|None :
                Default is None. If set to a floating-point value, the calculation of the quantization parameters
                (i.e., scale and zero point) will enforce a minimum range between rmin and rmax. If (rmax - rmin)
                is less than the specified minimum range, rmax will be set to rmin + MinimumRealRange. This is
                necessary for EPs like QNN that require a minimum floating-point range when determining
                quantization parameters.
            TensorQuantOverrides = dictionary :
                Default is {}. Set tensor quantization overrides. The key is a tensor name and the value is a
                list of dictionaries. For per-tensor quantization, the list contains a single dictionary. For
                per-channel quantization, the list contains a dictionary for each channel in the tensor.
                Each dictionary contains optional overrides with the following keys and values.
                    'quant_type' = QuantType : The tensor's quantization data type.
                    'scale' =  Float         : The scale value to use. Must also specify `zero_point` if set.
                    'zero_point' = Int       : The zero-point value to use. Must also specify `scale` is set.
                    'symmetric' = Bool       : If the tensor should use symmetric quantization. Invalid if also
                                               set `scale` or `zero_point`.
                    'reduce_range' = Bool    : If the quantization range should be reduced. Invalid if also
                                               set `scale` or `zero_point`.
                    'rmax' = Float           : Override the maximum real tensor value in calibration data.
                                               Invalid if also set `scale` or `zero_point`.
                    'rmin' = Float           : Override the minimum real tensor value in calibration data.
                                               Invalid if also set `scale` or `zero_point`.
            QDQKeepRemovableActivations = True/False:
                Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
                will be explicitly represented in the QDQ model. If false, these activations are automatically
                removed if activations are asymmetrically quantized. Keeping these activations is necessary if
                optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
                operators from the model.
            QDQDisableWeightAdjustForInt32Bias = True/False:
                Default is False. If true, QDQ quantizer will not adjust the weight's scale when the bias
                has a scale (input_scale * weight_scale) that is too small.
zIOnly Distribution calibration method is supported for float quantization.r   Please consider to run pre-processing before quantization. Refer to example: https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification/cpu/ReadMe.md ))rO   rN   )rQ   rP   )rS   rR   )rU   rT   )rW   rV   SmoothQuantFNz/neural_compressor.adaptor.ox_utils.smooth_quant.zLneural-compressor is not correctly installed. Please check your environment.)ORTSmoothQuantc               3  X   >#    [         R                  " T5      n U  H	  nUS 4v   M     g 7fra   )rn   ro   )data_readerdatar8   s     r'   inc_dataloader'quantize_static.<locals>.inc_dataloader  s)     --(?@K#Dj  $s   '*SmoothQuantAlphag      ?SmoothQuantFoldingTz
ort.quant.)prefixzsq_model.onnxzmodel_input.onnx)save_as_external_datazaugmented_model.onnx)augmented_model_pathr9   r%   	providersr<   CalibStridedMinMaxzTotal data size (z#) is not divisible by stride size (z).)start_index	end_indexzUnexpected type z" for tensors_range and calibrator=zPlease consider pre-processing before quantization. See https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification/cpu/ReadMe.md )=r   r   r	   Distributionr   r   
QLinearOpslenrt   r   keysr   rm   ri   rj   rk   r   r   r   r   r   r   r{   	importlibimport_module	ExceptionerrorRuntimeError/neural_compressor.adaptor.ox_utils.smooth_quantr   rp   ru   rw   	transformtempfileTemporaryDirectoryjoinpathas_posixsaveextendr   r   rn   ro   
save_modelr   range	set_rangecollect_datacompute_datar
   	TypeErrortyper   r   	QOperatorr   r   quantize_modelsave_model_to_filecleanup),r   model_outputr8   r:   r   r   r    r"   r!   r#   r$   r%   r9   r;   r<   modeq_linear_opsqdq_opsr   pre_processedr   rw   r   r   r   er   r   i
orig_nodes
dataloadersqsq_pathupdated_modelis_model_updatedquant_tmp_diroutput_path
calibratorstridetotal_data_sizestartr   tensors_range	quantizers,     `                                         r'   quantize_staticr     s^   x )111[ID[D[5[0===hii!'RM'-2)/R/52&&D3';#<#A.3356{'')*#Cw(>$?@ k4??33 	/{;(k):; 
 9?M	
  9Q8P$TX\iTi$]t$$8P   ..	v##$UV
 	S	!
 ',kk&6&67&6aff&6
7#%
K\B]../A3GIZIZ[oquIvw--\B7<<(11/BKKM

;1B1B1G1G d1GA166YcKc1G de+D,=>(<M$E1		$	$L	9]--.Kk4??33}-667IJSSUKOO&*
 &K& !%m!4!=!=>T!U!^!^!`-%=+-

 ""#7>!"9:O'1, #4_4EEhiohppr!sttq/6:!FN	'11ey1Y''(?@ ;
 ##$;<"//1-55"4#6"77YZ^_iZjYkklm  S 
:V !L{,,,! 
	 ! 
	 OO&&|N	
 .. /o  	vMMQCq'"mntuu	v 8 !e 
:	9sC   U<.U<$V V99V>V>)FW
V6%V11V6
Wc
                z   U	=(       d    0 n	U=(       d    / nU=(       d    / nU=(       d    / n[         R                  n
U(       a  [        U5      S:X  a  [        [        R
                  " 5       5      n[        U [        R                  5      (       a  [        U 5      O[        [        U 5      5      n[        U5      nU(       d  [        R                  " S5        SU	;  a  SU	S'   [        X5      n[!        UUUU
SU["        R$                  SUUUU	5      nUR'                  5         UR(                  R+                  X5        g)a3	  Given an onnx model, create a quantized onnx model and save it into a file

Args:
    model_input: file path of model or ModelProto to quantize
    model_output: file path of quantized model
    op_types_to_quantize:
        specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
        It quantizes all supported operators by default.
    per_channel: quantize weights per channel
    reduce_range:
        quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine,
        especially for per-channel mode
    weight_type:
        quantization data type of weight. Please refer to
        https://onnxruntime.ai/docs/performance/quantization.html for more details on data type selection
    nodes_to_quantize:
        List of nodes names to quantize. When this list is not None only the nodes in this list
        are quantized.
        example:
        [
            'Conv__224',
            'Conv__252'
        ]
    nodes_to_exclude:
        List of nodes names to exclude. The nodes in this list will be excluded from quantization
        when it is not None.
    use_external_data_format: option used for large size (>2GB) model. Set to False by default.
    extra_options:
        key value pair dictionary for various options in different case. Current used:
            extra.Sigmoid.nnapi = True/False  (Default is False)
            ActivationSymmetric = True/False: symmetrize calibration data for activations (default is False).
            WeightSymmetric = True/False: symmetrize calibration data for weights (default is True).
            EnableSubgraph = True/False :
                Default is False. If enabled, subgraph will be quantized. Dynamic mode currently is supported. Will
                support more in the future.
            ForceQuantizeNoInputCheck = True/False :
                By default, some latent operators like maxpool, transpose, do not quantize if their input is not
                quantized already. Setting to True to force such operator always quantize input and so generate
                quantized output. Also the True behavior could be disabled per node using the nodes_to_exclude.
            MatMulConstBOnly = True/False:
                Default is True for dynamic mode. If enabled, only MatMul with const B will be quantized.
r   r   MatMulConstBOnlyTFN)r   
IntegerOpsr   rt   r   r   ri   rj   rk   r   r   r   r   r   r   r   r   r   r/   r   r   r   )r   r   r   r   r    r!   r#   r$   r%   r<   r   r   r   r   s                 r'   quantize_dynamicr     s"   l "'RM'-2)/R/52&&D3';#<#A#$6$;$;$=> k4??33 	/{;(k):; 
 9?M	
 .,0() 4EI OO&&|Nr*   c                j   [        U[        5      (       a  [        U UUR                  UR                  UR
                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9  g[        U[         5      (       ac  [#        U UUR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9
  gSSKJnJn  [        X$5      (       aj  [        U [*        R,                  5      (       a  U O[*        R.                  " U 5      nU" XRS9nUR1                  5         UR2                  R5                  US5        g[7        S5      e)	a  Quantize a model with QuantConfig.

Args:
    model_input (str | Path | ModelProto): Path to the model or ModelProto to quantize.
    model_output (str | Path): Path to save the quantized model.
    quant_config (QuantConfig | WeightOnlyQuantConfig): Quantization Configuration.
)r9   r:   r"   r!   r   r#   r$   r   r    r%   r;   r<   )r!   r   r#   r$   r   r    r%   r<   r   )MatMulNBitsQuantizerWeightOnlyQuantConfig)algo_configTztInvalid quantization config type, it must be either StaticQuantConfig, DynamicQuantConfig, or WeightOnlyQuantConfig.N)ri   r4   r   r8   r9   r:   r"   r!   r   r#   r$   r   r    r%   r;   r<   r   r   matmul_nbits_quantizerr   r   rj   rk   loadprocessr   r   r   )r   r   quant_configr   r   r   quants          r'   quantizer     sj    , 12200)::%22(88$00!-!B!B*<<)::$00%22%1%J%J"."D"D&44	
$ 
L"4	5	5$00!-!B!B*<<)::$00%22%1%J%J&44	
 	Xl::#-k4??#K#KKQUQZQZ[fQgE(IEMMOKK**<>@ r*   )r   str | Path | onnx.ModelProtor8   r   r   zdict[str, Any] | Noner   boolr   zbool | Noner   r   r    r   r   r   r   zfloat | Noner   z&dict[str, list[dict[str, Any]]] | Noner;   list[str] | Noner   r   r$   zDlist[str] | Callable[[onnx.ModelProto, onnx.NodeProto], bool] | Noner<   zdict | Nonereturnr4   )r:   r   r"   r   r!   r   )r   r   r   
str | Pathr8   r   )r   r   r   r   )r   r   r   r   r   r   )0
__future__r   rn   r   r   collections.abcr   pathlibr   typingr   rj   	calibrater   r	   r
   r   onnx_quantizerr   qdq_quantizerr   quant_utilsr   r   r   r   r   r   r   r   registryr   r   r   r   r   r   r4   r?   r/   r0   r   r   r   r@   r   r   r   r2   r*   r'   <module>r     s   #    $    _ _ ) '	 	 	 J I >4A 4AnJ1 J1` '--,0$$!&$(',#'EI.2-1]a!%#j-j2j *	j j "j j j !%j !j Cj ,j +j  [!j" #j$ %jZ+1 +1\
D OO"&--x-xx 3x|	 "cO-cOcOL9-99 9r*   