U
    hY                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlZd dlZd dlm	Z	m
Z
mZ d dlmZ d dlmZmZmZmZ d dlmZ d dlmZmZmZ zd d	lmZ W n ek
r   dZY nX d
ZdZdZdZdZ dZ!dZ"dZ#dZ$i Z%dd e&e
D Z'G dd deZ(G dd deZ)G dd deZ*G dd deZ+ej
j,e-dej
j.e-dej
j/eiZ0djdd Z1dkd"d#Z2d$d% Z3dld&d'Z4dmd(d)Z5dnd*d+Z6G d,d- d-Z7G d.d/ d/Z8G d0d1 d1Z9d2d3 Z:d4d5 Z;d6d7 Z<d8d9 Z=ee>ed:d;d<Z?d=d> Z@d?d@ ZAdodBdCZBedDdEdFZCeedGdHdIZDe	dJdKdLZEe	eFdMdNdOZGe	dJdPdQZHe	eFdMdRdSZIee	dTdUdVZJe	e	dMdWdXZKe
ejLdYdZd[ZMe>e>d\d]d^ZNe>e>d\d_d`ZOe>dadbdcZPe>dadddeZQe>dadfdgZRe>dadhdiZSdS )p    N)Enum)Path)
ModelProtoTensorProtoexternal_data_helper)onnx_pb)
make_graph
make_model	make_nodemake_tensor_value_info)ReferenceEvaluator)GraphOptimizationLevelInferenceSessionSessionOptions)float8e4m3fnzonnx.quantizez0.1.0zai.onnxzcom.microsoftQuantizeLinearZ_QuantizeLinear_InputZDequantizeLinearZ_DequantizeLinear_OutputZ
_quantizedc                 C   s(   i | ] }t tt|trtt||qS  )
isinstancegetattrr   int).0kr   r   H/tmp/pip-unpacked-wheel-socb9apf/onnxruntime/quantization/quant_utils.py
<dictcomp>"   s       r   c                   @   s(   e Zd ZdZdZdd Zedd ZdS )QuantizationModer      c                 C   s   | j S Nnameselfr   r   r   __str__-   s    zQuantizationMode.__str__c                 C   s*   z
t |  W S  tk
r$   t Y nX d S r   )r   KeyError
ValueError)moder   r   r   from_string0   s    
zQuantizationMode.from_stringN)__name__
__module____qualname__Z
IntegerOpsZ
QLinearOpsr!   staticmethodr%   r   r   r   r   r   )   s
   r   c                   @   s(   e Zd ZdZdZdd Zedd ZdS )QuantizedValueTyper   r   c                 C   s   | j S r   r   r   r   r   r   r!   <   s    zQuantizedValueType.__str__c                 C   s*   z
t |  W S  tk
r$   t Y nX d S r   )r*   r"   r#   )vr   r   r   r%   ?   s    
zQuantizedValueType.from_stringN)r&   r'   r(   ZInputZInitializerr!   r)   r%   r   r   r   r   r*   8   s
   r*   c                   @   s8   e Zd ZdZdZdZdd Zedd Ze	dd	 Z
d
S )	QuantTyper   r      c                 C   s   | j S r   r   r   r   r   r   r!   L   s    zQuantType.__str__c                 C   s*   z
t |  W S  tk
r$   t Y nX d S r   )r,   r"   r#   )tr   r   r   r%   O   s    
zQuantType.from_stringc                 C   sD   | t jkrtjS | t jkr tjS | t jkr0tjS td| dd S )NzUnexpected value qtype=.)	r,   QInt8r   INT8QUInt8UINT8QFLOAT8E4M3FNFLOAT8E4M3FNr#   r   r   r   r   tensor_typeV   s    


zQuantType.tensor_typeN)r&   r'   r(   r0   r2   r4   r!   r)   r%   propertyr6   r   r   r   r   r,   G   s   
r,   c                   @   s(   e Zd ZdZdZdd Zedd ZdS )QuantFormatr   r   c                 C   s   | j S r   r   r   r   r   r   r!   e   s    zQuantFormat.__str__c                 C   s*   z
t |  W S  tk
r$   t Y nX d S r   )r8   r"   r#   )formatr   r   r   r%   h   s    
zQuantFormat.from_stringN)r&   r'   r(   Z	QOperatorZQDQr!   r)   r%   r   r   r   r   r8   a   s
   r8   Zint8uint8c                 C   sx  | t kstd|  d| tjjtjjtjjtjjfkr|dkrPtd|dt	t
tdg dgtjd| g dgdtd	d
ddgdggdtd
tjd tdtjd gtd| d g}t|}|d |tj|tjdd S t |  }t|tjkrdnd|d krdn|}	t|tjkr$dnd|d kr4dn|}
t|tj|  | }tj||	|
|d ||S d S )NUnexpected data type . requested. Only INT8 and UINT8 are supported.r   z2zero_point is expected to be null for float 8 not r/   Constant
zero_point)valuer   XscaleYZqu)r@   rA         )out)ONNX_TYPE_TO_NP_TYPEAssertionError
onnx_protor   r5   ZFLOAT8E4M3FNUZZ
FLOAT8E5M2ZFLOAT8E5M2FNUZNotImplementedErrorr	   r   r
   onnxhelperZmake_tensorr   FLOATr   runastypenumpyfloat32maxr:   minasarrayroundZclip)qTypeZarrrA   r>   lowhighZ
onnx_modelrefdtypeZcliplowZcliphighZarr_fp32r   r   r   quantize_nparrayw   sD       &&(r[   Fc                 C   s   |dks|dk r$t d| d| t| d} t|d}|rZtt| t|}| } |
 }||  t||  }|ttjjk rd}d}nt	|| |  }||gS )a  Calculate the scale s and zero point z for the quantization relation
    r = s(q-z), where r are the original values and q are the corresponding
    quantized values.

    r and z are calculated such that every value within [rmin,rmax] has an
    approximate representation within [qmin,qmax]. In addition, qmin <= z <=
    qmax is enforced. If the symmetric flag is set to True, the interval
    [rmin,rmax] is symmetrized to [-absmax, +absmax], where
    absmax = max(abs(rmin), abs(rmax)).

    :parameter rmin: minimum value of r
    :parameter rmax: maximum value of r
    :parameter qmin: minimum value representable by the target quantization data type
    :parameter qmax: maximum value representable by the target quantization data type
    :return: zero and scale [z, s]

    r   zBqmin and qmax must meet requirement: qmin <= 0 <= qmax while qmin:z, qmmax:      ?)
r#   rS   rR   absfloatrP   ZfinforQ   ZtinyrU   )rminrmaxqminqmax	symmetricZabsmaxrA   r>   r   r   r   compute_scale_zp   s    

rd   c                    s   | t krj| tjkrRddlm   fddtddD }tjdd |D tjd}nt	d|  d	|t | < t
t |  }d}|| }||gS )
ar  Calculate the scale s for a float8 type (E4M3FN).
    The function assumes the coefficient distribution and the float 8
    distribution are similar to two gaussian laws.

    :return: zero and scale [z, s]

    More details in notebook `quantization_fp8.ipynb
    <https://github.com/microsoft/onnxruntime/blob/main/docs/python/notebooks/quantization_fp8.ipynb>`_.
    r   float8e4m3_to_float32c                    s   g | ]} |qS r   r   )r   ire   r   r   
<listcomp>   s     z+compute_scale_zp_float8.<locals>.<listcomp>   c                 S   s$   g | ]}t |st |s|qS r   )rP   isnanisinf)r   fr   r   r   rh      s     
 
 )rZ   zQuantization to element_type=z not implemented.)FLOAT8_DISTRIBUTIONSr   r5   Zonnx.numpy_helperrf   rangerP   arrayrQ   r#   std)Zelement_typerp   Z
all_valuesvaluesZstd_f8ZzerorA   r   re   r   compute_scale_zp_float8   s    

 rr   c              
   C   sH  d}d}d}d}t | r(t| }t| }|tjkr|r>tdt| }t||\}}t	|t
| ||}	t|	tj d@ dkrt
| }
td|
  d|
  d|	  d|	  d	|||||	fS |tjtjfkr4t | rt|||d	\}}t|||||\}}t	|t
| ||}	|||||	fS td
| ddS )a  
    :param data: data to quantize
    :param qType: data type to quantize to. Supported types UINT8 and INT8
    :param symmetric: whether symmetric quantization is used or not. This is applied to INT8.
    :return: minimum, maximum, zero point, scale, and quantized weights

    To pack weights, we compute a linear transformation

    - when data `type == uint8` mode, from `[rmin, rmax]` -> :math:`[0, 2^{b-1}]` and
    - when data `type == int8`, from `[-m , m]` -> :math:`[-(2^{b-1}-1), 2^{b-1}-1]` where
        `m = max(abs(rmin), abs(rmax))`

    and add necessary intermediate nodes to trasnform quantized weight to full weight using the equation

    :math:`r = S(q-z)`, where

    - *r*: real original value
    - *q*: quantized value
    - *S*: scale
    - *z*: zero point
    r   r\   z1Unsupported option reduce_range=True for float 8.rE   z+One of the quantized value is NaN data in [z, z], quantized_data in [z].rc   zUnexpected value for qType=r/   N)lenrS   rR   r   r5   RuntimeErrorrP   rp   rr   r[   rT   anyrO   r:   Zravelr1   r3   get_qmin_qmax_for_qTyperd   r#   )datarV   rc   reduce_ranger_   r`   r>   rA   rp   quantized_dataZnp_datara   rb   r   r   r   quantize_data   s4    


,
r{   c                 C   s   | t jjkr|rdnd\}}nZ| t jjkrR|r@|r6dnd\}}qx|rHdnd\}}n&| t jjkrhtdntd|  d||fS )	z
    Return qmin and qmax, the minimum and maximum value representable by the given qType
    :parameter qType: onnx.onnx_pb.TensorProto.UINT8 or onnx.onnx_pb.TensorProto.UINT8
    :return: qmin, qmax
    )r   rE   )r   rD   )i@   )rC   rE   )irE   z;This function is not implemented for float 8 as not needed.r;   r<   )rI   r   r3   r1   r5   rJ   r#   rV   ry   rc   ra   rb   r   r   r   rw     s    
rw   c                 C   s   t | ||d\}}|| S )z
    Helper function to get the quantization range for a type.
        parameter qType: quantization type.
        return: quantization range.
    rs   )rw   r}   r   r   r   get_qrange_for_qType,  s    r~   c                   @   s    e Zd ZdZg g dfddZdS )QuantizedInitializerzJ
    Represents a linearly quantized weight input from ONNX operators
    Nc
           
      C   s:   || _ || _|| _|| _|| _|| _|| _|| _|	| _d S r   )	r   initializerrminsrmaxszero_pointsscalesrx   rz   axis)
r    r   r   r   r   r   r   rx   rz   r   r   r   r   __init__;  s    zQuantizedInitializer.__init__r&   r'   r(   __doc__r   r   r   r   r   r   6  s
   r   c                   @   s   e Zd ZdZdddZdS )QuantizedValuezI
    Represents a linearly quantized value (input\output\intializer)
    Nc	           	      C   s4   || _ || _|| _|| _|| _|| _|| _|| _d S r   )original_nameZq_name
scale_nameZzp_nameZ
value_typer   	node_type
node_qtype)	r    r   Znew_quantized_namer   Zzero_point_nameZquantized_value_typer   r   r   r   r   r   r   Z  s    zQuantizedValue.__init__)NNNr   r   r   r   r   r   U  s
      r   c                   @   s   e Zd ZdZdd ZdS )BiasToQuantizez+
    Represents a bias to be quantized
    c                 C   s   || _ || _|| _d S r   )	bias_name
input_nameweight_name)r    r   r   r   r   r   r   r   t  s    zBiasToQuantize.__init__Nr   r   r   r   r   r   o  s   r   c                 C   s   | j dkrtd| j d| j dkr.| j}n| j dkr@| j}n| j dkrR| j}n| j dkrd| j}n| j dkrv| j}nt| j d	kr| j}nb| j d
kr| j	}nP| j dkr| j
}n>| j dkr| j}n,| j dkr| j}ntd| j d| j  d| j|iS )z
    Convert attribute to kwarg format for use with onnx.helper.make_node.
        :parameter attribute: attribute in AttributeProto format.
        :return: attribute in {key: value} format.
    r   z
attribute z does not have type specified.r   r-                     	   
   z has unsupported type r/   )typer#   r   rl   rg   sr.   gZfloatsZintsstringsZtensorsZgraphs)	attributer?   r   r   r   attribute_to_kwargz  s0    










r   c                    s*    fdd|D }t |dkr&|d S dS )z
    Helper function to find item by name in a list.
        parameter item_name: name of the item.
        parameter item_list: list of items.
        return: item if found. None otherwise.
    c                    s   g | ]}|j  kr|qS r   r   )r   item	item_namer   r   rh     s     
 z find_by_name.<locals>.<listcomp>r   N)rt   )r   Z	item_listitemsr   r   r   find_by_name  s    r   c                 C   s,   d}t dt|D ]}|| | kr|}q|S )zC
    Helper function to return index of an item in a node list
    r   )rn   rt   )Z	elem_nameZ	elem_listZelem_idxrg   r   r   r   get_elem_index  s
    r   c                 C   s   t jd| |g|S )z
    Helper function to create a Mul node.
        parameter inputs: list of input names.
        parameter output: output name.
        parameter name: name of the node.
        return: Mul node in NodeProto format.
    ZMul)rK   rL   r
   )inputsoutputr   r   r   r   get_mul_node  s    r   )filename
identifierreturnc                 C   s   | j | j| | j S )zp
    Helper function to generate a identifiable filepath by concatenating the given identifier as a suffix.
    )parentjoinpathstemsuffix)r   r   r   r   r   generate_identified_filename  s    r   c                 C   s   dd l }dd lm} dd l}|j|jd td t|  td t| |j| |dd |d |	d |
d	 |  d S )
Nr   )	thresholdz
Histogram:zHistogram Edges:T)fillzTensor valueZCountszTensor value V.S. Counts)sysZmatplotlib.pyplotZpyplotrP   Zset_printoptionsmaxsizeprintZstairsZxlabelZylabeltitleshow)histZ
hist_edgesr   ZpltrP   r   r   r   
apply_plot  s    


r   c              	   C   sV  ddl }ddl}ddlm  m  m} ddlm  m  m} t	d|   t
dd}|||  W 5 Q R X |d}g }t|  D ]t}| | }	ttt|	d t|	d }
||}||
}|| ||| ||| ||}|| q||t| |D ]}|| q| }|| ||| ||}| | |! }t
dd	}|| W 5 Q R X t"j#$d
ddkr|j%|d}|& }t'|D ],}|(|}t	|)  t	|*  qt
dd\}t|  D ]H}| | }
|d ttt|
d t|
d  }|| |d qW 5 Q R X dS )z>
    Helper function to write calibration table to files.
    r   Nzcalibration cache: zcalibration.jsonwi   r   zcalibration.flatbufferswbZQUANTIZATION_DEBUG)r   1zcalibration.cache 
)+jsonflatbuffersZ5onnxruntime.quantization.CalTableFlatBuffers.KeyValueZquantizationZCalTableFlatBuffersKeyValueZ5onnxruntime.quantization.CalTableFlatBuffers.TrtTableTrtTablelogginginfoopenwritedumpsZBuildersortedkeysstrrR   r]   ZCreateStringZKeyValueStartZKeyValueAddKeyZKeyValueAddValueZKeyValueEndappendZTrtTableStartDictVectorrt   ZPrependUOffsetTRelativeZ	EndVectorZTrtTableStartZTrtTableAddDictZTrtTableEndZFinishZOutputosenvirongetZGetRootAsTrtTableZ
DictLengthrn   DictZKeyValue)Zcalibration_cacher   r   r   r   fileZbuilderZkey_value_listkeyrq   r?   Zflat_keyZ
flat_value	key_valueZ	main_dictZ	cal_tablebufZdict_lenrg   r   r   r   r   write_calibration_table  sV    








&
r   -C6?c                 C   s   | dk tj}| dk tj}| }| j| }|s:dS |t| t| }|dk shtd|||f |  tj}||| | |  7 }|dk dkst|S )a~  Given a discrete distribution (may have not been normalized to 1),
    smooth it by replacing zeros with eps multiplied by a scaling factor
    and taking the corresponding amount off the non-zero values.
    Ref: http://web.engr.illinois.edu/~hanj/cs412/bk3/KL-divergence.pdf
         https://github.com//apache/incubator-mxnet/blob/master/python/mxnet/contrib/quantization.py
    r   r   r\   z"n_zeros=%d, n_nonzeros=%d, eps1=%f)rO   rP   rQ   sumsizer^   rH   )pZepsZis_zerosZis_nonzerosZn_zerosZ
n_nonzerosZeps1r   r   r   r   smooth_distribution  s     
r   )
model_pathc                 C   s4   t j|  dd}|jjD ]}t|r dS qdS )NF)Zload_external_dataT)rK   loadas_posixgraphr   r   Zuses_external_data)r   modelZ
intializerr   r   r   model_has_external_data9  s
    
r   )r   opt_model_pathc                 C   sF   t  }| |_tj|_i }dg|d< t|  |fddgi|}dS )z
        Generate model that applies graph optimization (constant folding, etc.)
        parameter model_path: path to the original onnx model
        parameter opt_model_path: path to the optimized onnx model
    :return: optimized onnx model
    ZConstantSharingZdisabled_optimizersZ	providersZCPUExecutionProviderN)r   r   Zoptimized_model_filepathr   ZORT_ENABLE_BASICZgraph_optimization_levelr   )r   r   Zsess_optionkwargs_r   r   r   optimize_modelA  s    

r   )r   c                 C   s>   ddi}| j r,| j D ]}||j|ji qtj| | dS )z>Tag the model that it went through quantization pre-processingonnx.quant.pre_processonnxruntime.quantNmetadata_propsupdater   r?   rK   rL   Zset_model_props)r   r   propr   r   r   add_pre_process_metadataQ  s
    
r   )r   r   c                 C   s0   | j r,| j D ]}|jdkr|jdkr dS qdS )zCCheck the model whether it went through quantization pre-processingr   r   TFr   r   r?   )r   r   r   r   r   model_has_pre_process_metadataZ  s
    
r   c                 C   s>   ddi}| j r,| j D ]}||j|ji qtj| | d S )N
onnx.inferr   r   )r   r   r   r   r   r   add_infer_metadatac  s
    
r   c                 C   s0   | j r,| j D ]}|jdkr|jdkr dS qdS )Nr   r   TFr   )r   r   r   r   r   model_has_infer_metadatak  s
    
r   )r   r   c                 C   sB   t | d}tjt| t| t| }t| |  |S )Nz	-inferred)	r   rK   Zshape_inferenceZinfer_shapes_pathr   r   r   r   unlink)r   Zinferred_model_pathr   r   r   r   load_model_with_shape_infers  s    
r   c              
   C   sN   t jdd8}t|d}tj| | dd t|W  5 Q R  S Q R X d S )Nz
ort.quant.)prefixz
model.onnxT)Zsave_as_external_data)tempfileTemporaryDirectoryr   r   rK   Z
save_modelr   r   )r   Zquant_tmp_dirr   r   r   r   &save_and_reload_model_with_shape_infer|  s    r   )r   r   c                 C   s:   | j tjjkrtj| S td| j dt	| j   d S )Nz&Only float type is supported. Weights z is )
Z	data_typerI   r   rM   rK   Znumpy_helperZto_arrayr#   r   type_to_name)r   r   r   r   tensor_proto_to_array  s
    r   )tensor_namer   c                 C   s   | d S )NZ_QuantizeLinearr   r   r   r   r   add_quant_suffix  s    r   c                 C   s   | t  S r   )QUANT_INPUT_SUFFIXr   r   r   r   add_quant_input_suffix  s    r   )r   c                 C   s   | d S )NZ_QuantizeLinear_Outputr   r   r   r   r   add_quant_output_suffix  s    r   c                 C   s   | d S )NZ_DequantizeLinearr   r   r   r   r   add_dequant_suffix  s    r   c                 C   s   | d S )NZ_DequantizeLinear_Inputr   r   r   r   r   add_dequant_input_suffix  s    r  c                 C   s   | t  S r   )DEQUANT_OUTPUT_SUFFIXr   r   r   r   add_dequant_output_suffix  s    r  )NN)F)F)FF)FF)r   )Tr   r   r   enumr   pathlibr   rP   rK   r   r   r   r   rI   Zonnx.helperr   r	   r
   r   Zonnx.referencer   Zonnxruntimer   r   r   Z#onnx.reference.custom_element_typesr   ImportErrorZ__producer____version__Zonnx_domainZ	ms_domainZQUANT_OP_NAMEr   ZDEQUANT_OP_NAMEr  ZTENSOR_NAME_QUANT_SUFFIXrm   dirr   r   r*   r,   r8   r1   rZ   r3   r5   rG   r[   rd   rr   r{   rw   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   boolr   r   r   r   r   Zndarrayr   r   r   r   r   r  r  r   r   r   r   <module>   s   
   
%
*
6


%C
				