U
    hT&                     @   s.  d dl Z d dlZd dlmZ d dlmZmZ d dlZd dl	mZ
 d dlZd dlmZmZmZmZ ddlmZ ddlmZmZ eedd	d
ZeedddZeeeedddZee
jejdddZG dd dZdd Zedkr*e Z e j!Z"e j#Z$e j%Z&eee"Z'ee'd Z(e()  e(j'*e$d dS )    N)Path)ListTuple)
GraphProto
ModelProto	NodeProtoTensorProto   )	ONNXModel)attribute_to_kwargload_model_with_shape_infer)
quant_typereturnc                 C   s   dS )N     )r   r   r   U/tmp/pip-unpacked-wheel-socb9apf/onnxruntime/quantization/matmul_weight4_quantizer.py__q4_block_size   s    r   c                 C   s2   | t jkrd}n| t jkr d}ntd|  |S )N      zUnsupported quantization type: )MatMulWeight4QuantizerBlkQ4SymBlkQ4Zp8
ValueError)r   	blob_sizer   r   r   __q4_blob_size   s    

r   )r   rowscolsr   c                 C   s,   t | }t| }|| d | }|| | S )Nr	   )r   r   )r   r   r   
block_sizer   k_blocksr   r   r   __q4_buf_size'   s    r   )r   
fp32weightr   c                 C   sT  t |jdkrtd|j\}}t| }t| }|| d | }|| }|| }|dkrnt|d|fdfd}d}	tj|| |fdd}
t|D ]}|d	d	|f }t	||}|D ]}|
|	 }|	d7 }	| t
jkrtt|}|| }|d
 }d}nbt|}t|}t|d}t|d}|| d }|}|dkrJd||  }tdtdt|}|dkrpd| nd}td|}|d |d< |d |d< |d |d< |d |d< d}| t
jkr||d< d}|d }tt|| | ddd}t	||}|D ]>}t|dd t|dd d|||d < |d7 }qqq|
dS )z!4b quantize fp32 weight to a blob   z9Current int4 block quantization only supports 2D tensors!r	   r   )r   r   ZconstantZuint8)ZdtypeNi   g           g      ?f         r      )lenshaper   r   r   nppadzerosrangesplitr   r   Zargmaxabsminmaxroundstructpackr   ZclipZrintastypeZ
bitwise_orZ
left_shiftZreshape)r   r    r   r   r   r   r   Zpadded_rowsZpad_lenZblob_idxpackednZncolZblksZblkZpacked_blobZamax_idxZbmaxZscaleZzpZvminZvmaxZzero_point_fpZreciprocal_scaleZbfZblob_offsetZnum_segsZblk_intZsegssegr   r   r   int4_block_quant.   sb    







"0r;   c                   @   sx   e Zd ZdZdZdZeedddZe	e
e eeef ddd	Zee
e ed
ddZe
e dddZdd ZdS )r   z2Perform 4b quantization of constant MatMul weightsr   r	   )modelr   c                 C   s   t || _|| _d S )N)r
   r<   r   )selfr<   r   r   r   r   __init__x   s    
zMatMulWeight4Quantizer.__init__)
graph_pathr   c                 C   sL   t t|d ddD ]2}|| }|jD ]}|j| kr&||f    S q&qdS )Nr	   r)   )NN)r/   r*   initializername)rA   r?   gidgraphZtensorr   r   r   Z__get_initializer|   s    

z(MatMulWeight4Quantizer.__get_initializer)nodegraph_stackr   c                 C   sD  |j dkr|S |jd }t||\}}|dkr4|S tj|}t|jdkrR|S |j\}}t	| j
|}	tj|	}
|jd |
_|j| |jD ]}|j|kr|j|  qqtjt||gtj}|jd |_|j|
|g i }| j
|d< tjjd|jd	 |
j|jg|jd	 g|jr0|jd nd
dd|}|S )zdIf the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new nodeZMatMulr	   Nr!   Z_Q4Z_shapeZblk_quant_type
MatMulFpQ4r    com.microsoft)inputsoutputsrA   domain)rF   )op_typeinputr   (_MatMulWeight4Quantizer__get_initializeronnxZnumpy_helperZto_arrayr*   r+   r;   r   Z
from_arrayrA   r@   remover,   arrayr7   Zint64extendhelper	make_nodeoutput)r=   rD   rE   ZinputBBZBs_graphZB_arrayr   r   r8   ZB_quantrM   ZB_shapekwargsZmatmul_q4_noder   r   r   _q4_matmul_node_weight   sD    





 
z-MatMulWeight4Quantizer._q4_matmul_node_weight)rE   c                 C   s  g }|d }|j D ]}dd |jD }t|ri }|jD ]}|jtjjkrh||j |j	| 
|i}nN|jtjjkrg }	|jD ] }
||
 |	| 
|g q|j	|	i}nt|}|| q8tjj|j|j|jfd|j	i|}|| || q|d |j | |  |S )Nr)   c                 S   s,   g | ]$}|j tjjks$|j tjjkr|qS r   )typerO   AttributeProtoGRAPHZGRAPHS).0attrr   r   r   
<listcomp>   s    z<MatMulWeight4Quantizer._process_subgraph.<locals>.<listcomp>rA   rD   )rD   	attributer*   rY   rO   rZ   r[   appendgrA   _process_subgraphZgraphsrR   r   updaterS   rT   rL   rM   rU   rX   Z
ClearFieldpop)r=   rE   Z	new_nodesrC   rD   Zgraph_attrsrW   r]   kvvalueZsubgraphr   r   r   rb      sD    



  
z(MatMulWeight4Quantizer._process_subgraphc                 C   sZ   | j  g}| j  }d}|D ]}|jdkrd}q|sL|tjddg | | d S )NFrH   Tr	   )	r<   rC   opset_importrK   rR   rO   rS   Zmake_opsetidrb   )r=   rE   rg   Zhas_ms_domainZopsetr   r   r   process   s    

zMatMulWeight4Quantizer.processN)__name__
__module____qualname____doc__r   r   r   intr>   staticmethodr   r   r   r   rN   r   rX   rb   rh   r   r   r   r   r   k   s   *&r   c                  C   sD   t jdd} | jdddd | jdddd | jd	dd
d |  S )Na
  Blockwise int4 quantization for MatMul 2D weight matrices.

A weight matrix is partitioned into into blocks, where each block is a
continguous subset inside each column. Each block is quantized into a
set of 4b integers with a scaling factor and an optional offset.
)descriptionz--input_modelTzPath to the input model file)requiredhelpz--output_modelzPath to the output model filez--quant_bin_pathzCurrently quantization code is implemented in a separate binary
(onnxruntime_mlas_q4dq) that is compiled with Onnxruntime native code.
Path to this binary needs to be provided here.)argparseArgumentParseradd_argument
parse_args)parserr   r   r   ru      s    	ru   __main__F)+rr   r5   pathlibr   typingr   r   Znumpyr,   Znumpy.typingZnptrO   Zonnx.onnx_pbr   r   r   r   Z
onnx_modelr
   Zquant_utilsr   r   rm   r   r   r   Z	ArrayLikeZndarrayr;   r   ru   ri   argsZinput_modelZinput_model_pathZoutput_modelZoutput_model_pathZquant_bin_pathZq4dq_bin_pathr<   Zquantrh   Zsave_model_to_filer   r   r   r   <module>   s0   =y

