U
    h"                     @   sH  d dl Z d dlZd dlZd dlmZmZ d dlZd dlmZ	 d dl
Z
d dlmZmZmZmZ d dlmZ ddlmZ ddlmZ ejdejd	 eeZG d
d dZdd ZedkrDe Zejreej  ej!Z"ej#Z$ej%&e$re'de$ d e(de$ de
)e"Z*ee*ej+ej,ej-dZ.e./  e.j*0e$d dS )    N)ListTuple)
GraphProto
ModelProto	NodeProtoTensorProto)quantize_matmul_4bits   )	ONNXModel)attribute_to_kwargz2%(asctime)s %(name)s [%(levelname)s] - %(message)s)formatlevelc                   @   s   e Zd ZdZdeeedddZee	e
 eee
f dddZejejd	d
dZee	e
 edddZe	e
 dddZdd ZdS )MatMul4BitsQuantizerz2Perform 4b quantization of constant MatMul weightsN)model
block_sizeis_symmetricc                 C   s0   |d krg }t || _|| _|| _t|| _d S )N)r
   r   r   r   setnodes_to_exclude)selfr   r   r   r    r   S/tmp/pip-unpacked-wheel-socb9apf/onnxruntime/quantization/matmul_4bits_quantizer.py__init__   s    
zMatMul4BitsQuantizer.__init__)
graph_pathreturnc                 C   sL   t t|d ddD ]2}|| }|jD ]}|j| kr&||f    S q&qdS )Nr	   )NN)rangeleninitializername)r   r   gidgraphZtensorr   r   r   Z__get_initializer%   s    

z&MatMul4BitsQuantizer.__get_initializer)
fp32weightr   c              	   C   s   t |jdkrtd|j\}}| j}|d }|| d | }|| }|| }|dkrlt|d|fdfd}tj|||fdd}	tj|| |jd}
tj|| d d dd}t|	||
||||| j	 |	|
|fS )	z!4b quantize fp32 weight to a blob   z9Current int4 block quantization only supports 2D tensors!r	   r   )r   r   ZconstantZuint8)dtype)
r   shape
ValueErrorr   nppadzerosr#   r   r   )r   r!   rowscolsr   Z	blob_sizeZk_blocksZpadded_rowsZpad_lenpackedscalesZ
zero_pointr   r   r   int4_block_quant.   s    
z%MatMul4BitsQuantizer.int4_block_quant)nodegraph_stackr   c                 C   s  |j dkr|S td|j d |j| jkrFtd|j d |S |jd }t||\}}|dkrvtd |S tj	
|}t|jd	krtd
 |S | |\}}}	tj	|}
|jd |
_|jD ]}|j|kr|j|  qqtj	|}|jd |_|j|
|g |jd |
j|jg}| jsbtj	|	}|jd |_|j|g ||j i }|j\}}||d< ||d< d|d< | j|d< tjjd||jd g|jr|jd nddd|}td|j d |S )zdIf the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new nodeZMatMulzstart to quantize z ...zexclude to quantize z$ as specified by nodes_to_exclude...r	   Nz2MatMul doesn't have const weight. Skip to quantizer"   z)MatMul weight is not 2D. Skip to quantizeZ_Q4Z_scalesr   Z_zero_pointsKN   bitsr   MatMulNBits com.microsoft)inputsoutputsr   domainzcomplete quantization of )r4   )op_typeloggerinfor   r   inputr   &_MatMul4BitsQuantizer__get_initializeronnxZnumpy_helperZto_arrayr   r$   r-   Z
from_arrayremover   extendr   appendr   helper	make_nodeoutput)r   r.   r/   ZinputBBZBs_graphZB_arrayr+   r,   Zzero_pointsZB_quantr=   Zscales_tensorZinput_namesZ	zp_tensorkwargsr)   r*   Zmatmul_q4_noder   r   r   _q4_matmul_node_weightE   sb    







 
	z+MatMul4BitsQuantizer._q4_matmul_node_weight)r/   c                 C   s  g }|d }|j D ]}dd |jD }t|ri }|jD ]}|jtjjkrh||j |j	| 
|i}nN|jtjjkrg }	|jD ] }
||
 |	| 
|g q|j	|	i}nt|}|| q8tjj|j|j|jfd|j	i|}|| || q|d |j | |  |S )Nr   c                 S   s,   g | ]$}|j tjjks$|j tjjkr|qS r   )typer?   AttributeProtoGRAPHGRAPHS).0attrr   r   r   
<listcomp>   s    z:MatMul4BitsQuantizer._process_subgraph.<locals>.<listcomp>r   r.   )r.   	attributer   rI   r?   rJ   rK   rB   gr   _process_subgraphrL   ZgraphsrA   r   updaterC   rD   r:   r=   rE   rH   Z
ClearFieldpop)r   r/   Z	new_nodesr    r.   Zgraph_attrsrG   rN   kvvalueZsubgraphr   r   r   rR      sD    



  
z&MatMul4BitsQuantizer._process_subgraphc                 C   sd   | j  g}| j  }d}|D ]}|jdkrd}q|sL|tjddg | | | j 	  d S )NFr6   Tr	   )
r   r    opset_importr9   rA   r?   rC   Zmake_opsetidrR   Zclean_initializers)r   r/   rW   Zhas_ms_domainZopsetr   r   r   process   s    


zMatMul4BitsQuantizer.process)N)__name__
__module____qualname____doc__r   intboolr   staticmethodr   r   r   r   r>   nptZ	ArrayLiker&   Zndarrayr-   r   rH   rR   rX   r   r   r   r   r      s   =&r   c                  C   s   t jdd} | jdddd | jdddd | jd	d
dd | jdd
ddd | jddd
dd | jd
d | jddtd
g dd |  S )Na
  Blockwise int4 quantization for MatMul 2D weight matrices.

A weight matrix is partitioned into into blocks, where each block is a
continguous subset inside each column. Each block is quantized into a
set of 4b integers with a scaling factor and an optional offset.
)descriptionz--input_modelTzPath to the input model file)requiredhelpz--output_modelzPath to the output model filez--block_sizeF    )rb   defaultz--symmetricz4Indicate whether to quantize the model symmetrically)rb   re   rc   z-vz	--verbose
store_true)rb   action)verbosez--nodes_to_exclude+zBSpecify the nodes to be excluded from quantization with node names)nargsrI   rb   re   rc   )argparseArgumentParseradd_argumentset_defaultsstr
parse_args)parserr   r   r   rp      s.    	   	rp   __main__zfile z already exists)r   T)1rk   loggingostypingr   r   Znumpyr&   Znumpy.typingr`   r?   Zonnx.onnx_pbr   r   r   r   Zonnxruntime.capi._pybind_stater   Z
onnx_modelr
   Zquant_utilsr   basicConfigINFO	getLoggerrY   r;   r   rp   argsrh   setLevelDEBUGZinput_modelZinput_model_pathZoutput_modelZoutput_model_pathpathexistserror	Exceptionloadr   r   Z	symmetricr   ZquantrX   Zsave_model_to_filer   r   r   r   <module>   s8   
 

