
    hǍ                     4   S r SSKrSSKrSSKrSSKrSSKrSSKrSSKJr  SSK	J
r
  SSKrSSKJr  SSKJr  \R"                  " S5      r SS	 jrSS jrSS jrSS jrSS jrS r0 S
SS0 SS/S4S jrS rS r        S S jr0 S
SSSSSSSSSS/4S jrg)!zWeightOnly for onnxrt adaptor.    N)numpy_helper)np_dtype_to_tensor_dtype   )	ONNXModel)simple_progress_barneural_compressorc	           	         X2-  S-  n	[         R                  " UR                  S   U	4SS9n
U R                  S   SU< SU< 3-   nU R                  S   U/n/ n0 nSnUS	:X  a1  US
S
2S
S
S24   US
S
2SS
S24   S	-  -  nUS
S
2S
U	24   U
S
S
2S
S
24'   O"US:X  a  Un
O[        R                  SU S35        [         R                  " U
SXI45      n
[         R                  " USU45      nUR                  [         R                  :X  d   UR                  [         R                  :X  d   e[        R                  R                  U R                  S   S-   [        UR                  5      UR                  UR                  5       SS9nUR                  UR                   5        UR                  U5        UGb}  US:X  a  UR#                  S5      nOUS	:X  a  [         R$                  " UR                  S   S-   S-  SSS9n[         R&                  " UR                  S   U-  U-  5      R                  S5      nUS
S
S2   nUSS
S2   nUUS-     S-  UU   R)                  5       -  UUS-  '   UUS-     S-  UU   R)                  5       S	-  -  UUS-  '   O[+        SU S35      e[         R                  " UUS   S45      n[        R                  R                  U R                  S   S-   SUR                  UR                  5       SS9nUR                  UR                   5        UR                  U5        US   US'   US   US'   X.S'   X>S'   US:  a  XS'   [        R                  R                  USU
R                  U
R                  5       SS9nUR                  U5        [        R                  R,                  " U4UU R.                  U R                   (       a  U R                   S-   [1        U5      -   OS[1        U5      -   SS.UD6nUU4$ )a
  Build MatMulNBits node.

Args:
    node: original matmul node
    weight_shape: original weight shape
    num_bits (int): num_bits
    group_size (int): how many elements share one scale/zp
    k_blocks (int): block number
    q_weight (array): quantized weight
    scale (array): scale
    zero_point (array): zero point
    accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).

Returns:
    matmul_weight_only_node: MatMulNBits node
    new_inits: initializers of the new node
   r   uint8dtyper   _QGMatMulNBits   N   z8MatMulNBits does not have kernel support for num_bits = ._scaleTname	data_typedimsvalsraw         _zpKNbits
block_sizeaccuracy_levelzcom.microsoft)inputsoutputsr   domain)npzerosshapeinputloggererrorreshaper   float32float16onnxhelpermake_tensorr   tobytesappendr   astypefullarangeravel
ValueError	make_nodeoutputstr)nodeweight_shapenum_bits
group_sizek_blocksq_weightscale
zero_pointr$   	blob_sizepackedq_weight_nameinput_names	new_initskwargsop_typeq_weight_pairsscale_tensor	packed_zpidxeven_idxodd_idx	zp_tensorq_weight_tensormatmul_weight_only_nodes                            p/var/www/fran/franai/venv/lib/python3.13/site-packages/onnxruntime/quantization/neural_compressor/weight_only.pymake_matmul_weight_only_noderW   ,   s   8 %*IXXx~~a()4GDFJJqMbAj^$DDM::a=-0KIFG 1}!!SqS&)HQ1W,=,BB%a)m4q!t	QOPXzYZ[\ZZX 9:F JJur8n-E;;"**$rzz(AAA;;**ZZ]X%*5;;7[[]]_ + L |(()\" q="))'2I]!1!1!!4q!8Q >7SI))J,,Q/8;hFGOOPRSC3Q3xH!$Q$iG(1(a-(@4(G:V^K_KeKeKg'gIh!m$'0A'>'E*U\J]JcJcJeijJj&kIgl#WX`WaabcddJJy<?B*?@	KK++A&!)//PYPaPaPcim , 
	 	9>>*# q/F3Kq/F3K6N%<#1 kk--\\^^ . O _%"kk3315TYYH-s8}@T  #I--    r       asymc           	      (   [         R                  " U SU45      n US:X  d  US:X  a  SU-  S-
  nSnO*US:X  a$  US:w  a  SUS-
  -  S-
  OSnUS:w  a	  SUS-
  -  * OSn[         R                  " U SSS	9U-  n[         R                  " U SSS	9U-  n	US:X  a  [         R                  " [         R
                  " U5      [         R
                  " U	5      5      n
[         R                  " U	R                  5      nU
S:  nX   S
-  R                  [         R                  5      WW-
  -  X'   US:X  a   [         R                  " UR                  5      O'[         R                  " U	R                  SS9SUS-
  -  -  nGO%[         R                  " U	R                  5      n[         R                  " X-
  X:g     R                  5       R                  5        Vs/ s H  n[        U5      WW-
  -  PM     sn5      XU	:g  '   US:X  a4  [         R                  " UR                  5      U-
  U-  R                  5       Ol[         R                  " S[         R                   " W[         R                  " UR                  5      U-
  U-  R                  5       5      5      R                  S5      n[         R"                  " XR$                  S9n[         R&                  " XUS9  [         R(                  " XUS9  [         R                  " XS9  [         R*                  " UWWUS9  XU4$ s  snf )a  Quantize tensor per group.

Args:
    data : input weight
    num_bits (int, optional): num_bits. Defaults to 4.
    group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
    scheme (str, optional): quantization scheme. Defaults to "asym".
    dtype (str, optional): data type. Defaults to "int".
    ratio (float, optional): percentile of clip. Defaults to 1.0.

Returns:
    output: quantized weight
    scale: scale
    zero_point: zero point
r   rZ   uintr   r   r   symTaxiskeepdimsg       @intr   r   out)r(   r.   minmaxmaximumabsonesr*   r6   float64r)   arrayflattentolistfloatroundminimum
empty_liker   divideaddclip)datar@   rA   schemer   ratiomaxqminqrminrmax	max_rangerD   maskrE   irC   s                   rV   quant_tensorr~      s     ::dR,-D5F?({Q	5*2a-qX\"Q&Q)1Qx!|$%B66$Q.6D66$Q.6DJJrvvd|RVVD\:	

#1} ,44RZZ@D4KP%*e^BHHU[[!SZ9[_`empqeq_r9s 	 

# hh04T\/J/R/R/T/[/[/]^/]!U1X%/]^
dl
 ~ hhu{{#d*e3::<Arzz$"((5;;2G$2NRW1W0^0^0`abiijqr 	 }}T5HIIdx(FF8X.HHX$GGHdDh/J&& _s   
Lc                    [         R                  " U SU45      R                  [         R                  5      n SU-  S-
  nSn[         R                  " U S-  SSS9n[         R
                  " XR-  5      n[         R                  " U[         R                  " U 5      5      n[         R                  " U SSS9n[         R                  " U SSS9n	[         R                  " USSS9n
[         R                  " Xp-  SSS9n[         R                  " U	R                  U R                  S9nX:g  nX4-
  X   X   -
  -  X'   SU-  n[         R                  " [         R                  " XU-
  -  5      XC5      nX-  U-   U -
  n[         R                  " UUS-  -  SSS9nSnS	nSn[        U5       GH  n[         R                  " U	R                  U R                  S9n[         R                   " UUU-  -   U-   U-
  /5      R                  U R                  5      S   nX:g  nUX   X   -
  -  UU'   [         R                  " [         R                  " UX-
  -  5      XC5      nUU-  n[         R                  " USSS9n[         R                  " UU-  SSS9n[         R                  " UU -  SSS9n[         R"                  " U
U-  US-  5      nU
U-  UU-  -
  U-  nUU-  UU-  -
  U-  nUU-  U-   U -
  n[         R                  " UUS-  -  SSS9n [         R                   " U 5      n![         R                   " U5      n"[         R$                  " U!U":  5      S   n#UU#S
S
24   UU#S
S
24'   U U#   UU#'   UU#   UU#'   UU#   UU#'   GM     [         R                  " U* U-  R                  5       SU5      R                  S5      n$UR                  [         R&                  5      n[         R(                  " XR                  S9n%[         R*                  " XU%S9  [         R                  " U%U$U%S9  [         R                  " U%U%S9  [         R                  " U%XCU%S9  U%UU$4$ )a  Quantize tensor per group based on k quant.

Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c

Args:
    data : input weight
    num_bits (int, optional): num_bits. Defaults to 4.
    group_size (int, optional): how many elements share one scale/zp. Defaults to 32.

Returns:
    output: quantized weight
    scale: scale
    zero_point: zero point
r   r   r   r   Tr^   r      皙?Nr   rb   )r(   r.   r6   r/   sumsqrtrr   rg   rd   re   rh   r*   r   rs   rn   rangerj   subtractwhereri   rp   rq   )&rt   r@   rA   rw   rx   sum_x2av_xweightsry   rz   sum_wsum_xiscaler|   rD   
quant_datadiffbest_madnsteprdeltarrminis_
iscale_newfactorquant_data_newmul_weights_quant_data_newsum_lsum_l2sum_xlD
this_scalethis_minmadmad_1
best_mad_1idx_to_replacerE   rC   s&                                         rV   quant_tensor_k_quant_cpur      s    ::dR,-44RZZ@Dh;?DDVVD!G!d3F776&'DffT266$<(G66$Q.D66$Q.DFF7T2EFF7>D9EWWTZZtzz2F<DKDJ$;<FLJE&4K"894FJ$t+Dvvga'a$?HEFEU|WWTZZtzz:
56C</$6=>?FFtzzRSTU|!TZ$*%<=
4**D!EtR%,~%="1DI2^C!VZ[2T9DQKKq1fnuu}49
UNUV^3q8N*X5<ffWtQw&Q>XXh'
%*"45a8(6~q7H(I
>1$%#&~#6  *> :n'7^1 4 D5E/002At<CCGLJLL$E}}T5HIIdx(FF8ZX.HHX8$GGHdh/UJ&&rX   c                     SSK nSSKnUR                  R                  5       (       Ga  UR	                  U 5      n U R                  SU45      R                  UR                  5      n SU-  S-
  nSnUR                  U S-  SSS9nUR                  Xr-  5      nUR                  XR                  U 5      5      n	UR                  U SSS9n
UR                  U SSS9nUR                  U	SSS9nUR                  X-  SSS9nUR                  UR                  U R                   S9nX:g  nXV-
  X   X   -
  -  X'   SU-  nUR#                  UR%                  XU
-
  -  5      Xe5      nUU-  U
-   U -
  nUR                  U	US-  -  SSS9nS	nS
nSn['        U5       GH  nUR                  UR                  U R                   S9nUR)                  UUU-  -   U-   U-
  /5      R                  U R                   5      S   nX:g  nUX   X   -
  -  UU'   UR#                  UR%                  UX
-
  -  5      Xe5      nU	U-  nUR                  USSS9nUR                  UU-  SSS9nUR                  UU -  SSS9nUR+                  UU-  US-  5      nUU-  UU-  -
  U-  n UU-  UU-  -
  U-  n!U U-  U!-   U -
  nUR                  U	US-  -  SSS9n"UR)                  U"5      n#UR)                  U5      n$UR-                  U#U$:  5      S   n%UU%SS24   UU%SS24'   U"U%   UU%'   U U%   UU%'   U!U%   U
U%'   GM     UR#                  U
* U-  R%                  5       SU5      R                  S5      n&UR                  UR.                  5      nUR1                  U UR                   S9n'UR3                  U UU'S9  UR                  U'U&U'S9  UR%                  U'U'S9  UR#                  U'XeU'S9  U'R5                  5       UR5                  5       U&R5                  5       4$ [6        R9                  S5        [;        XU5      $ ! [<         a$    [6        R?                  S5        [;        XU5      s $ f = f)a  Quantize tensor per group based on k quant.

Ref: https://github.com/ggml-org/llama.cpp/blob/64eda5deb9859e87a020e56bab5d2f9ca956f1de/ggml/src/ggml-quants.c

Args:
    data : input weight
    num_bits (int, optional): num_bits. Defaults to 4.
    group_size (int, optional): how many elements share one scale/zp. Defaults to 4.

Returns:
    output: quantized weight
    scale: scale
    zero_point: zero point
r   Nr   r   r   Tr^   r   r   r   r   rb   zqTry to use k-quant quantization on CUDA. However, CUDA is not available.Fall back to k-quant quantization on CPU.zNow we are using k-quant quantization on cpu, which is time consuming.Please consider install cupy to speed up on CUDA. See https://cupy.dev/Please also install torch to check CUDA availability.) cupytorchcudais_availableasarrayr.   r6   r/   r   r   rr   rg   rd   re   rh   r*   r   rs   rn   r   rj   r   r   ri   rp   rq   getr,   warningr   ImportErrorinfo)(rt   r@   rA   cpr   rw   rx   r   r   r   ry   rz   r   r   r   r|   rD   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rE   rC   s(                                           rV   quant_tensor_k_quant_cudar     s   ID::""$$::d#D<<Z 0188DDh;?DDVVD!G!dV;F776./DffT66$<0G66$Q66D66$Q66DFF7TF:EFF7>DFAEWWTZZtzzW:F<D KDJ,CDFLJE&4K*@!A4NJ:%,t3Dvvga/a$vGHEFEU|WWTZZtzzWB
56C<#7$#>#E"FGNNtzzZ[\]|#)TZ$*-D#E
4 !#*2L)Mt!Z-4~-E*9DQ :^ KRS^bc :T ATXYKKq9#fnuu}<A
"UNUV^;q@!N2X=DffWtQw.QfFXXh/
!#%**<!=a!@0>~q?P0Q
>1,-+.~+>((2>(Bn%'/'?^$1 $4 D5E/!8!8!:AtDKKGTJLL,E}}T}=HIIdExI0FF8ZXF6HHX8H,GGHdhG7<<>599;
0@@@NN< ,DJGG DD	

 (
CCDs   PP%  P% %+QQc                 r    U R                   n[        XX#XE5      u  pxn	[        R                  " XU	-
  -  U5      $ )a  Quant dequant tensor per group.

Args:
    data : input weight
    num_bits (int, optional): num_bits. Defaults to 4.
    group_size (int, optional): how many elements share one scale/zp. Defaults to 4.
    scheme (str, optional): quantization scheme. Defaults to "asym".
    dtype (str, optional): data type. Defaults to "int".
    ratio (float, optional): percentile of clip. Defaults to 1.0.

Returns:
    output: quant-dequant weight
)r*   r~   r(   r.   )
rt   r@   rA   ru   r   rv   	org_shapeweightrD   zps
             rV   
qdq_tensorr   y  s8     

I$TZVF2::e{+Y77rX   c                     US:X  a  U $ U R                   nX!-  nXCS   -
  nUS:  a  [        R                  " U SU4S4S5      n U $ )zPad tensor rowi so that it can be is divisible by group_size.

Args:
    weight (array): weight
    group_size (int): how many elements share one scale/zp
    k_blocks (int): the number of block

Returns:
    weight: paded weight
r   r   )r   r   constant)r*   r(   pad)r   rA   rB   org_w_shapepadded_rowspad_lens         rV   
pad_tensorr     sU     R,,K'KN*G{!Wv 6
CMrX   CPUExecutionProviderk_quantc	                    [        U 5      n U R                  b)  [        R                  R	                  U R                  5      OSn	/ n
/ n[        U R                  5        Vs/ s H  oR                  S;   d  M  UPM     sn5      nSnU R                  5        GH  nUR                  S;   a  US-  n[        X5        UR                  S;   d  M6  U R                  UR                  S   5      c  MW  UR                  UR                  0 5      S:w  d  My  U R                  UR                  S   5      n[        R                  " UU	S9R                  5       n[        UR                   5      S:w  a  M  UR"                  nUR                  U;   a3  XR                     S	   nXR                     S
   nXR                     S   nUR                   nUS:w  a  UOUS   nUS   S-
  U-  S-   nU R%                  UR                  S   5      n['        UUU5      nUS:H  =(       d    US:H  nU(       a  US:X  a  [)        UR*                  X#5      u  nnnO:[-        UR*                  X#USUR                  UR                  S   S5      5      u  nnn[/        UUUUUUR1                  S5      UR1                  U5      US:X  d  US:X  a  UOSUS9	u  nnU R3                  U5        UR5                  U5        U
R5                  U5        GO	[7        UR*                  X#USUR                  UR                  S   S5      5      n[8        R:                  " UUS   S45      n[8        R<                  " U5      nUSUS   2SS24   R1                  U5      n[>        R@                  RC                  UR                  S   SU< SU< 3-   [E        U5      UR                   URG                  5       SS9nU RI                  U5        UR                  UR                  S'   US:X  d  GMp  U RK                  U5        GM     U RM                  U
5        U RO                  U5        U RQ                  5         U $ s  snf )a|  Quant the model with round to nearst method.

Args:
    model (ModelProto or ONNXModel): onnx model
    weight_config (dict): quantization config
            For example,
            weight_config = {
                'fc2':
                    {
                        'bits': 4,
                        'group_size': 32,
                        'scheme': 'sym',
                        'algorithm': 'RTN'
                    }
            }
    num_bits (int, optional): num_bits. Default is 4.
    group_size (int, optional): how many elements share one scale/zp. Default is 32.
    scheme (str, optional): sym or asym. Defaults to "asym".
    ratios (dict, optional): percentile of clip. Defaults to {}.
    accuracy_level (int): accuracy level. Support 0 (unset),1(fp32), 2(fp16), 3(bf16), or 4(int8).
    providers (list): providers to use

Returns:
    model: fake quantized ONNXModel
N MatMulr   r   fp32)base_dirr   r"   rA   ru   r   r   r
   r   r\   r   rZ   	r>   r?   r@   rA   rB   rC   rD   rE   r$   ra   r   r   Tr   ))r   
model_pathospathdirnamelennodesrL   r   get_initializerr+   r   r   r   to_arraycopyr*   r   get_initializer_share_numr   r   Tr~   rW   r6   add_initializersr5   r   r(   r.   	transposer1   r2   r3   r   r4   add_initializerremove_initializer	add_nodesremove_nodestopological_sort)modelweight_configr@   rA   ru   ratiosr$   	providers	algorithmr   	new_nodesr   r}   	total_numcurr_idr>   weight_tensorr   r   r   rB   init_share_numsatisfy_MatMulNBits_conditionrC   rD   r   q_matmul_noderJ   rT   s                                rV   rtn_quantizer     s   H eE494D4D4Prwwu//0VXHILI1j1HQIJIG<<:%qLG	3LLJ&%%djjm4@!!$))R0F:!11$**Q-@M!**=8LQQSF6<< A%LLEyyM)(3F;*995lC
&yy1(; ,,K'1R'7[^J#A*z9A=H"<<TZZ]KN
H=F,4M,JX]),	)*CFHHh*c'HeR*6(

SWS]S]^_S`bcHd+'HeR ,H!,%)%%__W5,,u-%+v%5i9OrUY#1
,(y &&y1##D)  /%fhhfeU[U_U_`d`j`jkl`mopUqr::hQ0DE<<1#$4k!n$4a$78??F"&++"9"9A2h\:.)II6u=!))+ #: # %%o6 / 4 4

1"((7K N 
OOI	|$	LY Js   Q6Qc                 2   U R                   nUS:w  a  [        R                  " U SU45      OU n [        R                  " [        R                  " [        R                  " U 5      [        R
                  " [        R                  " U 5      SSS9-  U5      SS9nU$ )zGet the scale of weight.r   r   Tr^   r   r_   )r*   r(   r.   meanrg   re   )r   rA   r   rD   s       rV   get_weight_scaler     sn    I5?25ERZZZ 016FGGBJJrvvf~rvvf~AX\0]]_hipqrELrX   c                    SSK Jn  SSKJn  [        R
                  " 5       n[        R                  S:  a)  U" S5      (       a  SSKJ	n  UR                  U" 5       5        U R                  (       a/  [        R                  " U R                  U R                  S-   S	S	S
S9  U R                  (       d-  [        R                   " U R                  R#                  5       XcS9O![        R                   " U R                  S-   XcS9nUR%                  5        V	s/ s H  oR&                  PM     n
n	A/ n[)        U5       GH  u  pUS:w  a  U	S-   UR*                  -  U:  a    X4$ [-        U
5      S:w  d  [/        US   [0        5      (       a=  [-        US   5      [-        U
5      :X  d"   S[-        U
5       S[-        US   5       35       e[/        US   [0        5      (       aI  UR3                  [1        US   R5                  5        VVs/ s H  u  pX" U5      4PM     snn5      5        M  [/        US   [6        R8                  5      (       a?  UR3                  [1        [;        XS   /S
S9 VVs/ s H  u  pX4PM
     snn5      5        GML  UR3                  [1        [;        XS   S
S9 VVs/ s H  u  pX" U5      4PM     snn5      5        GM     X4$ s  sn	f s  snnf s  snnf s  snnf )aO  Prepare inputs for weight only quantization.

Args:
    model (ModelProto or ONNXModel): onnx model
    n_samples (int, optional): calibration sample number. -1 means all samples.
    dataloader (object): dataloader for calibration.
    providers (list): providers to use

Returns:
    inputs: prepared inputs.
    so: session options
r   )	find_specr   )to_numpy)      onnxruntime_extensions)get_library_path_augment.onnxTFsave_as_external_dataall_tensors_to_one_fileconvert_attributer   r   zInput number mismatch, require z	 but get strict)importlib.utilr   utilr   ortSessionOptionssysversion_infor   r   register_custom_ops_libraryis_large_modelr1   
save_modelr   r   InferenceSessionSerializeToString
get_inputsr   	enumerate
batch_sizer   
isinstancedictr5   itemsr(   ndarrayzip)r   	n_samples
dataloaderr   r   r   sor   sessionr}   inputs_namesr%   rt   r   inp_datainps                   rV   prepare_inputsr  #  s}    )				B
'!i0H&I&I;
&&'7'9:KK."&$(#	
 ## 	U[[::<bV!!%"2"2_"Db^ 
 %,$6$6$89$8qFF$8L9FZ(?Q**?*? ?9L : |!ZQ%>%>tAw<3|#44 1#l2C1DIcRVWXRYl^\4 d1gt$$MM$QUVWQXQ^Q^Q`aQ`~tx'9 :Q`abcQ,,MM$SUVPWyaf=gh=g	=ghijMM$s<^_Y`inGopGo)$x} 5Gopqr ) :% :  bhps   K>K$K"K   {Gz?FTc
                   ^^^	^#^$^%^& SU-  S-
  m$Sm#Sm%Sm&U#U$U%UU&U	U4S jn
U R                   nU
" U 5      u  p[        R                  " U5      S:H  nSXU4'   SXSS24'   U(       aG  [        R                  " [        R                  " U5      5      SSS	2   nXSS24   n XSS24   SS2U4   n[        R                  " U 5      n[        R                  " U 5      nU[        R
                  " [        R                  " U5      5      -  n[        R                  " US   5      nUUU4==   U-  ss'   [        R                  R                  [        R                  R                  U5      5      R                  nUn[        SUS   U5       GH  n[        UU-   US   5      nUU-
  n[        R                  " U UU2SS24   5      n[        R                  " U5      n[        R                  " U5      n[        R                  " U5      nUUU2UU24   n[        U5       GH!  nUUSS24   nUUU4   nUS	:w  a(  UU-   U-  S:X  a  U
" U UU-   UU-   U-   2SS24   5      u  pU[        R                  " [        R                   " USS2[        R"                  4   U-  5      U-   ST$5      U-
  -  R%                  5       n U UUSS24'   UU -
  S-  US-  -  UUSS24'   UU -
  U-  n!UUS2SS24==   [        R&                  " [        R(                  " UUS2U4   SS
9[        R(                  " U!SS
95      -  ss'   U!UUSS24'   GM$     UUUU2SS24'   US-  UUU2SS24'   U US2SS24==   [        R&                  " UUS2UU24   U5      -  ss'   GM     U(       a  [        R                  " W5      n"UU"SS24   n[        R*                  " UU R                   5      nA U$ )a  Quant the weight with GPTQ method.

Args:
    W (array): weight.
    H (array): Hessian matrix.
    num_bits (int, optional): num_bits. Default is 4.
    group_size (int, optional): how many elements share one scale/zp. Default is 32.
    scheme (str, optional): sym or asym. Defaults to "asym".
    blocksize (int, optional): blocksize to quantize weight.
    percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
    actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
    mse (bool, optional): whether get scale and zero point with mse error.
    perchannel (bool, optional): whether quantize weight per-channel.

Returns:
    Q: fake quantized weight
r   r   d   g?g333333@c                 F  > U R                   nT(       d#  [        R                  " U R                  5       SS9n [        R                  " U R                   S   5      n[        R
                  " [        R                  " U SS9U5      n[        R                  " [        R                  " U SS9U5      nTS:X  aR  [        R                  " [        R                  " U5      U5      nUS:  n[        R                  " U5      (       a  XB   * X2'   US:H  US:H  -  nSX2'   SXB'   XC-
  T-  nTS:X  a*  [        R                  " UR                   5      TS-   -  S-  nO[        R                  " U* U-  5      nT(       Ga1  [        R                  " U R                   S   /5      [        S5      -  n[        [        TT-  5      5       H  nSUT-  -
  n	X-  n
X-  nX-
  T-  nTS:w  a  [        R                  " U
* U-  5      OUn[        R                   " [        R                  " X-  5      U-   ST5      nX-  n[        R"                  " [        R                  " U5      T5      n[        R$                  " US5      nX:  n[        R                  " U5      (       d  M  X   Xr'   X   XR'   X   Xb'   M     T(       d1  US   n[        R&                  " XR5      n[        R&                  " Xb5      nS/S/[)        U5      S-
  -  -   n[        R*                  " UU5      n[        R*                  " UU5      nXV4$ )Nr   r   r   r]   r   r   inf)r*   r(   expand_dimsrk   r)   ro   rd   rf   re   rg   anyrh   rn   rm   r   ra   rs   powerr   repeatr   r.   )r   r   tmpxminxmaxrD   zerobestr}   pxmin1xmax1scale1zero1qerrr*   gridrw   	maxshrinkmsenorm
perchannelru   s                    rV   find_paramsgptq.<locals>.find_params~  s   LL	^^FNN$41=Fhhv||A'zz"&&a0#6zz"&&a0#6U?::bffTlD1D(Cvvc{{!YJ	qyTQY'		$U?775;;'4!84q8D88TEEM*D77FLLO,-e<D3y4/01DL-4/5;u_%&1$GGBHHV_5=q$GHHRVVAY-ffQlj66#;; #DI!'EJ %
DI 2 A,CIIe)E99T'Dsc)nq011

5%(zz$&{rX   r   Nr   r   )r*   r(   diagargsort
zeros_liker   r8   linalgcholeskyinvr   r   rd   r   deepcopyrs   rn   newaxisrk   matmulr  r.   )'WHr@   rA   ru   	blocksizepercdampactorderr)  r+  r,  r*   rD   r   deadpermLossesQdampr.  Hinvi1i2countW1Q1Err1Losses1Hinv1r}   wdr%  err1invpermr'  rw   r(  r*  s'       `   ``                         @@@@rV   gptqrN  \  s   : h;?DDID. .` GGEAIE771:?DADjMAAgJ zz"''!*%dd+AgJAgJq$w]]1F
aAbggbggaj))D99U1XDdDjMTM
		299==+,..ADAuQx+iq*R]]1RUAX;']]2}}R --#RUBrE\"uA1a4AadARFj(A- +ArAv"q&::M.NPQ.Q,R SIE"''"((1Q

]+;e+C"Dr"I1dSVXXYbbdABq!tHUqL1a4/GAqDMEQ;Dqr1uI2>>%A,Q#GX\cdIeffIDAJ   "R%("Q;r"uax	"#q&	RYYtBCBJ/66	; ,> **T"gqjM


1aggA	HrX   c                 R   [        U 5      n U R                  b)  [        R                  R	                  U R                  5      OSn[        XX5      u  nnA[        R                  " U R                  R                  R                  5      nU R                  U Vs/ s H  nUR                  PM     sn5        / nU R                  5        H  nUR                  S;   d  M  UR                  UR                  0 5      S:w  d  M7  UR                  UR                  0 5      R                  SS5      S:X  d  Mi  UR!                  UR"                  S   5        M     [%        ['        U5      5      nU R)                  U5        U R*                  (       a/  [,        R.                  " U R                  U R                  S-   S	S	S
S9  U R*                  (       d.  [0        R2                  " U R                  R5                  5       UUS9O"[0        R2                  " U R                  S-   UUS9n[7        U5       GH  u  nn[9        [;        U5      US-   5        / n/ nU R<                  U    GH<  nUR                  S;   d  M  UR                  UR                  0 5      S:w  d  M8  UR                  UR                  0 5      R                  SS5      S:X  d  Mj  U R?                  UR"                  S   5      c  M  [@        RB                  " U R?                  U RE                  UR                  5      R"                  S   5      U5      R                  5       n[;        URF                  5      S:w  a  GM  UR!                  U5        UR!                  U RE                  UR                  5      5        GM?     [;        U5      S:X  a  GM  U Vs/ s H5  n[H        RJ                  " URF                  S   URF                  S   45      PM7     nnSnU H  nURM                  U/U5      S   nURF                  S   n[H        RN                  " USURF                  S   45      nU Vs/ s H  nUUUU-   -  -  PM     nnUU-  n[H        RP                  " SU-  5      U-  nU Vs/ s H'  nU[H        RR                  " URT                  U5      -   PM)     nnM     [W        UUUS
S9 GH)  u  nnn UR                  U;   a6  UUR                     S   nUUR                     S   nUUR                     S   nUS:w  a  UOURF                  S   nURX                  n![[        UU UUUUUU	U
US9
n"U R?                  UR"                  S   5      n#U R]                  UR"                  S   5      n$US:H  n%U%(       a  URF                  n&U&S   U-   S-
  U-  n'[_        U"UU'5      n"[a        U"RT                  X4US5      u  n"n(n)[c        UU&UUU'U"Re                  S5      U(Re                  U!5      US:X  a  U)OSUS9	u  n*n+U Rg                  U+5        U Ri                  U5        U Rk                  U*5        O[,        Rl                  Ro                  UR"                  S   SU< SU< 3-   [q        U!5      U"RF                  U"Re                  U!5      Rs                  5       S	S9n,U Ru                  U,5        U,R                  UR"                  S'   U$S:X  d  GM  U Rw                  U#5        GM,     GM     U R                  U5        U R                  R                  R                  Ry                  U5        U R{                  5         U R*                  (       aC  SSK>J?n-  U-" U R                  [        R                  R                  U R                  5      S   5        U $ s  snf s  snf s  snf s  snf )a+  Quant the model with GPTQ method.

Args:
    model (ModelProto or ONNXModel): onnx model
    dataloader (object): dataloader for calibration.
    weight_config (dict): quantization config
            For example,
            weight_config = {
                'fc2':
                    {
                        'bits': 4,
                        'group_size': 32,
                        'scheme': 'sym',
                        'algorithm': 'GPTQ'
                    }
            }
    num_bits (int, optional): num_bits. Default is 4.
    group_size (int, optional): how many elements share one scale/zp. Default is 32.
    scheme (str, optional): sym or asym. Defaults to "asym".
    n_samples (int, optional): calibration sample number.
    percdamp (float, optional): percent of the average Hessian diagonal to use for dampening.
    blocksize (int, optional): blocksize to quantize weight.
    actorder (bool, optional): whether rearrange Hessian matrix considering the diag's value.
    mse (bool, optional): whether get scale and zero point with mse error.
    perchannel (bool, optional): whether quantize weight per-channel.
    accuracy_level (int): accuracy level. Support 0 (unset), 1(fp32), 2(fp16), 3(bf16), or 4(int8).
    providers (list): providers to use

Returns:
    model: fake quantized ONNXModel
Nr   r   r   r   GPTQr   r   TFr   r   r   r   r   r   r"   rA   ru   )r@   rA   ru   r9  r:  r;  r)  r+  r   r\   r   rZ   r   r   r   r   )load_external_data_for_model)Ar   r   r   r   r   r  r   r4  r   graphr<   remove_tensors_from_outputsr   r   rL   r   r5   r+   listsetadd_tensors_to_outputsr   r1   r   r   r   r   r  r   r   input_name_to_nodesr   r   r   get_noder*   r(   r)   runr.   r   r6  r   r  r   rN  r   r   r~   rW   r6   r   remove_nodeadd_noder2   r3   r   r4   r   r   	MergeFromr   onnx.external_data_helperrQ  split).r   r
  r   r@   rA   ru   r	  r:  r9  r;  r)  r+  r$   r   r   r%   r  
org_outputr}   output_namesr>   r  rP   
input_name	node_listr   r   Hsnsamplesrt   r  r  r8  r   rC   r   r   r   r   rB   rD   r   r   rJ   rT   rQ  s.                                                 rV   gptq_quantizere    s   ^ eE494D4D4Prwwu//0VXH*HJFBu{{00778J	%%z&Bz!qvvz&BCLLLJ&!!$))R0F:!!$))R044[&IVS

1.  L)*L	  .KK."&$(#	
 ## 	U[[::<bIV!!%"2"2_"DbT]^  %\2ZC-sQw7	--j9D
*!%%dii4>!%%dii488fMQWW))$**Q-8D%..))%..*C*I*I!*LMx$&  v||$)v&  		!:; :  w<1:AB'Qbhh
AGGAJ/0'BD++zlD1!4C))A,C**S2syy}"56C=?@R!x8c>23RB@OH''!h,'#-C578R!biis++RB8B  GR6	
yyM)(3F;*4995lC
&tyy1(;'1R'7V\\!_JLLE!%#!!%H "11$**Q-@M"<<TZZ]KN,4M),"LL	%aL:59jH%h
HE&28::xU[]c&d#%+G!*%)%%__W5,,u-%+v%5r4#1
,(y &&y1!!$'}-"&++"9"9A2h\:.)II6u=!!/779 #: # %%o6 / 4 4

1"((7s 7O 3D 
%%l3	KK&&z2	 J$U[["''--@P@P2QRS2TULO 'Cd C A 9s   ^,<^ ^<.^$)r   )r   rY   rZ   ra   g      ?)r   rY   )r   rY   rZ   r  r  FFT)__doc__r   loggingr   r   numpyr(   r1   r   onnx.helperr   onnxruntimer   
onnx_modelr   r   r   	getLoggerr,   rW   r~   r   r   r   r   r   r   r  rN  re   rX   rV   <module>rn     s   0 %   	 
    0  ! %			.	/ p.f3'lF'RXDv8&4 %&tn6x I^ %&|rX   