
    h--                       S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKJ	r
  S SKrS SKJr  S SKJrJr  S SKJrJrJrJrJrJr  S SKJr  S SKJr  S SKJr  S S	KJr  S S
K J!r!  S SK"r#\RH                  " S5      r%SS jr&SS jr'S r(  S           SS jjr)SS jr*/ 4SS jjr+\,S:X  a7  Sr-\R\                  R[                  \-5        \R^                  " \-5        \+" 5         gg)    )annotationsN)setup_logger)get_rankget_size)add_io_bindings_as_ortvaluesconvert_inputs_for_ort%get_merged_sample_with_past_kv_inputsget_sample_inputsget_sample_with_past_kv_inputsverify_ort_inputs)setup_torch_model)make_dynamic_cache)
AutoConfig)__version__)DynamicCache c                P    U R                   (       a  SOSu  p#UR                  nX#U4$ )N)      )r   r   )use_past_kvmax_position_embeddings)argsconfigpast_sequence_lengthcurr_sequence_lengthmax_sequence_lengths        l/var/www/fran/franai/venv/lib/python3.13/site-packages/onnxruntime/transformers/models/llama/llama_parity.pyget_sequence_lengthsr   %   s.    ;?;K;KQW. 887JJJ    c                V   [        5       nSn[        X5      u  pEnU R                  (       a2  [        UU R                  UUUUU R
                  U R                  SUS9
nU$ U R                  (       a%  [        UU R                  UUU R
                  SUS9nU$ [        XR                  X5SS9nU$ )N   T)seq_lenpast_seq_lenmax_seq_lenuse_fp16use_buffer_sharereturn_dict
world_size)r%   r'   r(   )r'   )
r   r   mergedr	   devicer%   r&   r   r   r
   )r   r   r(   
batch_sizer   sequence_lengthr   inputss           r   
get_inputsr.   +   s    JJAUVZAc>+>{{6KK#-+]]!22!
2 M 
		/KK]]!
 M #6;;
aefMr   c                l   [        U [        [        [        45      (       a  U $ [        U [        5      (       a  [	        S U  5       5      $ [        U [
        5      (       a  U  Vs/ s H  n[        U5      PM     sn$ [        U [        5      (       a  U  Vs1 s H  n[        U5      iM     sn$ [        U [        5      (       a/  U R                  5        VVs0 s H  u  p!U[        U5      _M     snn$ [        U [        R                  5      (       a  U R                  5       $ [        U S5      (       a  U R                  5       $ [        U [        5      (       a:  [!        [        [        [#        U R$                  U R&                  SS95      5      5      $ [)        S[+        U 5       35      es  snf s  snf s  snnf )Nc              3  8   #    U  H  n[        U5      v   M     g 7f)N)torch_deepcopy).0vs     r   	<genexpr>!torch_deepcopy.<locals>.<genexpr>R   s     61^A&&s   cloneF)strictz(torch_deepcopy not implemented for type )
isinstanceintfloatstrtuplelistr1   setdictitemsnpndarraycopyhasattrr6   r   r   zip	key_cachevalue_cacheNotImplementedErrortype)valuer3   ks      r   r1   r1   N   sK   %#uc*++%6666%+015aq!511%+015aq!511%16?>!$$??%$$zz|ug{{}%&&!.c%//5K\K\ej6k1l"mnn  HeV
WW 21?s   #F&F+F0c           	        UnUcH  [        U UUU R                  (       a  [        R                  O[        R                  U R
                  S9u  pV[        X5      nSU;   aC  [        R                  " [        5      [        R                  " S5      :  a  [        US   5      US'   [        U5      nU R                  S:w  a  [        R                  R                  5         [        R                  " 5       n	U" S0 UD6R                   R#                  5       R%                  5       R'                  5       n
U R                  S:w  a  [        R                  R                  5         [        R                  " 5       n[(        R+                  SX-
   S35        U R,                  (       a"  Ub  A[        R                  R/                  5         [1        X5      u  pn[3        UU R4                  UUS9nU R                  R7                  5        S3nUS	:X  a  US
U R8                  04n[:        R<                  " U R>                  [:        R@                  " 5       U/S9n[C        UU5      nU R                  S:w  a  [E        UUU R                  [G        U R8                  5      U R4                  US9u  nnURI                  5         [        R                  " 5       n	URK                  U5        URM                  5         [        R                  " 5       nURO                  5       S   nAOA[        R                  " 5       n	URQ                  S U5      n[        R                  " 5       nUS   n[(        R+                  SX-
   S35        SU R>                  ;   d  SU R>                  ;   a  SOSn[R        RT                  " U
UUUS9n[(        RW                  SU 35        U(       d/  [(        RW                  S[R        RX                  " U
U-
  5       35        U$ )Ntorch_dtyper*   past_key_valuesz4.45cpuzPyTorch took z s)r&   r#   r$   ExecutionProviderCUDAExecutionProvider	device_id)sess_options	providers)
ort_inputsr*   rS   r&   kv_cache_ortvaluesr   zONNX Runtime took int4int8g      4@g      ?)rtolatolz,Are PyTorch and ONNX Runtime results close? z
Max diff:  )-r   r%   torchfloat16float32r*   r.   pvVersiontransformers_versionr   r1   execution_providercudasynchronizetimelogitsdetachrP   numpyloggerinfo	small_gpuempty_cacher   r   r&   upperrankortInferenceSessiononnx_model_pathSessionOptionsr   r   r9   synchronize_inputsrun_with_iobindingsynchronize_outputscopy_outputs_to_cpurunrA   allclosewarningmax)r   locationuse_auth_tokenrW   pytorch_modelr   py_modelr-   inputs_after_deepcopy
start_time
pt_outputsend_timer   _r   ep	ort_model
io_bindingort_outputstolparitys                        r   verify_parityr   d   sK    H,*.--U]];;
 %FF"rzz2F'G2::V\K]']$6v>O7P$Q ! +62%'

 J 21299@@BFFHNNPJ%'

 yy{H
KK- 56b9:~~(.

  4H3U00#..)'	F ##))+,,=	>B	$$;		*+$$'')$I
 y&1F %')E**$))n!221*
&
& 	%%'YY[
$$Z0&&(99; 446q9 YY[
mmD&199;!!n
KK$X%:$;2>? 4///6T=Q=Q3Q#W[C[[[sEF
NNA&JKBFF:+C$D#EFGr   c           	        [         R                  " 5       nUR                  SSSSS9  UR                  SSS[        R                  R                  S5      S	S
9  UR                  SSS[        R                  R                  S5      SS
9  UR                  SSSS/ SQSS9  UR                  SSSSS9  UR                  SS9  UR                  SSSSS9  UR                  SS9  UR                  SS SS!S9  UR                  SS"9  UR                  S#SS$S9  UR                  SS%9  UR                  S&S'S/ S(QS)S*9  UR                  S+S[        S,S-S.9  UR                  S/SS0S9  U / :X  a  UR                  5       OUR                  U 5      nUR                  S1;   d   UR                  S2:X  a  UR                  S:X  a	  S3Ul	        U$ S4Ul	        U$ )5Nz-mz--model_nameFzModel name in Hugging Face)requiredhelpz-tz--torch_model_directory.zMPath to folder containing PyTorch model and associated files if saved on disk)r   defaultr   z-oz--onnx_model_pathTzSPath to ONNX model (with external data files saved in the same folder as the model)z-epz--execution_providerrP   )rP   rd   rocmz(Execution provider to verify parity with)r   r   choicesr   z-vz	--verbose
store_truezPrint verbose logs)actionr   )verbosez-pz--use_past_kvzfUse past key and past value as inputs to the model. Necessary for decoder_with_past_model.onnx models.)r   z-gz--use_buffer_sharezWUse if model has GroupQueryAttention and you want to enable past-present buffer sharing)r&   z--mergedz2Use merged model (i.e. decoder_merged_model.onnx).)r)   z-fpz--precision)rX   rY   fp16fp32zPrecision of model)r   r   r   z--cache_dirz./model_cachezQmodel cache dir to override default HF cache dir to avoid overflood the /home dir)r   rI   r   r   z--small_gpuzhLoad the llama in GPU every time for parity_check if it's running in a machine which GPU memory < 36GB. >   r   rY   rX   r   r   )argparseArgumentParseradd_argumentospathjoinset_defaultsr;   
parse_args	precisionrc   )argvparserr   s      r   get_argsr      sT   $$&F
)	   !S!\   S!b   '7   !	   &
u	   E*
f	   /
A  
 u%
0!   `   w   #'"*6&2C2CD2ID
 >>--$..F2JtOfOfjoOo 	 	N
 K  	N
 Kr   c           	     8   [        U 5      n[        UR                  5        [        R	                  SU 35        [        5       n[        USUR                  S:H  5        X!l        [        USUR                  S:X  a  SOSU 35        [        US[        R                  " UR                  5      5        UR                  [        R                  R!                  S5      :H  nU(       a  UR"                  OUR                  n0 nUR$                  (       d  ['        XX55        g S =pgUR(                  (       dH  [+        UUUUR,                  (       a  [        R.                  O[        R0                  UR                  S	9u  pgS
Ul        ['        XX5XvS9nSUl        ['        XX5XvS9  g )NzArguments: r%   r   device_namerP   zcuda:r*   r   rM   F)r~   r   T)r   r   r   rj   rk   r   setattrr   ro   rc   r]   r*   r   torch_model_directoryr   r   r   
model_namer)   r   rl   r   r%   r^   r_   r   )r   r   ro   r}   r|   rW   r   llamas           r   mainr   ,  sE   D>D
KK+dV$%:D D*dnn67ID-$*A*AU*JRWX\W]P^_D(ELL)9)9:;//277<<3DDN"0td6P6PH;;dnI~~-.2mmU]]{{MF !*Ne

  dnX]mr   __main__r!   )r   argparse.Namespacer   r   )NN)r   r   r|   r;   r}   boolrW   r?   r~   zNone | torch.nn.Moduler   zNone | AutoConfig)r   z	list[str])0
__future__r   r   loggingr   rf   ri   rA   packaging.versionversionr`   r]   benchmark_helperr   dist_settingsr   r   llama_inputsr   r   r	   r
   r   r   llama_torchr   (models.torch_export_patches.cache_helperr   transformersr   r   rb   transformers.cache_utilsr   onnxruntimerp   	getLoggerrj   r   r.   r1   r   r   r   __name__seedrandommanual_seedr\   r   r   <module>r      s    #   	     ) ,  * G # < 1 			2	K FX6 -1 $a
aa a 	a
 *a aHaH  $nN zDIINN4	dF	 r   