
    hh                        S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKr	S SK
r
S SKrS SKrS SKJrJr  S SKJrJr  S SKJrJrJrJrJrJr  S SKJr  S SKJrJrJr  S SK J!r!  S SK"J#r#J$r$J%r%  S SK&r'\RP                  " \)5      r*S	 r+S
\ RX                  S\-4S jr.S
\ RX                  4S jr/S r0S r1S r2S r3S r4S r5SS jr6S r7\)S:X  a  \7" 5         gg)    N)measure_memorysetup_logger)get_rankget_size)add_io_bindings_as_ortvalues%get_merged_sample_with_past_kv_inputsget_msft_sample_inputsget_sample_inputsget_sample_with_past_kv_inputsverify_ort_inputs)ORTModelForCausalLM)ProfilerActivityprofilerecord_function)trange)
AutoConfigAutoModelForCausalLMAutoTokenizerc                     U R                   S;   a  gU R                   S:X  a   [        UR                  5      $ [        UR                  5       5      $ ! [         a"    [        UR                  R
                  5      s $ f = f)N   hf-pt-eagerhf-pt-compiler   hf-ort)benchmark_typeleninputs_names	Exceptiondecoderinput_names
get_inputs)argsmodels     i/var/www/fran/franai/venv/lib/python3.13/site-packages/onnxruntime/transformers/models/llama/benchmark.pyget_ort_model_inputs_lenr$   (   st    >>h&	2u))** u!""  	2u}}0011	2s   A )A=<A=r!   ort_model_inputs_lenc                    Su  p#U R                   S:X  a  SOU R                  R                  nU R                   S;   ax  [        U R                  U R                  U R
                  U R                  SS9n[        U R                  U R                  U R
                  U R                  U R                  SS9nX#4$ U R                   S;   Ga  US	:X  ax  [        U R                  U R                  U R
                  U R                  SS9n[        U R                  U R                  U R
                  U R                  U R                  SS9nX#4$ [        U R                  U R                  U R
                  U R                  S
UU R                  U R                  SSS9
n[        U R                  U R                  U R
                  SU R                  UU R                  U R                  SSS9
n X#4$ U R                   S:X  a  [        U R                  U R                  U R
                  U R                  S
UU R                  U R                  SSU R                  S9n[        U R                  U R                  U R
                  SU R                  UU R                  U R                  SSU R                  S9nX#4$ U R                   S:X  a  US:  n[        U R                  U R
                  S
U R                  UU R                  U R                  US9n[        U R                  U R
                  U R                  SUU R                  U R                  US9nX#4$ [        S5      e)NNNort-msfti   r   T)return_dict)use_fp16r)   >   r      r   pt)seq_lenpast_seq_lenmax_seq_lenr*   use_buffer_shareenginer)      ort-convert-to-onnxort)r-   r.   r/   r*   r0   r1   r)   
world_size   )r.   r-   r/   r*   r0   split_kvz/Unable to auto-detect inputs for provided model)r   configmax_position_embeddingsr
   target_device
batch_sizesequence_lengthr   r*   r   r0   r5   r	   r   )r!   r%   init_inputsiter_inputsr/   r7   s         r#   r    r    5   s2   )K
 --;$AdAdK>>'KKOO  
 5KKOO  ]]
^ ##M 
		
	*1$+""$$ K 9""$$ Kx ##e @"",,'!%!6!6 K @""!11'!%!6!6 KL ##s 
		 5	5;KKOO((#]]!22
 <KKOO--#]]!22
T ##9 
		
	*'!+,KKOO((#]]!22	
 -KKOO--#]]!22	
 ## IJJ    c                 2   Su  pSu  p4U R                   S;   a  U R                  (       a  U R                  OU R                  n[        R                  " 5       n[        R
                  " UU R                  (       a  [        R                  O[        R                  U R                  U R                  SU R                  S9R                  U R                  5      n[        R                  " 5       nU R                   S:X  a  [        R                  " U5      nOnU R                   S;   aF  [        R                   " 5       nU R"                  Ul        U R&                  (       a  SUl        SUl        O[-        SU R                    35      eU R                   S	:X  Ga2  [/        U R0                  5      [2        L a  U R0                  S
   OU R0                  n[/        U R0                  5      [2        L a  U R0                  S   OS nS nS n	[4        R6                  " U R8                  5       H9  n
SU
;  d  SU
;   d  SU
;   a  M  SU
;   d  U
S:X  a  U
nSU
;   a  U
n	SU
;   d  M5  U
nU
n	M;     [        R                  " 5       n[:        R
                  " U R8                  UU	U R                  U R                  SUS:X  a  SOS UUUS9
n[        R                  " 5       nU R                   S;   a  [<        R?                  SU R@                  RC                  U RD                  5       35        [        R                  " 5       n[        RF                  " U R@                  RC                  U RD                  5      UU R0                  /S9n[        R                  " 5       n[<        R?                  SXC-
   S35        U$ )Nr'   r   T)torch_dtypeuse_auth_tokentrust_remote_code	use_cache	cache_dirr   >   r   r(   r3   r2   Cannot recognize r   r   z.onnxz
.onnx_dataz
.onnx.datadecoder_modelz
model.onnxdecoder_with_past_modeldecoder_merged_model)	decoder_file_namedecoder_with_past_file_namerB   rC   use_io_binding
use_mergedproviderprovider_optionssession_options   r(   r3   zLoading model from )	providerszLoaded model in  s)$r   hf_pt_dir_path
model_nametimer   from_pretrainedr*   torchfloat16float32authrE   tor:   compiler4   SessionOptionsr   enable_profilingverboselog_verbosity_levellog_severity_levelr   typeexecution_providertupleoslistdirhf_ort_dir_pathr   loggerinfoort_model_pathformatrankInferenceSession)r!   r"   sess_options
start_timeend_timesourcerN   rO   rJ   rK   filenames              r#   	get_modelrt      s   $E%J >>(,(;(;$$YY[
$44)-EMM99"iinn
 "T
  	 99;/1MM%(E			 M	M))+(,%<</0L,./L+ +D,?,?+@ABBh&15d6M6M1NRW1W4**1-]a]t]t9=d>U>U9VZ_9_42215ei &*#

4#7#78Hh&,(*BlV^F^(*h,.F$,!(H4.6+%1$,!.6+ 9 YY[
#33  /(C99"ii 1\ At-(
 99;AA)$*=*=*D*DTYY*O)PQRYY[
$$&&tyy1../

 99;
KK"8#8"9<=Lr?   c                   ^  T R                   S;   a  [        T R                  5      O"[        T R                  [        R
                  SS9nT R                  (       a  U" U5      n[        R                  U5        U 4S jnU 4S jnU H  nU" 5         U" U5        U" 5         M     SnT R                   S;   a  [        T R                  5      O"[        T R                  [        R
                  SS9n	U	 HJ  nU" 5         [        R                  " 5       n
U" U5        U" 5         [        R                  " 5       nXU
-
  -  nML     T R                   S;  a  [        R                  S5        UT R                  -  nT R                  U-  nT R                  S:X  av  [        R                  S	T R                   35        [        R                  S
T R                   35        [        R                  SU S35        [        R                  SU S35        g )NrQ   zWarm up)filedescc                     > TR                   S:w  a*  TR                  S;   a  TR                  R                  5       $ U4S j$ )NcpurQ   c                     > TR                   S:w  aA  [        R                  R                  5       (       a  [        R                  R	                  5       $ S $ )Nry   c                      g N kwargss    r#   <lambda>=time_fn.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>      r?   devicerX   cudais_availablesynchronizer   r!   s    r#   r   +time_fn.<locals>.<lambda>.<locals>.<lambda>  @    {{e#

(?(?(A(A JJ""$ &%&r?   )r   r   
io_bindingsynchronize_inputsr   s    r#   r   time_fn.<locals>.<lambda>  s>    ;;%D$7$7;^$^ 	**, 	

	
r?   c                     > TR                   S:w  a*  TR                  S;   a  TR                  R                  5       $ U4S j$ )Nry   rQ   c                     > TR                   S:w  aA  [        R                  R                  5       (       a  [        R                  R	                  5       $ S $ )Nry   c                      g r|   r}   r~   s    r#   r   r   (  r   r?   r   r   s    r#   r   r   %  r   r?   )r   r   r   synchronize_outputsr   s    r#   r   r   "  s>    ;;%D$7$7;^$^ 	++- 	

	
r?   r   	Benchmark zBatch Size: zSequence Length: z	Latency: rS   zThroughput: z tps)r   rangewarmup_runsr   sysstdoutr`   ri   rj   num_runsrV   r;   rm   r<   )r!   fninputswarmup_rangeoutputs
input_syncoutput_sync_
total_timebench_rangerp   rq   latency
throughputs   `             r#   time_fnr     s    "EE 	dD$$3::IF  ||V*GJK 
6
  J "EE 	dmmDMM

E 
 YY[

6
99;++
  "EEB4==(G7*JyyA~l4??"345'(<(<'=>?iy+,l:,d34
r?   c                 v   SU R                    SU R                   SU R                  R                  5        SU R                   SU R
                   SUR                  R                  SS5       SU S[        R                  R                  5       S 3nS nU R                  S;   a  [        [        R                  [        R                  /SSS9 n[        S	5         U" U5        S S S 5        S S S 5        WR                  S
S9R!                  U R"                  U R$                  S9n[&        R(                  R+                  U R,                  U S35      n[/        US5       nUR1                  U5        S S S 5        U$ U" U5        U S3nU$ ! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       U$ = f)Nb_sr   -z%Y-%m-%d_%H:%M:%Sr   T)
activitiesrecord_shapesprofile_memorymodel_inferencer6   )group_by_stack_n)sort_by	row_limitz.logwz.json)r;   r<   r   lower	precisionr   __name__replacedatetimenowr   r   CPUCUDAr   key_averagestablept_filter_bypt_num_rowsrf   pathjoin
log_folderopenwrite)	r!   r   r   inputs_typeprefixrs   prof	prof_datafs	            r#   
profile_fnr   R  s    !D$8$8#94;N;N;T;T;V:WWXY]YgYgXhhijnjujuivvwxz  yD  yD  yL  yL  MP  RU  yV  xW  WX  Yd  Xe  ef  go  gx  gx  g|  g|  g~  P  fQ  RFH>>(,,.>.C.CDTXim
 !236
 4

 %%q%9??HYHYeieueu?v	77<<F84A(C AGGI ! O 	6
 XU#O 43
 
 !  Os0   F	F"FF)
F	F
F&)
F8c                   ^^ [         R                  " 5       n[        R                  " U5      nUR	                  SS9  T" T5        U R
                  S:X  a;  [        R                  SUR	                  S S9[        R                  " SS9-   S35        [        R                  " 5         [        R                  R                  5         [        U R                  S:g  UU4S	 jS
9  [         R"                  R%                  5         g )Ng?)intervalr   zCPU usage: F)logical%ry   c                     > T " T5      $ r|   r}   )r   r   s   r#   r   measure_fn.<locals>.<lambda>|  s	    r&zr?   )is_gpufunc)rf   getpidpsutilProcesscpu_percentrm   ri   rj   	cpu_countgccollectrX   r   empty_cacher   r   r   r   flush)r!   r   r   pidprocesss    ``  r#   
measure_fnr   o  s    
))+CnnS!G%vJyyA~k'"5"5t"5"DvGWGW`eGf"f!gghij JJL	JJ4;;%/7IJ JJr?   c                   ^ U4S jnUnU R                   S:X  a  U" U5        U" U5        U R                  (       Ga7  [        XUS5      nU R                   S:X  a~  TR                  R                  R                  5       n[        R                  SU SU 35        [        R                  " U[        R                  R                  U R                  U5      5        [        XUS5      nU R                   S:X  a~  TR                  R                  R                  5       n[        R                  SU SU 35        [        R                  " U[        R                  R                  U R                  U5      5        g [        R                  S5        [        XU5        [!        XU5        [        R                  S	5        [        XU5        [!        XU5        g )
Nc                    > T" S0 U D6nU$ )Nr}   r}   r   r   r"   s     r#   
get_logits$run_hf_inference.<locals>.get_logits  s    /&/r?   r   promptr   	Renaming  to token7
Evaluating `model(inputs)` step to get past_key_values5
Evaluating `model(inputs)` step with past_key_values)r   r   r   r   sessionend_profilingri   warningrf   renamer   r   r   decoder_with_pastrj   r   r   )r!   r=   r>   r"   r   generate_fnnew_lognameold_lognames      `    r#   run_hf_inferencer     s[   4 Ko-K K ||| KJ(*--//==?KNNY{m4}EFIIk277<<#MN KI(*1199GGIKNNY{m4}EFIIk277<<#MN KKJKD{+t+.
KKHID{+t+.r?   c                 v  ^ ^ U U4S jnU4S jnU4S jnT R                   S:w  a  UOUn0 nT R                  (       Ga  U" X5      u  p[        T XyS5      n
TR                  5       n[        R                  SU SU
 35        [        R                  " U[        R                  R                  T R                  U
5      5        [        T 5      mU" X(5      u  p[        T X|S5      n
TR                  5       n[        R                  SU SU
 35        [        R                  " U[        R                  R                  T R                  U
5      5        g [        R                  S	5        U" X5      u  p[        T Xy5        [        T Xy5        [        R                  S
5        U" X(5      u  p[        T X|5        [        T X|5        g )Nc                    > [        TU 5      n TR                  S:w  aI  [        TU TR                  [        TR                  5      TR
                  U5      u  p![        TSU5        X!4$ X4$ )Nry   r   )r   r   r   intrm   r0   setattr)r   kv_cache_ortvaluesr   r!   r"   s      r#   prepare_ort_inputs-run_ort_inference.<locals>.prepare_ort_inputs  sj    "5&1 ;;%-Ivt{{C		ND<Q<QSe.*J D,
311))r?   c                 (   > TR                  U 5        g r|   )run_with_iobinding)r   r"   s    r#   with_io_binding*run_ort_inference.<locals>.with_io_binding  s      ,r?   c                 ,   > TR                  S U 5      nU$ r|   )runr   s     r#   without_io_binding-run_ort_inference.<locals>.without_io_binding  s    ))D&)r?   ry   r   r   r   r   r   r   )r   r   r   r   ri   r   rf   r   r   r   r   rt   rj   r   r   )r!   r=   r>   r"   r   r   r   r   r   ort_init_inputsr   r   ort_iter_inputss   `  `         r#   run_ort_inferencer    sl   *-
 &*[[E%9/?QK|||.@.a+ {XN ))+;-tK=AB
		+rww||DOO[IJ $.@.a+ {WM ))+;-tK=AB
		+rww||DOO[IJ KKJK*<[*]'OD+/t[2
KKHI*<[*]'OD+/t[2r?   c                     U R                   S;   a  [        XX#5        g U R                   S;   a  [        XX#5        g [        SU R                    35      e)N>   r   r   r   rQ   rF   )r   r   r  r   )r!   r=   r>   r"   s       r#   run_inferencer    sN    HHK?			 C	C$[@+D,?,?+@ABBr?   c           
      N   [         R                  " 5       nUR                  SS[        S/ SQS9  UR                  SS[        SSS	9  UR                  S
SSSSS9  UR                  SSS[        S/ SQSS9  UR                  S[        SSS9  UR                  S[        SSS9  UR                  S[        SSS9  UR                  SSS S!9  UR                  S"S#S$S!9  UR                  S%S&[        [        R
                  R                  5       (       a  S'OS(/ S)QS*9  UR                  S+S,[        S-S.9  UR                  S/S0[        S1S.9  UR                  S2S3[        S4S.9  UR                  S5[        S6S.9  UR                  S7[        S8S.9  UR                  S9[        S:S.9  UR                  S;SSS<9  UR                  S=[        S>S?S9  UR                  S@[        SASBS9  UR                  SCSSS<9  UR                  SD[        [        R                  R                  SE5      SFS9  UR                  SG[        SSHSISJ9  UR                  5       n[        R                  R                  UR                  5        [        R                  " UR                  5        SKUR                   ;   az  [#        USLUR$                  R'                  5        SM35        UR(                  SN:X  a  UR(                  SOU 04Ul        O,UR(                  SP:X  a  UR(                  SOU 04Ul        S'Ul        UR                   SQ:X  a  UR*                  (       d   SR5       eUR                   SS;   a  UR,                  (       d   ST5       eUR.                  R1                  SU5      Ul        UR2                  R1                  SU5      Ul        UR4                  SV;   d   UR4                  SW:X  a  UR$                  S(:X  a  SOSXUl        UR6                  (       a9  [9        UR.                  5      S::X  a  [9        UR2                  5      S::X  d   SY5       eU$ )ZNz-btz--benchmark-typeT)r   r   r   r(   r3   )rc   requiredchoicesz-mz--model-namez<Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf'))rc   r  helpz-az--authF
store_truez5Use Hugging Face authentication token to access model)defaultactionr  z-pz--precisionfp32)int4int8fp16r  zePrecision for model. For ONNX models, the model's precision should be set before running this script.)r  rc   r	  r  r  z--hf-pt-dir-pathr   zNPath to directory containing all PyTorch files (e.g. tokenizer, PyTorch model))rc   r	  r  z--hf-ort-dir-pathzhPath to directory containing all ONNX files (e.g. tokenizer, decoder_merged, decoder, decoder_with_past)z--ort-model-pathzPath to ONNX modelz-bz--batch-sizesz1 2)r	  z-sz--sequence-lengthsz32 64 128 256 512z-dz--devicer   ry   )ry   r   rocm)rc   r	  r  z-idz--device-idr   )rc   r	  z-wz--warmup-runsr6   z-nz
--num-runs
   z--seed   z--max-length    z--num-return-sequencesr2   z	--profile)r	  r
  z--pt-filter-byself_cpu_time_totalz"What to filter PyTorch profiler byz--pt-num-rowsi  z.Number of rows for PyTorch profiler to displayz	--verbosez--log-folder.zFolder to cache log filesz--cache-dirz./model_cachez-Cache dir where Hugging Face files are stored)rc   r  r	  r  r4   rd   ExecutionProviderCUDAExecutionProvider	device_idROCMExecutionProviderr   z,Please specify a path to `--hf-ort-dir-path`rQ   z+Please specify a path to `--ort-model-path` >   r  r  r  r  zOPlease provide only one (batch_size, sequence_length) combination for profiling)argparseArgumentParseradd_argumentstrrX   r   r   r   rf   r   r   
parse_argsnprandomseedmanual_seedr   r   r   upperrd   rh   rk   batch_sizessplitsequence_lengthsr   r   r   )rm   parserr!   s      r#   get_argsr(    sc   $$&F

   K   hlAx  
 0t   ]	   w	   !	     
 #  
 **1133'   }3B
oCC
lbA
sA6 S"=
0sAF U<H
s,AHl   c4Fvw
U<H
S"'',,s:KRmn
<   D IINN499	dii  ###*t{{/@/@/B.CCT,UV""&=='+'>'>d@S&TD#$$(??'+'>'>d@S&TD# DK h&##S%SS#AA""Q$QQ"''--c2D 1177<D ..$4469QVZVaVaejVjqw 	N
 ||4##$)c$2G2G.HA.M 	
]	
M Kr?   c                     [        5       n [        5       n[        U 5      n[        UR                  5        [
        R                  UR                  5        S[        R                  R                  l        Xl        Xl        [        R                  " UR                   UR"                  UR$                  UR$                  S9n[&        R                  " UR                   UR"                  UR$                  UR$                  S9nUR(                  S:w  a  SUR                   3OUR(                  nUR*                  S:H  n[-        USU5        [-        USU5        [-        USU5        [-        US	U5        [/        U5      n[1        X'5      nUR2                  S
;   a  [4        R6                  " UR8                  R;                  UR                  5      SS9n	[=        [?        S U	R@                  RB                  5      5      n
U=(       a$    [E        U
5      S:  =(       a    UR(                  S:g  n[-        USU5        O[-        USS5        [F        RH                  " URJ                  URL                  5       Hv  u  pUR                  S:X  a  [
        R                  SU SU S35        [-        US[O        U5      5        [-        US[O        U5      5        [Q        X(5      u  p[S        X.X5        Mx     g )NT)rE   rB   rC   ry   zcuda:r  	tokenizerr8   r:   r*   rQ   F)load_external_datac                      U R                   S:H  $ )NGroupQueryAttention)op_type)nodes    r#   r   main.<locals>.<lambda>  s    T\\=R-Rr?   r   r0   z
Batch size = z and sequence length = z...r;   r<   )*r   r   r(  r   r`   ri   rj   __dict__rX   backendscudnn	benchmarkrm   r5   r   rW   rU   rE   r[   r   r   r   r   rt   r$   r   onnx
load_modelrk   rl   listfiltergraphr/  r   	itertoolsproductr$  r&  r   r    r  )rm   r5   r!   r*  r8   r:   r*   r"   r%   
onnx_model	gqa_nodesr0   r;   r<   r=   r>   s                   r#   mainr>    s@   :DJD>D
KK%)ENN"I O--4>>$))_c_h_hI ''4>>$))_c_h_hF ,0;;%+?eDII;'T[[M~~'HD+y)D(F#D/=1D*h' dOE3D@ AA__T%8%8%?%?		%J_de
 RT^TdTdTiTijk	#SI(:St{{e?S(*:;(%0 (1'8'89I9I4K`K`'a#
99>KK/*5L_L]]`ablC
O4'_)=>#-d#I d< (br?   __main__)r   )8r  r   r   r:  loggingrf   r   rV   numpyr  r5  r   rX   benchmark_helperr   r   dist_settingsr   r   llama_inputsr   r   r	   r
   r   r   optimum.onnxruntimer   torch.profilerr   r   r   tqdmr   transformersr   r   r   onnxruntimer4   	getLoggerr   ri   r$   	Namespacer   r    rt   r   r   r   r   r  r  r(  r>  r}   r?   r#   <module>rL     s      	   	 
      9 ,  4 E E  H H 			8	$
#$X'' $s $DRH&& RjCL:&;/|83vCEP/=d zF r?   