
    h%a                       S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKJr  S SKJrJr  S SKJrJrJrJr  S SKr\R8                  " \5      rSS jrS r S r!S	 r"S
 r#S r$S r%\S:X  a  \%" 5         gg)    )annotationsN)setup_logger)add_io_bindings_as_tensorsget_initial_inputs_and_outputs)
AutoConfigAutoModelForCausalLMAutoTokenizerBitsAndBytesConfigc                   U R                   S;   Ga  S nU R                  S:X  a  U R                  S:X  a  [        SSS[        R
                  S9n[        R                  " U R                  S:w  a  U R                  OU R                  U R                  U R                  U R                  U R                  SSUU R                  S	0S
9	nO [        R                  " U R                  S:w  a  U R                  OU R                  U R                  U R                  U R                  U R                  SU R                  S:X  a  SOSS9R                  U R                   5      nUR'                  5         U R                   S:X  a  [        R(                  " U5      nU$ [*        R,                  " 5       nU R                  S:X  a  SSU R                  04OSn[*        R.                  " U R0                  XE/S9nU$ ! ["         a  n[%        SU5        [        R                  " U R                  S:w  a  U R                  OU R                  U R                  U R                  U R                  U R                  SSS9R                  U R                   5      n S nAGN.S nAff = f)N   pt-eager
pt-compileint4cudaTnf4)load_in_4bitbnb_4bit_use_double_quantbnb_4bit_quant_typebnb_4bit_compute_dtype flash_attention_280GB)	cache_dirtorch_dtypeuse_auth_tokentrust_remote_code	use_cacheattn_implementationquantization_config
max_memorysdpa)r   r   r   r   r   r   z&Try to load a model using eager mode: eagerr   CUDAExecutionProvider	device_idCPUExecutionProvider)sess_options	providers)benchmark_typeonnx_precisiondevicer
   torchfloat16r   from_pretrainedhf_dir_path
model_namer   r   authtrustr$   totarget_device	ExceptionprintevalcompileortSessionOptionsInferenceSessiononnx_model_path)argsmodel
bnb_configer&   eps         m/var/www/fran/franai/venv/lib/python3.13/site-packages/onnxruntime/transformers/models/llama/benchmark_e2e.py	get_modelrB   8   s0   88&(T[[F-B+!*.$)',}}	J )88$($4$4$:  .. ,,#yy"&**$7$. NNF3
E),<<(,(8(8B(>D$$DOO"nn $ 0 0#'99&*jj"@Dv@U)<[a "T''( . 	

,.MM%(E L ))+ {{f$ %{DNN&CD' 	
 $$T%9%9`deL9  ) >B,<<(,(8(8B(>D$$DOO"nn $ 0 0#'99&*jj"(/ "T''( 	)s   BG* *
J4BJJc                `   U R                   S:X  a&  [        R                  " 5          U" S0 UD6nS S S 5        S nU R                   S;   a:  U R                  S:w  a)  [        R                  R                  U R                  5        O1[        XX@R                  U R                  5      nUR                  5         [        R                  " 5       n[        U5       H  nU R                   S;   aa  [        R                  " 5          U" S0 UD6nU R                  S:w  a)  [        R                  R                  U R                  5        S S S 5        Mt  UR                  U5        UR                  5         M     [        R                  " 5       nX-
  U-  n	X4$ ! , (       d  f       GNd= f! , (       d  f       M  = f)Nr   r   cpu )r(   r+   no_gradr*   r   synchronizer3   r   use_fp16use_buffer_sharesynchronize_inputstimeperf_counterrangerun_with_iobindingsynchronize_outputs)
r<   r=   runsinputsoutputs
io_bindingstart_endavgs
             rA   run_inferencerX   x   sE   l*]]_ofoG  J88;;%JJ""4#5#56/wW[WlWlm
%%' E4["<</&/;;%'JJ**4+=+=> !
 $$Z0**,  


C;$
C<5 _  !s   	F<AF
F
F-	c           
         [        5         [        X#XEU R                  U R                  U R                  U R
                  5      u  pg[        XU R                  Xg5      u  pXg4$ N)clear_cacher   r3   rH   rI   enginerX   warmup_runs)	r<   r=   config	tokenizerprompt_lengthpromptrQ   rR   rU   s	            rA   prepare_model_for_inferencerb      sX    M4=$2D2DdmmUYUjUjlplwlwOF tD,<,<fNJA?    c                 j    [         R                  " 5         [        R                  R	                  5         g rZ   )gccollectr+   r   empty_cacherE   rc   rA   r[   r[      s    JJL	JJrc   c                    [         R                  " U SSSSSSSSS	US
-   S3SUS
-   S3S	U S3SU S3SS/S9nUR                  USS9  [        R	                  SU S35        g )Nz
Batch SizezPrompt LengthzPrompt Processing Latency (ms)z"Prompt Processing Throughput (tps)zSampling Latency (ms)zSampling Throughput (tps)z"First Token Generated Latency (ms)z&First Token Generated Throughput (tps)Average Latency of First    z Tokens Generated (ms)Average Throughput of First z Tokens Generated (tps)zWall-Clock Latency (s)zWall-Clock Throughput (tps))columnsF)indexzResults saved in !)pd	DataFrameto_csvloggerinfo)resultsfilename
gen_lengthdfs       rA   save_resultsrx      s    	,0#'04'
a'88NO*:?*;;RS'
|3IJ*:,6MN$)

B( IIheI$
KK#H:Q/0rc   c                    [         R                  " 5       n U R                  SS[        S/ SQS9  U R                  SS[        SS	S
9  U R                  SSSSSS9  U R                  SSSSSS9  U R                  SS[        [        R
                  R                  SS5      SS9  U R                  S[        SSS9  U R                  SSSSS9  U R                  S S!S[        R
                  R                  SS"S#S$5      S%S&9  U R                  S'SSS(S9  U R                  S)SSS*S94  U R                  S+S,S-S.9  U R                  S/S0S1S.9  U R                  S2S3S[        S4/ S5QS6S79  U R                  S8S9[        S:S;S9  U R                  S<S=[        [        R                  R                  5       (       a  S>OS?S?S>/S@9  U R                  SASB[        SCSD9  U R                  SESF[        SGSD9  U R                  SHSI[        SJSD9  U R                  SK[        SLSD9  U R                  5       n[        R                  R                  UR                  5        [        R                  " UR                  5        SMUR                   ;   aW  [#        USNUR$                  R'                  5        SO35        UR(                  SP:X  a  UR(                  SQUR*                  04Ul        UR                   SM:X  a  UR,                  (       d   SR5       eUR.                  R1                  SS5      Ul        UR2                  R1                  SS5      Ul        [#        USTUR4                  5        UR4                  SU;   d   UR4                  SV:X  a  UR$                  S?:X  a  S4OSWUl        UR$                  S?:w  a  SXUR*                   3OUR$                  nUR4                  SW:X  a  [        R6                  O[        R8                  nUR                   SM:X  a  SMOSYn[#        USZU5        [#        US[U5        [#        US\U5        [#        US]UR4                  SW:H  5        UR:                  =(       a    USM:H  Ul        U$ )^Nz-btz--benchmark-typeT)r   r   r8   )typerequiredchoicesz-mz--model-nameFz<Hugging Face name of model (e.g. 'meta-llama/Llama-2-7b-hf'))rz   r{   helpz-az--auth
store_truez5Use Hugging Face authentication token to access model)defaultactionr}   z-tz--trustzeWhether or not to allow for custom models defined on the Hugging Face Hub in their own modeling filesz-cz--cache-dir.model_cachezPath to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). Use when loading model as `AutoModel.from_pretrained(model_name, cache_dir=cache_dir)`.)rz   r   r}   z--hf-dir-pathr   zPath to directory containing all Hugging Face files (e.g. config, tokenizer, PyTorch model). Use when loading model as `AutoModel.from_pretrained(folder_path)`.z-oz--onnx-model-pathzPath to ONNX model)r{   r}   z-fz--prompts-filemodelsllamazprompts.jsonzsJSON file containing entries in the format 'prompt length: prompt' where prompt length = tokenized length of prompt)r{   r   r}   z--use_buffer_sharez3Use when GroupQueryAttention (GQA) is in ONNX modelz--anomaly-filteringzUse this flag to filter anomaly accelerator times for tokens generated.               This may give more accurate latency and throughput metrics for tokens generated.               Wall-clock metrics are still reported with anomaly times though.z-bz--batch-sizesz1 2)r   z-sz--prompt-lengthsz16 64 256 1024z-pz--precisionfp32)r   int8fp16r   zePrecision for model. For ONNX models, the model's precision should be set before running this script.)r{   rz   r   r|   r}   z-gz--generation-length   z Number of new tokens to generatez-dz--devicer   rD   )rz   r   r|   z-idz--device-idr   )rz   r   z-wz--warmup-runs   z-nz
--num-runsd   z--seedrj   r8   execution_providerExecutionProviderr#   r$   z,Please specify a path to `--onnx-model-path` r)   >   r   r   r   r   zcuda:ptr3   r   r\   rH   )argparseArgumentParseradd_argumentstrospathjoinintr+   r   is_available
parse_argsnprandomseedmanual_seedr(   setattrr*   upperr   r$   r;   batch_sizessplitprompt_lengths	precisionr,   float32rI   )parserr<   r3   r   r\   s        rA   get_argsr      s   $$&F
1   K   D   t   S-0 D    p	   !	   S(G^D C   B	   	!P	 	 	
	        0t   /   **1133   }3B
oCC
lcB
sA6D IINN499	dii  ###*t{{/@/@/B.CCT,UV""&=='+'>'>dnn@]&^D# e###S%SS#''--c2D--33C8D D"DNN3..$4469QVZVaVaejVjqw 	N 15u0DeDNN+,$++M#'>>V#;%--K))U2UFD/=1D--D(F#D*dnn67 11EfoDKrc   c                   ^9^: [        5       n [        S5        [        R                  U R                  5        S n[        U R                  5       n[        R                  " US S9nS S S 5        [        R                  " U R                  S:w  a  U R                  OU R                  U R                  U R                  U R                  S9n[         R                  " U R                  S:w  a  U R                  OU R                  U R                  U R                  U R                  S9n[#        U 5      n/ n[$        R&                  " U R(                  U R*                  5       GH]  u  px[-        U5      [-        U5      p[        R                  SU SU 35        [/        5         XR0                  -   n	X;  a\  [3        [4        R6                  " SU S	U R                   S
U R                   SU SU SU SU SU SU R                   SU S35      5      eX   /U-  n
Xx/n [        R                  S5        [9        XX4X5      u  p[;        XU R<                  X5      u  pUS-  nXxU-  -  n[        R                  SU S35        [        R                  SXxU-  -   S35        UR?                  UU/5        [        R                  S5        [/        5         [9        XX4X5      u  pUS   RA                  5       nURB                  S   nURD                  n[G        US5      (       a  URH                  OURJ                  URL                  -  n[N        RP                  " XpRR                  [N        RT                  S9n/ n/ n[V        RX                  " 5       nUU	::  Ga~  [;        XSX5      u  nnUR[                  U5        [V        RX                  " 5       nUS   RB                  S   S:  a  US    R]                  S5      S-
  nUR_                  SS!9Ra                  SURb                  5      Re                  USURb                  5      n[N        Rf                  " US   SU5      Ri                  5       nOUS   S S 2SS S 24   n[N        Rj                  " USS!9nUU-  URl                  :H  nURo                  UURl                  5      Rq                  US/5      n[V        RX                  " 5       n UR[                  U U-
  5        [N        Rr                  " UU/SS!9nUS-  nUUS'   [N        Rr                  " US    U) Ru                  [N        Rv                  5      Rq                  US5      /S5      US '   S"U;   a1  [N        Rx                  " US"   SS!9S#   Rq                  US5      S-   US"'   US   RB                  S   S:w  a"  US   S S 2S S2S S 24   R{                  5       US'   US   R}                  5         U R~                  S$:X  a
  US%   US%'   GOU R                  (       d  [        UR                  5       H#  n!US&U! S'3   US(U! S'3'   US&U! S)3   US(U! S)3'   M%     US    RB                  S   n"[        UR                  5       H  n![N        RP                  " UUU"UU RR                  U R                  S9n#[N        RP                  " UUU"UU RR                  U R                  S9n$UR                  S&U! S'3U#R{                  5       S&U! S)3U$R{                  5       05        M     UU	::  a  GM~  [V        RX                  " 5       n%UR                  S#5        U R                  (       ab  S*m9[        U5      m:[        U5      n&[        [        U9U:4S+ jU5      5      n[        U5      n'[        R                  S,U&U'-
   S-T9 S.T:S-   S/35        []        U5      [        U5      -  n(U(S-  n)USU(-  -  n*[        R                  S0U) S35        [        R                  S1U* S35        US#   n+U+S-  n,USU+-  -  n-[        R                  S2U, S35        [        R                  S3U- S35        U R0                  S4-  n.[]        US U. 5      [        US U. 5      -  n/U/S-  n0USU/-  -  n1[        R                  S5U. S6U0 S35        [        R                  S7U. S6U1 S35        []        U5      [        U5      -  n2U2S-  n3USU2-  -  n4[        R                  S5U R0                   S6U3 S35        [        R                  S7U R0                   S6U4 S35        U%U-
  n5XxU R0                  -   U5-  -  n6[        R                  S8U5 S935        [        R                  S:XxU R0                  -   U5-  -   S35        [        R                  S;5        UR?                  U)U*U,U-U0U1U3U4U5U6/
5        UR[                  U5        GM`     S>U R~                   S?[        R                  R                  5       S@ SA3n8[        UU8U R0                  5        g ! , (       d  f       G	N= f! [         a*  n7[        R                  S<U SU S=U7 35         S n7A7GM  S n7A7ff = f)BNFc                l    U R                  5        VVs0 s H  u  p[        U5      U_M     snn$ s  snnf rZ   )itemsr   )dkvs      rA   <lambda>main.<locals>.<lambda>j  s,    STSZSZS\<]S\41SVQYS\<]<]s   0)object_hookr   )r   r   r   zRunning batch size = z, prompt length = z2
                                A prompt of size z was not found in 'zv'. There are a couple of solutions to fix this.
                                1) You can change one of the keys in 'z' to be z).
                                    If za < actual prompt's length, the benchmark E2E tool will repeat the first word in the prompt until zB = actual prompt's length.
                                    If zm > actual prompt's length, the benchmark E2E tool will automatically trim the actual prompt's length so that zd = actual prompt's length.
                                2) You can add a new key-value entry in 'z' of the form 'z,': 'your prompt goes here'.
                zMeasuring prompt processing...i  z&Average Latency of Prompt Processing: z msz)Average Throughput of Prompt Processing: z tpszMeasuring token generation...	input_idshead_dim)r*   dtype   logitsattention_mask)dimposition_idsr   r   past_key_valueszpresent.z.keyzpast_key_values.z.value
   c                   > U TT-  :  $ rZ   rE   )acc_timeanomaly_threshold_factor
min_time_ss    rA   r   r     s    H7OR\7\,\rc   zFiltered out z$ anomaly accelerator times that are zx greater than z ms...zAverage Latency of Sampling: z Average Throughput of Sampling: z"Latency of First Token Generated: z%Throughput of First Token Generated: rj   ri   z Tokens Generated: rk   zWall-Clock Latency: z szWall-Clock Throughput: zAdding results to CSVz$Could not benchmark at batch size = z - 
benchmark__e2e_z%Y-%m-%d_%H:%M:%Sz.csv)Or   r   rr   rs   __dict__openprompts_filejsonloadr   r-   r.   r/   r   r0   r1   r	   rB   	itertoolsproductr   r   r   r[   generation_lengthNotImplementedErrortextwrapdedentrb   rX   num_runsextendcloneshapenum_key_value_headshasattrr   hidden_sizenum_attention_headsr+   zerosr3   boolrK   rL   appendsum	unsqueezerepeat
vocab_sizeviewgathersqueezeargmaxeos_token_idmasked_fillreshapecatr2   int64max
contiguouszero_r\   rI   rM   num_hidden_layersr   updatepopanomaly_filteringminlenlistfilterr4   datetimenowrx   );r<   size_to_promptfr^   r_   r=   all_csv_metrics
batch_sizer`   
max_lengthra   csv_metricsrQ   rR   accelerator_prompt_latency_saccelerator_prompt_latency_msaccelerator_prompt_thrptall_token_idscurrent_length	num_heads	head_sizehas_eosaccelerator_timessampling_timeswall_clock_start_timeaccelerator_time_latency_ssampling_start_timeprompt_end_indicesidxsnext_token_logitsnext_tokenstokens_to_addsampling_end_timeinew_sequence_lengthpresent_keypresent_valuewall_clock_end_time	orig_sizenew_sizeavg_sampling_latency_savg_sampling_latency_msavg_sampling_thrptfirst_token_latency_sfirst_token_latency_msfirst_token_thrpthalfwayhalfway_token_latency_shalfway_token_latency_mshalfway_token_thrptall_token_latency_sall_token_latency_msall_token_thrptwall_clock_latency_swall_clock_thrptr?   ru   r   r   s;                                                            @@rA   mainr  b  s   :D
KK N	d	 A12]^ 
! '' ,,2..yy**	F -- ,,2..yy**	I dOEO%.%6%6t7G7GI\I\%]!
$'
OS5GM+J<7I-YZ"%;%;;
.%22?@STXTeTeSf gGGKGXGXFYYaboap q((5  7X  Yf  Xg g((5  7d  er  ds sJJNJ[J[I\\klykz {
 
 !/0:=!1H	tKK899$vZgpOF4A$t}}^d4n1( -I4,O)'1Ea5a'b$KK@A^@__bcdKK;JZvJv<w;xx|}  =?WXY KK78M9$vZgpOF";/557M*004N22I#*6:#>#>FDVDVZ`ZtZtDt  kk*5G5GuzzZG !#N$($5$5$7! J.6CDQRTZ6d3*G!(()CD '+&7&7&9#8$**1-1)/0@)A)E)Ea)H1)L&*444;6#4#45j!V->->? 
 ).WX5F4(P(X(X(Z%(/(9!R((C%#ll+<"E "K/93I3II !, 7 7AWAW X ` `blnoap q$($5$5$7!%%&7:M&MN %		=-*Hb Q!# '4{#+099,-}}U[[/I/Q/QR\^_/`acd,'( "V+-2YYvn7MST-UVW-X-`-`akmn-ors-sF>* 8$**1-2(/(9!RaR((C(N(N(PGH%!'') ;;$&078I0JF,-.."6#;#;<=DxPQsRVEW=X!1!D9:?FRSQTTZG[?\!1!F;< = +11A*B*H*H*K'"6#;#;<&+kk&%/%#'#5#5"&"2"2' ).&%/%#'#5#5"&"2"2)  "*1#T 2K4J4J4L"*1#V 4m6N6N6P# =o !J.^ #'"3"3"5 !!!$%%+-( !23
 12	$(\^op%! 01#I$8#99]^v]w  xG  HR  UY  HY  GZ  Z`  a &)%83~;N%N"&<t&C#!+q3I/I!JKK78O7PPSTUKK:;M:NdST %6a$8!%:T%A" *a2G.G HKK<=S<TTWXYKK?@Q?RRVWX ,,1G&)*;HW*E&FM^_g`gMhIi&i#'>'E$",4K0K"LKK3G9<OPhOiilmnKK6wi?RSfRggklm #&&7"83?P;Q"Q#6#= (A0C,CDOKK+D,B,B+CCVWkVllop KK6t7M7M6NNabqarrvwx $79N#N )d>T>T.TXl-lmKK./C.DBGHKK)*I_I_9_cw8w*x)yy}~
 KK/0+&*%,'(#($ "";/u &^~ DKK=h.?.?.C.C.EFW-XX\]H(D,B,BCg 
!	 ^  	tKK>zlJ\]j\kknopnqrss	ts,   hS-hJ2h
h
i!ii__main__)r<   zargparse.Namespace)&
__future__r   r   r   re   r   r   loggingr   r   rK   numpyr   pandasro   r+   benchmark_helperr   llama_inputsr   r   transformersr   r   r	   r
   onnxruntimer8   	getLogger__name__rr   rB   rX   rb   r[   rx   r   r  rE   rc   rA   <module>r+     s   @ #   	    	      ) S \ \ 			8	$=@>
12aHzDz zF rc   