
    h                        S SK r S SKrS SKrS SKrS SKrS SKrS SKJr  S SKrS SK	r	S SK
Jr  SSSSSS	S
SSS.	rSSSSS.rS rS rS@S jrS\S\S\4S jrS\S\S\S\4S jrS\S\S\S\S\4
S  jr SAS\S!\S"\4S# jjrS$\4S% jr SAS\S!\4S& jjr SAS\S\S'\S\S\S(\S)\S\S*\S+\S,\S"\4S- jjr   SBS\S\S\S.\4S/ jjr  SCS\S!\4S0 jjr  SCS\S\S'\S\S\S(\S)\S\S*\S+\S.\S"\4S1 jjr  SDS2\S3\S\S\S(\S)\S\S*\S+\S4\S5\S6\4S7 jjr   SES2\S3\S\S\S\S(\S)\S\S*\S+\S4\S5\S6\S"\4S8 jjr    SES2\S3\S\S\S(\S)\S\S*\S+\S4\S5\S"\4S9 jjr!   SES2\S3\S\S\S(\S)\S\S*\S+\S4\S5\S"\4S: jjr" SFS\S\S\S\S\S(\S)\S\S*\S+\S"\4S; jjr#S< r$SFS= jr%S> r&\'S?:X  a  S SK(r( \&" 5         gg! \) a!    \(RT                  " \RV                  " 5       6    gf = f)G    N)Pathmeasure_memoryzrunwayml/stable-diffusion-v1-5zstabilityai/stable-diffusion-2z stabilityai/stable-diffusion-2-1z+stabilityai/stable-diffusion-xl-refiner-1.0z/stabilityai/stable-diffusion-3-medium-diffusersz'stabilityai/stable-diffusion-3.5-mediumz&stabilityai/stable-diffusion-3.5-largez black-forest-labs/FLUX.1-schnellzblack-forest-labs/FLUX.1-dev)	1.5z2.02.1zxl-1.0z3.0Mz3.5Mz3.5LzFlux.1SzFlux.1DCUDAExecutionProviderROCMExecutionProviderMIGraphXExecutionProviderTensorrtExecutionProvider)cudarocmmigraphxtensorrtc                      / SQn SnX4$ )N)
z.a photo of an astronaut riding a horse on marsz@cute grey cat with blue eyes, wearing a bowtie, acrylic paintingzia cute magical flying dog, fantasy art drawn by disney concept artists, highly detailed, digital paintingzdan illustration of a house with large barn with many cute flower pots and beautiful blue sky sceneryzgone apple sitting on a table, still life, reflective, full color photograph, centered, close-up productzWbackground texture of stones, masterpiece, artistic, stunning photo, award winner photozSnew international organic style house, tropical surroundings, architecture, 8k, hdrznbeautiful Renaissance Revival Estate, Hobbit-House, detailed painting, warm colors, 8k, trending on Artstationzcblue owl, big green eyes, portrait, intricate metal design, unreal engine, octane render, realisticzldelicate elvish moonstone necklace on a velvet background, symmetrical intricate motifs, leaves, flowers, 8kz*bad composition, ugly, abnormal, malformed )promptsnegative_prompts     t/var/www/fran/franai/venv/lib/python3.13/site-packages/onnxruntime/transformers/models/stable_diffusion/benchmark.pyexample_promptsr   (   s    G CO##    c                      g)N)zwarm upbadr   r   r   r   warmup_promptsr   ;   s    r   c                     [        SXUS9$ )NT)is_gpufuncmonitor_typestart_memoryr   )r   r   r   s      r   measure_gpu_memoryr   ?   s    DZfggr   
model_name	directorydisable_safety_checkerc                 t   SSK JnJn  SS KnUbH  [        R
                  R                  U5      (       d   eUR                  5       nUR                  UUUS9nOUR                  U SUSS9nUR                  UR                  R                  5      Ul
        UR                  SS9  U(       a  S Ul        S Ul        U$ )Nr   )DDIMSchedulerOnnxStableDiffusionPipeline)providersess_optionsonnxT)revisionr&   use_auth_tokendisable)	diffusersr$   r%   onnxruntimeospathexistsSessionOptionsfrom_pretrainedfrom_config	schedulerconfigset_progress_bar_configsafety_checkerfeature_extractor)	r    r!   r&   r"   r$   r%   r.   session_optionspipes	            r   get_ort_pipeliner<   C   s    Dww~~i((((%446*::( ; 
 +::	 ; 
 #..t~~/D/DEDN   ."!%Kr   enable_torch_compileuse_xformersc                    SU ;   a  SSK Jn  UR                  U [        R                  S9R                  S5      nU(       aL  UR                  R                  [        R                  S9  [        R                  " UR                  SSS	9Ul        U$ S
U ;   a  SSK J	n  UR                  U [        R                  S9R                  S5      nU(       aL  UR                  R                  [        R                  S9  [        R                  " UR                  SSS	9Ul        U$ SSK J
nJn  SSKJn	Jn
  UR                  X
S9R                  S5      nUR                  R                  U	S9  U(       a  UR                  5         U(       az  [        R                  " UR                  5      Ul        [        R                  " UR                  5      Ul        [        R                  " UR                   5      Ul        [#        S5        UR%                  UR&                  R(                  5      Ul        UR+                  SS9  U(       a  S Ul        S Ul        U$ )NFLUXr   )FluxPipeline)torch_dtyper   )memory_formatzmax-autotuneT)mode	fullgraphzstable-diffusion-3)StableDiffusion3Pipeline)r$   StableDiffusionPipeline)channels_lastfloat16z)Torch compiled unet, vae and text_encoderr+   )r-   rA   r3   torchbfloat16totransformerrH   compilerF   r$   rG   rI   unet*enable_xformers_memory_efficient_attentionvaetext_encoderprintr4   r5   r6   r7   r8   r9   )r    r"   r=   r>   rA   r;   rF   r$   rG   rH   rI   s              r   get_torch_pipelinerT   a   s   *++JENN+SVVW]^e.A.AB$}}T-=-=N^bcDz)6'77
PUP^P^7_bbcije.A.AB$}}T-=-=N^bcD@,"22:2SVVW]^DIILL}L-779MM$)),	==*!MM$*;*;<9:"..t~~/D/DEDN   ."!%Kr   engine
batch_sizestepsc                     UR                  S5      S   R                  SS5      nU  SU SU SU 3U(       a  S-   $ S	-   $ )
N/zstable-diffusion-sd__b_s _safe)splitreplace)rU   r    rV   rW   r"   short_model_names         r   get_image_filename_prefixrd      sV    !'',R0889LdSXQ'(:,b@J`Bnnfmnnr   image_filename_prefixskip_warmupc                   ^ ^^^^^
 SSK Jn  [        T U5      (       d   e[        5       u  pUUU U
UU4S jn[	        XU5      n[	        XU5      nU" 5         / n[        U5       H  u  nnUU:  a    O[        R                  " 5       nT " U/T-  TTTU/T-  S9R                  n[        R                  " 5       nUU-
  nUR                  U5        [        SUS S35        [        U5       H   u  nnUR                  U SU SU S	35        M"     M     SS
KJn  SUTTTTUU[        U5      [        U5      -  [        R                   " U5      UUS.$ )Nr   )r%   c                  R   > T(       a  g [        5       u  pT" U /T-  TTTU/T-  S9  g )Npromptheightwidthnum_inference_stepsr   r   )rj   negativerV   rk   r;   rf   rW   rl   s     r   warmup run_ort_pipeline.<locals>.warmup   s9    )+8j( %%J3	
r   ri   Inference took .3f secondsr\   .jpg__version__r.   rU   versionrk   rl   rW   rV   batch_countnum_promptsaverage_latencymedian_latencyfirst_run_memory_MBsecond_run_memory_MB)r-   r%   
isinstancer   r   	enumeratetimeimagesappendrS   saver.   rw   sumlen
statisticsmedian)r;   rV   re   rk   rl   rW   r{   rz   r   memory_monitor_typerf   r%   r   r   rp   first_run_memorysecond_run_memorylatency_listirj   inference_startr   inference_endlatencykimageort_versions   `` ```    `                r   run_ort_pipeliner      so    6d78888.0G

 

 **=|T*+>U
HLw'	6))+8j( %,-
:
 & 	 		/1G$}H56!&)HAuJJ/0!AaS=> * ($ 7   ""|,s</@@$++L9/ 1 r   returnc                     U(       d  U(       a  SU 0O	SU /U-  0O0 n[         R                  R                  5       (       a&  [         R                  " SS9R	                  S5      US'   U$ )Nr   r   )device{   	generator)rJ   r   is_available	Generatormanual_seed)r   use_num_images_per_promptis_fluxrV   kwargss        r   get_negative_prompt_kwargsr      sh      ) 0#o%6%CD   zz  #ooV<HHM{Mr   c                 p  ^ ^^^^^
^ [        5       u  pSS Kn[        T UR                  5      mUUUU U
UU4S jn[	        XU5      n[	        XU5      nU" 5         [
        R                  " S5        / n[        U5       H  u  nnUU:  a    O[
        R                  R                  5         [        R                  " 5       n[        USTT5      nT " SU/T-  TTTS.UD6R                  n[
        R                  R                  5         [        R                  " 5       nUU-
  nUR                  U5        [        SUS S35        [        U5       H   u  nnUR                  U SU SU S	35        M"     M     S
[
        R                   TTTTUU[#        U5      [%        U5      -  [&        R(                  " U5      UUS.$ )Nr   c                  l   > T(       a  g [        5       u  p[        USTT5      nT" SU /T-  TT	TS.UD6  g )NFrj   rk   rl   rm   r   r   r   )
rj   ro   extra_kwargsrV   rk   r   r;   rf   rW   rl   s
      r   rp   "run_torch_pipeline.<locals>.warmup  sC    )+1(E7JWqVHz)&[`qdpqr   Fr   rr   rs   rt   r\   ru   rJ   rx   r   )r   r-   r   rA   r   rJ   set_grad_enabledr   r   synchronizer   r   r   r   rS   r   rw   r   r   r   r   )r;   rV   re   rk   rl   rW   r{   rz   r   r   rf   r   r   r-   rp   r   r   r   r   rj   r   r   r   r   r   r   r   r   s   `` ```    `                @r   run_torch_pipeliner      s     /0Gy556Gr r **=|T*+>U
H	5!Lw'	6

 ))+1/5'S]^ 
8j( %	

 
 & 	 	

 		/1G$}H56!&)HAuJJ/0!AaS=> *' (. $$ ""|,s</@@$++L9/ 1 r   r&   rk   rl   r{   rz   tuningc                 P   UnU(       a  US;   a  USSS.4n[         R                   " 5       n[        XX5      n[         R                   " 5       n[        SUU-
   S35        [        SXXt5      n[	        UUUUUUUU	U
UUS9nUR                  U UUR                  SS	5      US
S.5        U$ )N)r   r	      )tunable_op_enabletunable_op_tuning_enableModel loading took rt   ortrf   ExecutionProviderr_   Fr    r!   r&   r"   enable_cuda_graph)r   r<   rS   rd   r   updaterb   )r    r!   r&   rV   r"   rk   rl   rW   r{   rz   r   r   r   rf   provider_and_options
load_startr;   load_endre   results                       r   run_ortr   :  s      $(PP (_`*abJJ3G`Dyy{H	: 56h
?@5eZUZsF MM$" (()<bA&<!&	
 Mr   use_io_bindingc                     SSK Jn  Ub5  [        R                  R	                  U5      (       a  UR                  XUS9nO#UR                  U SUUS9nUR                  U5        U(       a  S Ul        S Ul        U$ )Nr   )ORTPipelineForText2Image)r&   r   T)exportr&   r   )	optimum.onnxruntimer   r/   r0   r1   r3   save_pretrainedr8   r9   )r    r!   r&   r"   r   r   pipelines          r   get_optimum_ort_pipeliner   n  s~     =	!:!:+;;Iiw;x+;;)	 < 
 	  +"&%)"Or   c                   ^ ^^^^^^
^^ [        S[        T 5      5        SSKJn  [	        T U5      m[        5       u  pUUUUU UUU
U4	S jn[        XU5      n[        XU5      nU" 5         [        UT
TT5      n/ n[        U5       H  u  nnUU:  a    O[        R                  " 5       nT
(       a  T " SUTTTTS.UD6R                  nOT " SU/T-  TTTS.UD6R                  n[        R                  " 5       nUU-
  nUR                  U5        [        SUS S	35        [        U5       H   u  nnUR                  U S
U S
U S35        M"     M     SSKJn  SUTTTTTU[        U5      [!        U5      -  ["        R$                  " U5      UUS.$ )NzPipeline typer   )ORTFluxPipelinec            	         >	 T(       a  g [        5       u  p[        UT
TT5      nT
(       a  T" SU TTT	TS.UD6  g T" SU /T-  TTT	S.UD6  g )Nrj   rk   rl   rm   num_images_per_promptr   r   r   )rj   ro   r   rz   rV   rk   r   r;   rf   rW   r   rl   s      r   rp   (run_optimum_ort_pipeline.<locals>.warmup  sq    )+1(<UW^`jk$ $)&1  u:-fE_duhtur   r   r   rr   rs   rt   r\   ru   rv   optimum_ortrx   r   )rS   type&optimum.onnxruntime.modeling_diffusionr   r   r   r   r   r   r   r   r   r   r.   rw   r   r   r   r   )r;   rV   re   rk   rl   rW   r{   rz   r   r   r   rf   r   r   r   rp   r   r   r   r   r   rj   r   r   r   r   r   r   r   r   s   `` ``` `  ``                 @r   run_optimum_ort_pipeliner     s    
/4:&F/G.0Gv v& **=|T*+>U
H-o?XZacmnLLw'	6))+$ $)&0  f   x*,V5^cgsf  		/1G$}H56!&)HAuJJ/0!AaS=> *+ (0 7   ""|,s</@@$++L9/ 1 r   c                 l   [         R                   " 5       n[        XX$US9n[         R                   " 5       n[        SUU-
   S35        U(       a  U S-   [        U5      R                  -   OU n[        SUX7U5      n[        UUUUUUUU	U
UUS9nUR                  U UUR                  SS5      US	S
.5        U$ )Nr   r   rt   r\   optimumr   r   r_   Fr   )	r   r   rS   r   namerd   r   r   rb   )r    r!   r&   rV   r"   rk   rl   rW   r{   rz   r   r   r   rf   r   r;   r   full_model_namere   r   s                       r   run_optimum_ortr     s      J#xP^D yy{H	: 56h
?@AJj3&i)=)==PZO5?J7M &F MM$" (()<bA&<!&	
 Mr   work_dirry   max_batch_sizenvtx_profileuse_cuda_graphc                   ^^^^^- [        S5        SSKJn  U" 5         TU::  d   eSSKJn  U" U5      nUR                  5       nSSKJnJn  SSK	J
n  UR                  nU" U UU5      u  nnnnnU" USUSUUUUUS	9	m-T-R                  R                  UUUS
TTTSSS[        R                  R!                  5       S9  T-R#                  TTT5        UUU-UU4S jn[%        U
UU	5      n[%        U
UU	5      nU" 5         ['        SUTTU5      n/ n[)        5       u  n n![+        U 5       H  u  n"n#U"U:  a    O[,        R,                  " 5       n$T-R/                  U#/T-  U!/T-  TTTSSS9u  n%n&[,        R,                  " 5       n'U'U$-
  n(UR1                  U(5        [        SU(S SU& 35        [+        U%5       H   u  n)n*U*R3                  U SU" SU) S35        M"     M     T-R5                  5         SSKJn+  SSKJn,  0 SUR=                  5       _SS_SU,_SSU+ S3_SU_S T_S!T_S"T_S#T_S$U_S%U_S&[?        U5      [A        U5      -  _S'[B        RD                  " U5      _S(U_S)U_S*U_S+U_$ ),Nzd[I] Initializing ORT TensorRT EP accelerated StableDiffusionXL txt2img pipeline (static input shape)r   init_trt_pluginsPipelineInfo
EngineTypeget_engine_pathsrG   DDIMFr5   
output_dirverboser   r   r   framework_model_direngine_type   T)opt_image_heightopt_image_widthopt_batch_sizestatic_batchstatic_image_shapemax_workspace_size	device_idc                  T   > [        5       u  pTR                  U /T-  U/T-  TTTS9  g N)denoising_stepsr   run)rj   ro   rV   rk   r   rW   rl   s     r   rp   "run_ort_trt_static.<locals>.warmup\  s3    )+fX
*XJ,CVUdijr   ort_trtg      @r   r   guidanceseedEnd2End took rs    seconds. Inference latency: r\   ru   rv   r    rU   r.   ry   r&   z	tensorrt()r!   rk   rl   rW   rV   rz   r{   r|   r}   r~   r   r"   r   )#rS   trt_utilitiesr   diffusion_modelsr   
short_nameengine_builderr   r   pipeline_stable_diffusionrG   ORT_TRTbackendbuild_enginesrJ   r   current_deviceload_resourcesr   rd   r   r   r   r   r   r   teardownr   rw   r.   r   r   r   r   r   ).r   ry   rV   r"   rk   rl   rW   r{   rz   r   r   r   r   r   r   r   pipeline_infor   r   r   rG   r   onnx_dir
engine_dirr   r   r\   rp   r   r   re   r   r   r   r   rj   r   r   pipeline_timer   r   r   r   trt_versionr   r   s.     ` ```                                      @r   run_ort_trt_staticr    s9     

pq /'''- )M))+J;A$$K?OPXZgit?u<Hj*&91 '!%%/
H ""
!**++- #   FE:6k k **=v|T*+>U
H5iZY^`vwL.0G_w'	6))+ (Hz!
*! !- !
 		/1G$gc]*GWX!&)HAuJJ/0!AaS=> *% (* 36m((*- 	; 	i}A.	
 	Z 	& 	 	 	j 	{ 	{ 	3|,s</@@ 	*++L9 	/ 	 1  	!"8!" 	^# r   c                   ^^^^^^1 [        S5        SSKJn  SSKJn  U" 5         TU::  d   eSSKJn  U" U5      nSSKJnJ	n  SSK
Jn  UR                  nU" U UU5      u  nnnnnU" USUS	UUS
US9m1T1R                  R                  UUUSTTTS
S
S	US9  [        T1R                  R!                  5       T1R                  R!                  5       5      nUR#                  U5      u  nnT1R                  R%                  U5        T1R'                  TTT5        UUU1UUU4S jn [)        UU U
5      n![)        UU U
5      n"U " 5         [+        SUTTU5      n#/ n$[-        5       u  n%n&[/        U%5       H  u  n'n(U'U:  a    O[0        R0                  " 5       n)T1R3                  U(/T-  U&/T-  TTTSS9u  n*n+[0        R0                  " 5       n,U,U)-
  n-U$R5                  U-5        [        SU-S SU+ 35        [/        U*5       H   u  n.n/U/R7                  U# SU' SU. S35        M"     M     T1R9                  5         SS Kn0SU0R<                  STTTTU	U[?        U$5      [A        U$5      -  [B        RD                  " U$5      U!U"US.$ )N][I] Initializing TensorRT accelerated StableDiffusionXL txt2img pipeline (static input shape)r   cudartr   r   r   r   r   FT)r5   r   r   r   r   r   r   r   r	  r   r  
onnx_opsetr   r   r   r   static_shapeenable_all_tacticstiming_cachec                  d   > T(       a  g [        5       u  pTR                  U /T-  U/T-  TTTS9  g r   r   )rj   ro   rV   rk   r   rf   rW   rl   s     r   rp   #run_tensorrt_static.<locals>.warmup  s9    )+fX
*XJ,CVUdijr   trtr   )r   r   r   rs   r   r\   ru   r   default)rU   ry   r&   rk   rl   rW   rV   rz   r{   r|   r}   r~   r   r   )#rS   r   r  r   r   r   r   r   r   r   r   rG   TRTr  load_enginesmaxmax_device_memory
cudaMallocactivate_enginesr  r   rd   r   r   r   r   r   r   r  r   rw   r   r   r   r   )2r   ry   r    rV   r"   rk   rl   rW   r{   rz   r   r   r   r   r   rf   r  r   r   r  r   r   rG   r   r  r	  r   r   r  r  r\   shared_device_memoryrp   r   r   re   r   r   r   r   rj   r   r   r
  r   r   r   r   r  r   s2      ` ```       `                                 @r   run_tensorrt_staticr!    s   $ 

ij /'''- )M;A..KJZ-KGHj*&9<
 '!%	H !!/! ! "  H,,>>@(BRBRBdBdBfg$//0ABA%%&:; FE:6k k **=v|T*+>U
H5eZUZ\rsL.0G_w'	6))+ (Hz!
*! !- !
 		/1G$gc]*GWX!&)HAuJJ/0!AaS=> *# ((  ?? ""|,s</@@$++L9/ 1+ r   c                   ^ ^^^^^^^^^*^+^,^-^.^/ [        S5        SS KnSSKJn  SSKJn  Tm,Tm-T,S-  S:w  d	  T-S-  S:w  a  [        ST, ST- S35      eU" 5         TT::  d   eSS	KJn  SS
K	J
m*Jm+  U*UU+UUUUUU 4	S jnSSKJn  U" U5      nU" UU5      m.[        T.R                  R!                  5       T.R                  R!                  5       5      nUR#                  U5      u  nnT.R                  R%                  U5        T.R'                  T,T-T5        SU,U-U.U4S jjm/UU/U4S jn[)        U
UU	5      n[)        U
UU	5      nU" 5         UR+                  5       n[-        SUTTU5      n/ n[/        5       u  nn [1        U5       H  u  n!n"U!U:  a    O[2        R2                  " 5       n#T/" U"/T-  U /T-  SS9u  n$n%[2        R2                  " 5       n&U&U#-
  n'UR5                  U'5        [        SU'S SU% 35        [1        U$5       H   u  n(n)U)R7                  U SU! SU( S35        M"     M     T.R9                  5         USUR:                  STTTTUU[=        U5      [?        U5      -  [@        RB                  " U5      UUTS.$ )Nr  r   r  r      zCImage height and width have to be divisible by 8 but specified as: z and .r   r   c                    >	 T	R                   nT" TX5      u  p4pVnU " USUSTTTUUS9	nUR                  R                  UUUST
TTSSSUS9  U$ )Nr   Fr   r   Tr  )r  r  r  )pipeline_classr  r   r  r	  r   r   r  r   r   rV   r   rk   r   r   r   rl   r   s            r   init_pipeline-run_tensorrt_static_xl.<locals>.init_pipelineJ  s     nnN^mO
Kj|
 "!%)) 3#

 	%%! 3%#!$% 	& 	
 r   r   c           
      .   > TR                  U UTTTSUS9$ Ng      @r   r   )rj   r   r   image_heightimage_widthr   rW   s      r   run_sd_xl_inference3run_tensorrt_static_xl.<locals>.run_sd_xl_inferencez  s.    ||!  
 	
r   c                  P   > T(       a  g [        5       u  pT" U /T-  U/T-  5        g Nrn   rj   ro   rV   r.  rf   s     r   rp   &run_tensorrt_static_xl.<locals>.warmup  ,    )+VHz1H:
3JKr   r  r   r   r   rs   r   r\   .pngr   r  r    rU   ry   r&   rk   rl   rW   rV   rz   r{   r|   r}   r~   r   r   r1  )"rS   r   r   r  r   r   
ValueErrorr   r   r   r   r   r   rG   r  r  r  r  r  r  r   r   rd   r   r   r   r   r   r  rw   r   r   r   r   )0r   ry   rV   r"   rk   rl   rW   r{   rz   r   r   r   r   r   rf   r  r  r   r   r'  rG   r  r  r\   r   rp   r   r   r    re   r   r   r   r   rj   r   r   r
  r   r   r   r   r   r   r,  r-  r   r.  s0   ` ` ```    ````                           @@@@@@r   run_tensorrt_static_xlr9  #  s   " 

ij. LKa1a1 4QR^Q__depdqqrs
 	

 '''-;! !F B )M4mDHH,,>>@(BRBRBdBdBfg$//0ABA%%&:; L+zB	
 	
L **=v|T*+>U
H##%J5eZUZ\rsL.0G_w'	6))+ 3VHz4IOK\_iKips t		/1G$gc]*GWX!&)HAuJJ/0!AaS=> * (  !?? ""|,s</@@$++L9/ 1+ r   c                   ^^^^^^%^& SSK Jn  SSKJn  U" UUR                  U TTUUTS9m%TU::  d   eT%R                  TTT5        SUU%UU4S jjm&UU&U4S jn[        U
UU	5      n[        U
UU	5      nU" 5         T%R                  R                  5       n[        SUTTU5      n/ n[        5       u  nn[        U5       H  u  nnUU:  a    O[        R                  " 5       nT&" U/T-  U/T-  SS	9u  nn[        R                  " 5       nUU-
  nUR                  U5        [        S
US SU 35        [        U5       H.  u  n n!U SU SU  S3n"U!R                  U"5        [        SU"5        M0     M     T%R!                  5         SSKJn#  SSKJn$  USU$SU# S3TTTTUU[)        U5      [+        U5      -  [,        R.                  " U5      UUUS.$ )Nr   )initialize_pipeline)r   )ry   r   r   rk   rl   r   r   r   c           
      .   > TR                  U UTTTSUS9$ r*  r+  )rj   r   r   rk   r   rW   rl   s      r   r.  +run_ort_trt_xl.<locals>.run_sd_xl_inference  s.    ||!  
 	
r   c                  P   > T(       a  g [        5       u  pT" U /T-  U/T-  5        g r1  rn   r2  s     r   rp   run_ort_trt_xl.<locals>.warmup  r4  r   r   r   r5  r   rs   r   r\   r6  zImage saved torv   r.   r   r   r7  r1  )
demo_utilsr;  r   r   r  r  r   r  r   rd   r   r   r   r   rS   r   r  r   rw   r.   r   r   r   r   )'r   ry   rV   r"   rk   rl   rW   r{   rz   r   r   r   r   r   rf   r;  r   rp   r   r   r    re   r   r   r   r   rj   r   r   r
  r   r   r   r   filenamer  r   r   r.  s'     ` ```       `                      @@r   run_ort_trt_xlrB    s   " /)"&&%%!	H '''FE:6	
 	
L **=v|T*+>U
H'',,.J5iZY^`vwL.0G_w'	6))+ 3VHz4IOK\_iKips t		/1G$gc]*GWX!&)HAu/0!AaS=HJJx "H- * ( 36 !{m1- ""|,s</@@$++L9/ 1+ r   c                 b   S[         R                  R                  l        S[         R                  R                  l        [         R
                  " S5        [        R                  " 5       n[        XX45      n[        R                  " 5       n[        SX-
   S35        [        SXXr5      nU(       d2  [         R                  " 5          [        UUUUUUUU	U
UUS9nS S S 5        O[        UUUUUUUU	U
UUS9nWR                  U S U(       a  SO
U(       a  SOS	USS
.5        U$ ! , (       d  f       N9= f)NTFr   rt   rJ   r   rN   xformersr  r   )rJ   backendscudnnenabled	benchmarkr   r   rT   rS   rd   inference_moder   r   )r    rV   r"   r=   r>   rk   rl   rW   r{   rz   r   r   rf   r   r;   r   re   r   s                     r   	run_torchrJ  !  s*    $(ENN %)ENN"	5!JjBVeDyy{H	 56h
?@5gzW\u!!#'%#'F $# $!#
 MM$%9	\z_h&<!&	
 MM $#s   D  
D.c                     [         R                  " 5       n U R                  SSS[        S/ SQSS9  U R                  SS	S[        S
[	        [
        R                  5       5      SS9  U R                  SSSSS9  U R                  SSS[        [	        [        R                  5       5      SSS9  U R                  SSS[        S SS9  U R                  SSS[        SSS9  U R                  SSSSS 9  U R                  SS!9  U R                  S"SSS#S 9  U R                  SS$9  U R                  S%SSS&S 9  U R                  SS'9  U R                  S(SSS)S 9  U R                  SS*9  U R                  S+SSS,S 9  U R                  SS-9  U R                  S.S/[        S0/ S1QS2S39  U R                  S4S[        S5S6S9  U R                  S7S[        S5S8S9  U R                  S9S:S[        S;S<S9  U R                  S=S>S[        S?S@S9  U R                  SASBS[        [        S0SC5      SDSES9  U R                  SFSGS[        [        S0SH5      SISJS9  U R                  SKSLSSSMS 9  U R                  SSN9  U R                  5       nU$ )ONz-ez--engineFr.   )r.   r   rJ   r   z-Engines to benchmark. Default is onnxruntime.)requiredr   r  choiceshelpz-rz
--providerr   z8Provider to benchmark. Default is CUDAExecutionProvider.z-tz--tuning
store_truezsEnable TunableOp and tuning. This will incur longer warmup latency, and is mandatory for some operators of ROCm EP.)actionrN  z-vz	--versionr   z>Stable diffusion version like 1.5, 2.0 or 2.1. Default is 1.5.)rL  r   rM  r  rN  z-pz
--pipelinez[Directory of saved onnx pipeline. It could be the output directory of optimize_pipeline.py.)rL  r   r  rN  z-wz
--work_dirr$  z?Root directory to save exported onnx models, built engines etc.z--enable_safety_checkerzEnable safety checker)rL  rP  rN  )enable_safety_checkerz--enable_torch_compilez#Enable compile unet for PyTorch 2.0)r=   z--use_xformerszUse xformers for PyTorch)r>   z--use_io_bindingzUse I/O Binding for Optimum.r   z--skip_warmupz
No warmup.r   z-bz--batch_sizer   )r            r#  
          z)Number of images per batch. Default is 1.)r   r  rM  rN  z--heighti   z$Output image height. Default is 512.z--widthz#Output image width. Default is 512.z-sz--steps2   zNumber of steps. Default is 50.z-nz--num_promptsrU  z!Number of prompts. Default is 10.z-cz--batch_count      z(Number of batches to test. Default is 5.z-mz--max_trt_batch_sizerV  rT  zdMaximum batch size for TensorRT. Change the value may trigger TensorRT engine rebuild. Default is 4.z-gz--enable_cuda_graphz/Enable Cuda Graph. Requires onnxruntime >= 1.16)r   )argparseArgumentParseradd_argumentstrlist	PROVIDERSkeys	SD_MODELSset_defaultsintrange
parse_args)parserargss     r   parse_argumentsri  f  sw   $$&F
?<   Y^^%&G   a	   Y^^%&M   j   N   !$	   e4
 2	   U3
'	   U+
+	   u-
	   E*
+8   3   2   .   0   a7   as   >   %0DKr   c                    ^ SS K nUR                  [        R                  " 5       5      nUR	                  5        H;  mU (       a  [        U4S jS 5       5      (       d  M&  [        TR                  5        M=     g )Nr   c              3   @   >#    U  H  oTR                   ;   v   M     g 7fr1  )r0   ).0xlibs     r   	<genexpr>)print_loaded_libraries.<locals>.<genexpr>  s     )`A_Asxx-A_s   )libculibnvr   )psutilProcessr/   getpidmemory_mapsanyrS   r0   )cuda_related_onlyrs  prn  s      @r   print_loaded_librariesrz    sI    ryy{#A}}!c)`A_)`&`&`#((O r   c                  N   [        5       n [        U 5        U R                  S:X  a  U R                  S;   a  S[        R
                  S'   SSKJn  SSKJn  UR                  U5      UR                  S5      :X  a  S[        R
                  S	'   U R                  (       ag  U R                  S:X  a  U R                  S
;   a  U R                  b  [        S5      eUR                  U5      UR                  S5      :  a  [        S5      e[        R                  " SS9  U R                  S:X  a  SOSn[!        US 5      n[        SU5        ["        U R                     n[$        U R                     nU R                  S:X  GaT  U R                  S:X  GaC  SU R                  ;   a  [        S5        ['        U R(                  U R                  U R*                  SU R,                  U R.                  U R0                  U R2                  U R4                  UUU R6                  SU R                  U R8                  S9nGOf[        S5        [;        U R(                  U R                  U R*                  U R<                  (       + U R,                  U R.                  U R0                  U R2                  U R4                  UUU R6                  SU R                  U R8                  S9nGOU R                  S:X  a  US:X  a  SU R                  ;   a  S[        R
                  S	'   [?        UU R                  UU R*                  U R<                  (       + U R,                  U R.                  U R0                  U R2                  U R4                  UUU R@                  U R8                  S9nGOU R                  S:X  a  U R                  (       a.  [        RB                  RE                  U R                  5      (       d   S5       e[        SU SU RF                   35        [I        UU R                  UU R*                  U R<                  (       + U R,                  U R.                  U R0                  U R2                  U R4                  UUU RF                  U R8                  S 9nGOU R                  S:X  a  SU R                  ;   a  [        S!5        [K        U R(                  U R                  U R*                  SU R,                  U R.                  U R0                  U R2                  U R4                  UUU R6                  SU R                  U R8                  S9nGOgU R                  S:X  a  [        S"5        [M        S>0 S#U R(                  _S$U R                  _S%U_S&U R*                  _S'S_S(U R,                  _S)U R.                  _S*U R0                  _S+U R2                  _S,U R4                  _S-U_S.U_S/U R6                  _S0S_S1U R                  _S2U R8                  _6nO[        S3U RN                   S4U RP                   S535        [S        UU R*                  U R<                  (       + U RN                  U RP                  U R,                  U R.                  U R0                  U R2                  U R4                  UUU R8                  S69n[        U5        [U        S7S8S9S:9 n/ S;Qn	[V        RX                  " XS<9n
U
R[                  5         U
R]                  U5        S S S 5        U R0                  S=:X  a  [_        U R                  S
;   5        g g ! , (       d  f       N8= f)?Nr.   )r   1ORT_DISABLE_TRT_FLASH_ATTENTIONr   )ry   rv   z1.16.0!ORT_ENABLE_FUSED_CAUSAL_ATTENTION)r   r   z:The stable diffusion pipeline does not support CUDA graph.z1.16z.CUDA graph requires ONNX Runtime 1.16 or laterz%(funcName)20s: %(message)s)fmtr   r   z&GPU memory used before loading models:r   xlzNTesting Txt2ImgXLPipeline with static input shape. Backend is ORT TensorRT EP.TF)r   ry   rV   r"   rk   rl   rW   r{   rz   r   r   r   r   r   rf   zLTesting Txt2ImgPipeline with static input shape. Backend is ORT TensorRT EP.r   r   )r    r!   r&   rV   r"   rk   rl   rW   r{   rz   r   r   r   rf   z?--pipeline should be specified for the directory of ONNX modelsz/Testing diffusers StableDiffusionPipeline with z provider and tuning=)r    r!   r&   rV   r"   rk   rl   rW   r{   rz   r   r   r   rf   zGTesting Txt2ImgXLPipeline with static input shape. Backend is TensorRT.zETesting Txt2ImgPipeline with static input shape. Backend is TensorRT.r   ry   r    rV   r"   rk   rl   rW   r{   rz   r   r   r   r   r   rf   zNTesting Txt2ImgPipeline with dynamic input shape. Backend is PyTorch: compile=z, xformers=r$  )r    rV   r"   r=   r>   rk   rl   rW   r{   rz   r   r   rf   zbenchmark_result.csvar_   )rD   newline)r    r!   rU   ry   r&   r"   rk   rl   rW   rV   rz   r{   r|   r}   r~   r   r   )
fieldnamesr   r   )0ri  rS   rU   ry   r/   environ	packagingr.   rw   parser   r&   r   r8  coloredlogsinstallr   rb  r`  rB  r   rV   rk   rl   rW   r{   rz   max_trt_batch_sizerf   r  rQ  r   r   r0   isdirr   r   r9  r!  r=   r>   rJ  opencsv
DictWriterwriteheaderwriterowrz  )rh  ry   r   r   r   sd_modelr&   r   csv_filecolumn_names
csv_writers              r   mainr    sI   D	$K{{m#<<7" =@BJJ89%:==%x)@@ ?BBJJ:;!!KK=0T]]FZ5Z_c_l_l_t !]^^}}[)GMM&,AA !QRR9:$(MMV$;&%&94@L	
2LA&H'H{{m#(C4<<bc#??'+{{jjjj ,, ,,)$7#66"#55 ,,F$ `a'??+/+E+E'E{{jjjj ,, ,,)$7#66"#55 ,,F" 
		!h2I&I4<<>ABJJ:; mm'+'A'A#A;;****((((% 3..((
  
	%}}t}}!=!= 	
M	
= 	?zI^_c_j_j^klmmm'+'A'A#A;;****((((% 3;;((
  

	"tt||';WX']]LL#';;****((((% 32211((
" 

	"UV$ 
]]
LL
  
 	

 $(
 ;;
 **
 **
 ((
 ((
 &
 !4
  22
 
  11
  ((!
& 	\]a]v]v\w  xC  DH  DU  DU  CV  VW  X	
 '+'A'A#A!%!:!:**;;****((((% 3((
  
&M	$3	;x
& ^^HF
 F#- 
<2 zzQt}}0DDE 3 
<	;s   *:\
\$__main__r1  )F)r   TF)FF)FT)FTF)T),r[  r  r/   r   sysr   pathlibr   r  rJ   benchmark_helperr   rb  r`  r   r   r   r^  boolr<   rT   rd  rd   r   dictr   r   r   r   r   r   r  r!  r9  rB  rJ  ri  rz  r  __name__	traceback	Exceptionprint_exceptionexc_infor   r   r   <module>r     s    
 	  
     + ,+-;=541-
	 $#++		$&h  X\ <*3 * *\` *pt *Zoc os o oTW oqu o  HH H HVcg < FF Fn 111 1 	1
 !1 1 1 1 1 1 1 1n %#'  !	
 L $WW WN !222 2 	2
 !2 2 2 2 2 2 2 2D @@@ @ !	@
 @ @ @ @ @ @ @ @b !EEE E 	E
 !E E E E E E E E E  !Ej SSS S !	S
 S S S S S S S SF eee e !	e
 e e e e e e e ej BBB !B 	B
 B B B B B B BJm`JFZ z3	 
  3!!3<<>23s   2F; ;$G"!G"