
    7h                       S SK Jr  S SKrS SKrS SKrS SKJr  S SKJr  S SK	J
r
JrJr  S SKrS SKJr  SSKJr  SS	KJr  SS
KJr  \(       a  S SKJr  S SKJr  \R6                   " S S5      5       r\R6                   " S S\5      5       r\r\R6                   " S S5      5       r\R6                   " S S5      5       r \R6                   " S S\5      5       r!\R6                   " S S\5      5       r"\R6                   " S S\5      5       r#\R6                   " S S\ 5      5       r$ " S S\%5      r& " S S \&S!9r' " S" S#\'5      r( " S$ S%\'5      r) " S& S'\'5      r* " S( S)\'5      r+g)*    )annotationsN)partial)Lock)AnyCallableTYPE_CHECKING)
OrderedSet   )config)get_backend_num_stages)V)	GeneratorConfigc                  L    \ rS rSr% SrS\S'   S\S'   S\S'   S\S'   S\S'   S	rg
)
BaseConfig   z<
Base Gemm configuration used for most backends (CPU, CUDA)
intblock_mblock_nblock_k
num_stages	num_warps N__name__
__module____qualname____firstlineno____doc____annotations____static_attributes__r       ]/var/www/fran/franai/venv/lib/python3.13/site-packages/torch/_inductor/template_heuristics.pyr   r      s"     LLLONr#   r   c                  (    \ rS rSr% SrSrS\S'   Srg)
GemmConfig&   z7
Gemm configuration used for most backends (CPU, CUDA)
   r   group_mr   N)r   r   r   r   r    r)   r!   r"   r   r#   r$   r&   r&   &   s     GSr#   r&   c                  B    \ rS rSr% SrS\S'   S\S'   S\S'   S\S'   Srg	)

FlexConfig3   z
Base Config class for flex attention
- FlexAttn forward, backward and flex decode will use this

NOTE:
For flex_attn bwd block_m and block_n are reused for block_m1, block_m2, block_n1, block_n2

r   r   r   r   r   r   Nr   r   r#   r$   r+   r+   3   s     LLONr#   r+   c                  8    \ rS rSr% SrS\S'   S\S'   S\S'   Srg)	FlexDecodeConfigD   z 
Config class for flex decoding
r   r   r   r   r   Nr   r   r#   r$   r.   r.   D   s     LONr#   r.   c                  D    \ rS rSr% SrSrS\S'   SrS\S'   SrS\S	'   S
r	g)ROCmGemmConfigP   zF
ROCm subclass for GEMMs, with AMD backend specific tuneable kernargs
   r   matrix_instr_nonkdimr   waves_per_eu   kpackr   N
r   r   r   r   r    r4   r!   r5   r7   r"   r   r#   r$   r1   r1   P   '     !##"L#E3Nr#   r1   c                  D    \ rS rSr% SrSrS\S'   SrS\S'   SrS\S	'   S
r	g)ROCmConvConfig[   zE
ROCm subclass for Conv, with AMD backend specific tuneable kernargs
r3   r   r4   r   r5   r6   r7   r   Nr8   r   r#   r$   r;   r;   [   r9   r#   r;   c                  D    \ rS rSr% SrSrS\S'   SrS\S'   SrS\S'   S	r	g
)ROCmFlexConfigf   zI
ROCm subclass for FlexAttn, with AMD backend specific tuneable kernargs
r   r   r4   r5   r6   r7   r   Nr8   r   r#   r$   r>   r>   f   '     !"#!L#E3Nr#   r>   c                  D    \ rS rSr% SrSrS\S'   SrS\S'   SrS\S'   S	r	g
)ROCmFlexDecodeConfigq   zK
ROCm subclass for FlexDecode, with AMD backend specific tuneable kernargs
r   r   r4   r5   r6   r7   r   Nr8   r   r#   r$   rB   rB   q   r@   r#   rB   c                  h   ^  \ rS rSr% Sr0 rS\S'   \" 5       rS\S'           S	U 4S jjr	Sr
U =r$ )
BaseHeuristicSingleton|   z
Thread-safe implementation of single to be used in the config heuristic subclasses
to ensure heavy __init__ calls are not repeatedly run
zdict[type[Any], Any]
_instancesr   _lockc                   > U R                      X R                  ;  a  [        TU ]  5       nX0R                  U '   U R                  U    sS S S 5        $ ! , (       d  f       g = fN)rH   rG   super__call__)clsargskwargsinstance	__class__s       r$   rL   BaseHeuristicSingleton.__call__   sE     YY..( 7+-&.s#>>#&	 YYs   :A
A r   )rM   rE   rN   r   rO   r   returnBaseConfigHeuristic)r   r   r   r   r    rG   r!   r   rH   rL   r"   __classcell__rQ   s   @r$   rE   rE   |   sK    
 (*J$)&E4'#',/';>'	' 'r#   rE   c                  Z   \ rS rSrSrSS jr    SS jr                SS jr      SS jrSSS	 S
4                 S S jjr	        S!S jr
S"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jr  S"S jrS"S jrS"S jrS#S jrS#S jr      S$S jrSrg)%rT      zT
Base class for mm_configs, device specific triton kernels config inherit from here
c                   [        SSSSS5      [        SSSSS5      [        SSSSS	5      [        SSSSS	5      [        SSSSS5      [        SSSSS5      [        SSSSS5      [        SSSS
S	5      [        SSSSS5      [        SSSS
S5      [        SSSSS	5      [        SSSS
S5      [        SSSSS5      [        SSSS
S5      [        SSSSS	5      [        SSSSS	5      [        SSSS
S5      [        SSSS
S5      [        SSSSS	5      /U l        [        R                  " / SQS
S9 VVVVVVs/ s H1  u  pnS  H$  nS  H  nS  H  n[        XX4XV5      PM     M     M&     M3     snnnnnnU l        [        SSSS
S5      [        SSSSS5      [        SSSSS5      [        SSSS
S5      [        SSSSS5      [        SSSS
S	5      [        SSSSS	5      [        SSSSS5      [        SSSS
S	5      [        SSSSS5      /
U l        [        SSSSS5      [        SSSS
S5      [        SSSS
S5      [        SSSSS	5      [        SSSSS	5      [        SSSSS	5      [        SSSSS	5      [        SSSSS	5      [        SSSS
S	5      [        SSSS
S	5      [        SSSS
S	5      /U l        [        SSSS
S5      [        SSSSS	5      /U l        [        SSSS
S	5      [        SSSS
S	5      [        SSSS
S	5      [        SSSS
S5      [        SSSSS	5      [        SSSSS	5      [        SSSSS	5      [        SSSSS5      /U l        / [        SSSS
S	5      P[        SSSS
S	5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSS
S	5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSS
S5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      P[        SSSSS5      PU l	        [        SSSS
S	5      [        SSSS
S	5      [        SSSSS	5      [        SSSSS5      [        SSSS
S5      [        SSSSS5      [        SSSSS	5      [        SSSSS	5      [        SSSSS	5      /	U l
        [        SSSSS5      [        SSSS
S	5      [        SSSSS5      [        SSSSS	5      [        SSSSS	5      [        SSSSS	5      [        SSSSS	5      [        SSSSS	5      [        SSSSS5      [        SSSSS5      /
U l        [        SSSSS5      [        SSSSS5      [        SSSSS	5      [        SSSSS	5      [        SSSSS5      [        SSSSS	5      [        SSSSS	5      /U l        [        SSS
S5      [        SSS
S5      [        SSSS	5      [        SSS
S5      [        SSS
S5      /U l        S VVV	V
s/ s HH  nS  H>  nS  H4  n	US:  d  US:  a  SS	/OS/  H  n
X-  S:X  d  M  [        XxX5      PM     M6     M@     MJ     sn
n	nnU l        [#        SS
S5      [#        SS
S5      [#        SS
S5      /U l        S VVVVs/ s H-  nS  H#  nS  H  nS  H  n[        XXE5      PM     M     M%     M/     snnnnU l        S VVVVs/ s H7  nS  H-  nS  H#  nS  H  nX-  S:X  d  M  [        XxXE5      PM     M%     M/     M9     snnnnU l        S VVVs/ s H#  nS  H  nS  H  n[#        XU5      PM     M     M%     snnnU l        g s  snnnnnnf s  sn
n	nnf s  snnnnf s  snnnnf s  snnnf )N    r3   r
   r6         @      r(      r3   rZ   r]   r[      repeat)r
   r6   r_   r\   r^   r6   r\   r(   )r(   ra      i   )rZ   r]   rZ   r]   r[   )r
   r_   r\   r^   r   r3   rZ   r]   r[   )r&   
mm_configs	itertoolsproductexhaustive_configsextra_mm_configsint8_mm_configsmixed_mm_configspersistent_mm_configsscaled_mm_configsscaled_persistent_mm_configsmm_plus_mm_configs
ConvConfigconv_configsr+   flex_attn_fwd_autotune_configsflex_attn_bwd_autotune_configsr.   flex_decode_autotune_configs exhaustive_flex_attn_fwd_configs exhaustive_flex_attn_bwd_configsexhaustive_flex_decode_configs)selfBLOCK_MBLOCK_NBLOCK_Kr   r   r)   BLOCK1BLOCK2swr   s               r$   __init__BaseConfigHeuristic.__init__   s_   
 r2r1a(r2sAq)r2r1a(r2r1a(r2sAq)r2r1a(r2r1a(r2r1a(r2sAq)r3Aq)r3Aq)r3Aq)r3Q*sBAq)sBAq)sCQ*sCQ*sCQ*sCQ*'-
2 .7->->&q.5
 5
.)' .
&	 wiQ  R
 ' R .	 R.5
  r2r1a(r2r1a(r2r1a(r2sAq)sBAq)sBAq)sBQ*sCQ*sCQ*sCQ*3
 r2r1a(r3Aq)sBAq)r3Aq)sBAq)r2r1a(r2r1a(sCQ*r2r1a(sCa+sCa+2
 r3Q*r3Q*3
 sCQ*sCQ*sCa+sCa+sCQ*sCQ*sCQ*sCQ*	8
"b4
sCQ*b4
sCQ*b4
 sBAq)b4
 r3Aq)	b4

 sCQ*b4
 sBAq)b4
 r3Aq)b4
 sBAq)b4
 r2r1a(b4
 sCa+b4
 sBQ*b4
 r3Q*b4
 sCa+b4
 sBAq)b4
 r3Aq)b4
  sBAq)!b4
" r2r1a(#b4
$ r2r1a(%b4
& r2r1a('b4
( r3Aq))b4
* r3Aq)+b4
, r2r1a(-b4
. r2r1a(/b4
0 r3Aq)1b4
2 r3Aq)3b4
4 r2r1a(5b4
6 r2r1a(7b4
8 r3Aq)9b4
: r3Aq);b4
< r2r1a(=b4
> r2r1a(?b4
@ r3Aq)Ab4
B r3Aq)Cb4
D r2r1a(Eb4
F r2r1a(Gb4
H r3Aq)Ib4
J r3Aq)Kb4
L r2r1a(Mb4
N r2r1a(Ob4
P r3Aq)Qb4
R r3Aq)Sb4
T r2r1a(Ub4
V r2r1a(Wb4
X r3Aq)Yb4
Z r3Aq)[b4
\ r2r1a(]b4
^ r2r1a(_b4
` r3Aq)ab4
b r3Aq)cb4
d r2r1a(eb4
f r2r1a(gb4
h r3Aq)ib4
j r3Aq)kb4
l r2r1a(mb4
n r2r1a(ob4
p r3Aq)qb4
r r3Aq)sb4
t r2r1a(ub4
v r2r1a(wb4
x r3Aq)yb4
z r3Aq){b4
| r2r1a(}b4
~ r2r1a(b4
@ r3Aq)Ab4
B r3Aq)Cb4
D r2r1a(Eb4
F r2r1a(Gb4
H r3Aq)Ib4
J r3Aq)Kb4
L r2r1a(Mb4
N r2r1a(Ob4
P r3Aq)Qb4
R r3Aq)Sb4
T r2r1a(Ub4
V r2r1a(Wb4
X r3Aq)Yb4
Z r3Aq)[b4
\ r2r1a(]b4
^ r2r1a(_b4
` r3Aq)ab4
b r3Aq)cb4
d r2r1a(eb4
f r2r1a(gb4
h r3Aq)ib4
j r3Aq)kb4
l r2r1a(mb4
n r2r1a(ob4
p r3Aq)qb4
r r3Aq)sb4
t r2r1a(ub4
v r2r1a(wb4
x r3Aq)yb4
z r3Aq){b4
| r2r1a(}b4
~ r2r1a(b4
@ r3Aq)Ab4
B r3Aq)Cb4
J sCQ*sCa+sCa+sCa+sCa+sCa+sCa+sCa+sCQ*
?
) r2r1a(r2r1a(r2r1b)r2r1a(r2r1a(sCQ*r2r1a(r2sAq)r2r1a(r2r1a(5
 r3Aq)sBAq)tRQ*sCQ*r2r1a(r3Aq)sBAq)/
 sB1%sCA&sCA&r31%r2q!$A
+ #A
"'! &#3q!fQCG!# -Jvq, H	 - " -' -"A
+ RA&RA&S!Q'E
) -C
,(*
&		 w? '	 @ + @( @,C
- ,C
++*
&	!# >Jvz= '	 > + >+ >+C
- -G
,*
&	 W)< ' =* =,G
+W5
VA
C
C
G
s*   08p	/p)
<p)
4p1
p9
=p9
**qc              #  <  #    [        5       n[        R                  R                  nU H  n[	        UR
                  UR                  UR                  -  S-  5      nUR                  UR                  UR                  UR                  U4n[        USS5      nUb  Xg4-  nXb;  d  M~  Ub  [        U5      U:  d  M  UR                  U5        UR                  UR                  UR                  UR                  US.nUb  XxS'   U R                  " S0 UD6v   M     g7f)C
Finalizes configs after scaling, applying additional constraints.
ra   r)   N)r|   r}   r~   r   r   GROUP_Mr   )r	   r   test_configsmax_mm_configsminr   r   r   r   r   getattrlenaddtriton_config)	r{   configsusedr   confr   keyr)   rO   s	            r$   _finalize_mm_configs(BaseConfigHeuristic._finalize_mm_configs  s      -7L,,;;DDNNDLL4<<,G3,NOI $C dIt4G"z!&#d)n*D#||#||#||"&//!* &(/9%((2622? s   B!D'D;A!Dc                   SSK Jn  Sn	U(       a  SOSn
[        U" [        R                  R
                  R                  U[        R                  S95      U	5      n[        U" [        R                  R
                  R                  U[        R                  S95      U	5      n[        U" [        R                  R
                  R                  U[        R                  S95      U
5      n/ nU H  n[        R                  " U[        [        [        UR                  U-  5      U5      U	5      [        [        [        UR                  U-  5      U5      U	5      [        [        [        UR                  U-  5      U5      U
5      S9nU" UR                  UR                  UR                  5      (       a  M  UR!                  U5        M     U$ )zG
Scales and filters matrix multiplication configs based on input size.
r
   )next_power_of_2r3   rZ   )fallback)r   r   r   )runtime.runtime_utilsr   maxr   graphsizevars	size_hintr   unbacked_symint_fallbackdataclassesreplacer   r   r   r   r   append)r{   mnkr   scalehas_int8_tensorexcluder   min_block_sizemin_block_size_kscaled_configscscaled_configs                 r$   _scale_mm_configs%BaseConfigHeuristic._scale_mm_configs  s    	;!02b  **#<< +  
   **#<< +  
   **#<< +  
 A'//CAII$5 6:NKCAII$5 6:NKCAII$5 6:<LM	M %%}'<'<m>S>S  %%m4  r#   c                   SS K n/ nU H  nUR                  R                  5       nUR                  R                  U5      nUR                  nSn	[
        R                  " UR                  UR                  -  UR                  S-  -  5      n
UUR                  UR                  -  UR                  UR                  -  -   -  nXR                  -  U:  a  M  X:  a  M  UR                  U5        M     U$ )Nr      rZ   )torchcudacurrent_deviceget_device_propertiesshared_memory_per_block_optinmathceilr   r   r   r   r   r   )r{   r   
dtype_sizer   pruned_configsgemm_configdevicepropssm_availableNUM_REGacc_regsshared_mem_accums               r$   _prune_exhaustive_configs-BaseConfigHeuristic._prune_exhaustive_configs  s    
 	"KZZ..0FJJ44V<E >>LGyy##k&9&99[=R=RUW=WXH  *##k&9&99%%(;(;;<   "8"88<G#!!+.- #0 r#   Fr
   c                    g)NFr   )r   r   r   s      r$   <lambda>BaseConfigHeuristic.<lambda>5  s    5r#   r   c	           	         U R                  XX4XeU5      n	[        R                  S:X  a  US:  d   S5       eU R                  X5      n	U R	                  U	5      $ )N
EXHAUSTIVEr   z1dtype_size must be provided for exhaustive search)r   r   max_autotune_gemm_search_spacer   r   )
r{   r   r   r   r   r   r   r   r   r   s
             r$   preprocess_mm_configs)BaseConfigHeuristic.preprocess_mm_configs-  s`     //!eg
 00L@>V#VV>!;;NWN((88r#   c                    SSK Jn  U" X1US9$ )Nr   r   )r   r   )tritonr   )r{   r   r   rO   TritonConfigs        r$   r   !BaseConfigHeuristic.triton_configA  s     	2FYOOr#   c                >    [        U R                  U R                  S9$ Nr   )r   r   rh   r{   s    r$   get_mm_configs"BaseConfigHeuristic.get_mm_configsH  s    t114??KKr#   c                >    [        U R                  U R                  S9$ r   )r   r   rk   r   s    r$   get_exhaustive_mm_configs-BaseConfigHeuristic.get_exhaustive_mm_configsK  s    t114;R;RSSr#   c                >    [        U R                  U R                  S9$ r   )r   r   rl   r   s    r$   get_extra_mm_configs(BaseConfigHeuristic.get_extra_mm_configsN  s    t114;P;PQQr#   c                >    [        U R                  U R                  S9$ r   )r   r   rm   r   s    r$   get_int8_mm_configs'BaseConfigHeuristic.get_int8_mm_configsQ  s    t114;O;OPPr#   c                    [         R                  S:X  a  U R                  U R                  -   OU R                  n[	        U R
                  US9$ Nr   r   )r   r   rh   rn   r   r   )r{   rh   s     r$   get_mixed_mm_configs(BaseConfigHeuristic.get_mixed_mm_configsT  sG     44D OOd333 	
 t11:FFr#   c                    [         R                  S:X  a  U R                  OU R                  nU Vs/ s H  o"R                  S:w  d  M  UPM     nn[        U R                  US9$ s  snf )Nr   r6   r   )r   r   rk   ro   r   r   r   )r{   ro   r   s      r$   get_persistent_mm_configs-BaseConfigHeuristic.get_persistent_mm_configs\  sp     44D ##++ 	 "7!
!6v:J:Ja:OF!6 	 !
 t11;PQQ!
s   A$A$c                >    [        U R                  U R                  S9$ r   )r   r   rp   r   s    r$   get_scaled_mm_configs)BaseConfigHeuristic.get_scaled_mm_configsi  s    t114;Q;QRRr#   c                >    [        U R                  U R                  S9$ r   )r   r   rq   r   s    r$    get_scaled_persistent_mm_configs4BaseConfigHeuristic.get_scaled_persistent_mm_configsl  s"     &&0Q0Q
 	
r#   c                >    [        U R                  U R                  S9$ r   )r   r   rr   r   s    r$   get_mm_plus_mm_configs*BaseConfigHeuristic.get_mm_plus_mm_configss  s    t00$:Q:QRRr#   c                >    [        U R                  U R                  S9$ r   )r   r   rt   r   s    r$   get_conv_configs$BaseConfigHeuristic.get_conv_configsv  s    t114;L;LMMr#   c                   / n[         R                  (       a.  [         R                  S:X  a  U R                  $ X0R                  -  nUS::  a2  U[
        R                  :X  a  [        SSSS5      nO@[        SSSS5      nO1U[
        R                  :X  a  [        SSSS5      nO[        SSSS5      nXC;  a  UR                  U5        U$ )	Nr   ra   r]   r_   r\   r[   rZ   r3   )	r   max_autotunemax_autotune_flex_search_spacerx   ru   r   float32r+   r   r{   head_dimdtypeflex_attn_fwd_configsdefault_configs        r$   get_flex_attn_fwd_configs-BaseConfigHeuristic.get_flex_attn_fwd_configsz  s    2444D<<<!%H%HH!s?%!+BAq!9!+CQ!:%!+BAq!9!+BAq!96!((8$$r#   c                    / n[         R                  (       a.  [         R                  S:X  a  U R                  $ X0R                  -  n[        SSSS5      nXC;  a  UR                  U5        U$ )Nr   r3   r
   r\   )r   r   r   ry   rv   r+   r   r{   r   r   flex_attn_bwd_configsr   s        r$   get_flex_attn_bwd_configs-BaseConfigHeuristic.get_flex_attn_bwd_configs  sd    2444D<<<!%H%HH!#BAq16!((8$$r#   c                    / n[         R                  (       a.  [         R                  S:X  a  U R                  $ X0R                  -  n[        SSSS9nXC;  a  UR                  U5        U$ )Nr   r]   r
   r6   )r   r   r   )r   r   r   rz   rw   r.   r   r{   r   r   flex_decode_configsr   s        r$   get_flex_decode_configs+BaseConfigHeuristic.get_flex_decode_configs  sd     7944D:::#D#DD)"aP4&&~6""r#   )rt   rk   ry   rx   rz   rl   rv   ru   rw   rm   rn   rh   rr   ro   rp   rq   NrS   Noner   list[BaseConfig]rS   #Generator[TritonConfig, None, None])r   r   r   r   r   r   r   r  r   floatr   boolr   Callable[[int, int, int], bool]rS   r  )r   r  r   r   rS   r  )r   r   r   r   r   r   r   r  r   r  r   r   r   r  r   r   rS   r	  )r   r   r   r   rO   r   rS   r   rS   z,partial[Generator[TritonConfig, None, None]]r   r   r   r   rS   zlist[FlexConfig]r   r   r   r   rS   zlist[FlexDecodeConfig])r   r   r   r   r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r"   r   r#   r$   rT   rT      s   K
Z*3!*3 
-*3X<< < 	<
 "< < < 1< 
<| !    
	 P !&3H99 9 	9
 "9 9 9 19 9 
-9(PP*-P9<P	PLTRQGRS
	5
SN%0%##$'#	#r#   rT   )	metaclassc                      \ rS rSrSrg)CPUConfigHeuristici  r   N)r   r   r   r   r"   r   r#   r$   r  r    s    r#   r  c                  Z   ^  \ rS rSrSrSU 4S jjrS	S jrS	S jr      S
S jrSr	U =r
$ )CUDAConfigHeuristici  zI
Child class for CUDA device specific gemm/flex attention/conv/ configs.
c                v  > [         TU ]  5         [        R                  S4[	        SSSS5      [        R                  S4[	        SSSS5      [        R                  S4[	        SSSS5      [        R
                  S4[	        SSSS5      [        R
                  S4[	        SSSS5      [        R
                  S4[	        SSSS5      [        R                  S4[	        SSSS5      [        R                  S4[	        SSSS5      [        R                  S4[	        SSSS5      0	U l        [        R                  S4[	        SSSS5      [        R                  S4[	        SSSS5      [        R                  S4[	        SSSS5      [        R
                  S4[	        SSSS5      [        R
                  S4[	        SSSS5      [        R
                  S4[	        SSSS5      [        R                  S4[	        SSSS5      [        R                  S4[	        SSSS5      [        R                  S4[	        SSSS5      0	U l        g )	Nr]   r[   rZ   r_   r\   ra   r(   r3   )	rK   r   r   r   r+   bfloat16float16h100_default_flex_configa100_default_flex_config)r{   rQ   s    r$   r   CUDAConfigHeuristic.__init__  s    ]]BCQ!:]]C *RQ":]]C *RQ":^^R *S#q!"<^^S!:c2q!#<^^S!:b"a#;]]BCa!;]]C *S#q!"<]]C *RQ":
)
% ]]BCQ!:]]C *S"a";]]C *RQ":^^R *S"a";^^S!:c2q!#<^^S!:b"a#;]]BCQ!:]]C *S"a";]]C *RQ":
)
%r#   c                X   [         R                  R                  5       n/ n[        R                  (       a.  [        R
                  S:X  a  U R                  $ X@R                  -  nUS::  ay  U[         R                  :X  a  [        SSSS5      nO[        SSSS5      nUS:  a  U R                  R                  X!4U5      nOUUS:  a  U R                  R                  X!4U5      nO1U[         R                  :X  a  [        S	S
SS5      nO[        SS	SS5      nXT;  a  UR                  U5        U$ )Nr   ra   r]   r_   r\   r[   	   r   r(   r   rZ   r3   )r   r   get_device_capabilityr   r   r   rx   ru   r   r+   r  getr  r   )r{   r   r   
capabilityr   r   s         r$   r   -CUDAConfigHeuristic.get_flex_attn_fwd_configs  s   ZZ557
2444D<<<!%H%HH!s?%!+BAq!9!+CQ!:V#!%!>!>!B!B%~" v%!%!>!>!B!B%~" %!+BAq!9!+BAq!96!((8$$r#   c                |   [         R                  R                  5       n/ n[        R                  (       a.  [        R
                  S:X  a  U R                  $ X@R                  -  nU[         R                  :X  a  [        SSSS5      nOUS::  a?  US:  a9  US:X  a  [        SSSS5      nO~US	:X  a  [        SS	SS
5      nOi[        SSSS5      nOZUS:  aF  US:X  a  [        SS	SS5      nO?US	:X  a  US   S:X  a  SOSn[        SSUS5      nO[        SSSS5      nO[        SSSS5      nXT;  a  UR                  U5        U$ )Nr   r3   r
   r\   ra   r  r]   r_   r[   r(   r6   r  rZ   r   )r   r   r  r   r   r   ry   rv   r   r+   r   )r{   r   r   r!  r   r   r   s          r$   r   -CUDAConfigHeuristic.get_flex_attn_bwd_configs  s6   ZZ557
2444D<<<!%H%HH!EMM!'B15N_v!52~!+BAq!9S!+BQ!:!+BAq!96!2~!+BQ!:S",Q-1"4Q!
!+BJ!B!+BAq!9'B15N6!((8$$r#   c                   [         R                  R                  5       n[        SSS5      n/ n[        R
                  (       a.  [        R                  S:X  a  U R                  $ XPR                  -  nUS:  a6  US:  a"  U[         R                  :X  a  [        SSS5      nO[        SSS5      nO[        SSS5      nXE;  a  UR                  U5        U$ )Nr]   r
   r6   r   r  r[   r_   )r   r   r  r.   r   r   r   rz   rw   r   r   )r{   r   r   r!  r   r  s         r$   r  +CUDAConfigHeuristic.get_flex_decode_configs  s     ZZ557
)"a36844D:::#D#DD#~%5=="8!1"a!;!1"a!;-b!Q7N4&&~6""r#   )r  r  r  r  r  )r   r   r   r   r    r   r   r   r  r"   rU   rV   s   @r$   r  r    s;    
6%B"%H##$'#	# #r#   r  c                     ^  \ rS rSrSrSU 4S jjr      SS jr    SS jrSS jrSS jr	SS jr
SS	 jrSS
 jr  SS jrSS jrSS jrSS jrSS jr      SS jrSrU =r$ )ROCmConfigHeuristici8  zB
Child class for ROCm specific gemm/flex attention/conv/ configs.
c                N  > [         TU ]  5         [        5       U l        / [	        SSSU R                  SSSS9P[	        SSSU R                  SSS9P[	        SSSU R                  SSSS9P[	        SSS	U R                  SSS9P[	        SS
S
U R                  SSS9P[	        S
SS	U R                  SSSS9P[	        S
SSU R                  SSS9P[	        S
SS
U R                  SSS9P[	        S
SS
U R                  SSS9P[	        S
SS	U R                  SSS9P[	        S
S
SU R                  SSS9P[	        S
S
S
U R                  SSS9P[	        S
S
S	U R                  SSS9P[	        S
S
SU R                  SSS9P[	        S
S	SU R                  SSSS9P[	        S
S	SU R                  SSS9P[	        S
S	S
U R                  SSS9P[	        S
S	S	U R                  SSS9P[	        S	SSU R                  SSS9P[	        S	SS
U R                  SSS9P[	        S	S
SU R                  SSSS9P[	        S	S
S
U R                  SSS9P[	        S	S
S	U R                  SSS9P[	        S	S	SU R                  SSSS9P[	        S	S	SU R                  SSS9P[	        S	S	SU R                  SSSS9P[	        S	S	S
U R                  SSS9P[	        S	S	S
U R                  SSS9P[	        S	S	S	U R                  SSS9P[	        S	SSU R                  SSSS9P[	        S	SS
U R                  SSS9P[	        SS
S
U R                  SSS9P[	        SS	SU R                  SSSS9P[	        SS	SU R                  SSS9P[	        SS	S
U R                  SSS9P[	        SSS
U R                  SSS9PU l        [        R                  " / SQSS9 VVVVVVVVV	s
/ s Ha  u  pnSU R                  4  HH  nS  H>  nS  H4  nS  H*  nS  H   nS  H  n	[	        UUUUUUUUU	5	      PM     M"     M,     M6     M@     MJ     Mc     s
n	nnnnnnnnU l        [        R                  S
4[        S	SSS5      [        R                  S	4[        S	SSS5      [        R                  S4[        S
SSS5      [        R                  S
4[        S	S
SS5      [        R                  S	4[        S	S
SS5      [        R                  S4[        SS
SS5      [        R                  S
4[        S	S
SS5      [        R                  S	4[        S	S
SS5      [        R                  S4[        SS
SS5      0	U l        S V
VVs/ s H$  n
S  H  nS  H  n[        XSU5      PM     M     M&     snnn
U l        S V
VVVs/ s HI  n
S  H?  nU
S	:  d  US	:  a  SS/OS/  H$  nS  H  nX-  S:X  d  M  [        XSX5      PM     M&     MA     MK     snnnn
U l        [#        SSS5      [#        S
SS5      [#        S	SS5      [#        SSS5      [#        S
SS5      [#        S	SS5      /U l        S VVVVVVs/ s HP  nS  HF  nS  H<  nS  H2  nS  H(  nS['        SU-  5      4  H  n[        XXEX5      PM     M*     M4     M>     MH     MR     snnnnnnU l        S V
VVVVVs/ s HZ  n
S  HP  nS  HF  nS  H<  nS  H2  nS['        SU-  5      4  H  nX-  S:X  d  M  [        XXEX5      PM     M4     M>     MH     MR     M\     snnnnnn
U l        S VVVVVs/ s HE  nS  H;  nS  H1  nS  H'  nS['        SU-  5      4  H  n[#        XX]USS9PM     M)     M3     M=     MG     snnnnnU l        g s  s
n	nnnnnnnnf s  snnn
f s  snnnn
f s  snnnnnnf s  snnnnnn
f s  snnnnnf )Nr3   ra   r\   r6   )r)   r5   rZ   )r)   r(   r[   r]   r`   r_   rb   r
   )r\   r(   )r\   r(   r3   )r   r3   )r   r6   )r6   )r3   r]   r[   rg   )r3   rZ   r]   rf   r   )r
   r6   rd   )r7   )rK   r   r   default_num_stagesr1   rh   ri   rj   rk   r   r   r>   r  r  default_flex_configru   rv   rB   rw   r   rx   ry   rz   )r{   r|   r}   r~   r   r   r)   r4   r5   r7   r   r   r   mfmawpeur   rQ   s                   r$   r   ROCmConfigHeuristic.__init__=  sv	   "8":7-
BT44aQR7-
 2r3(?(?AN	7-

 BD33QPQ7-
 2r3(?(?AN7-
 2r2t'>'>1M7-
 BT44aQR7-
 2r2t'>'>1M7-
 2r2t'>'>1M7-
 2r2t'>'>1M7-
  2r3(?(?AN!7-
" 2r2t'>'>1M#7-
$ 2r2t'>'>1M%7-
& 2r3(?(?BO'7-
( 2r3(?(?AN)7-
* CT44aQR+7-
0 2sB(?(?AN17-
2 2sB(?(?AN37-
4 2sC)@)@!QO57-
6 3B(?(?AN77-
8 3B(?(?AN97-
: RT44aQR;7-
@ 3B(?(?BOA7-
B 3C)@)@!QOC7-
D S"d55q"STE7-
J 3R)@)@!RPK7-
L S"d55q"STM7-
R 3R)@)@!RPS7-
T 3R)@)@!QOU7-
V 3S$*A*A1bQW7-
X S"d55q"STY7-
^ 3R)@)@!QO_7-
` 3B(?(?ANa7-
b S"d55q!RSc7-
h 3R)@)@!RPi7-
j 3R)@)@!QOk7-
l 3R)@)@!QOm7-
L .7->->&q.5
 5
.)'  !$"9"9:
#	%(/$ &' $
& '
$ !'%
" )0#
  &!
 $
 ;
.5
 5
0 ]]BRA!>]]C .b!Q"?]]C .RA">^^R .b!Q"?^^S!>#r1a#@^^S!>"b!Q#?]]BRA!>]]C .b!Q"?]]C .RA">
$
  (A
'+ 61a0  1+ 1'A
+ 'A
&' &#3q!fQCG!# 7N61a6  	 7 H 7' 7&A
+ !Q* Q* a+ Q* Q* a+E
) -C
 C
,($
&	CY/0 7ZDO 1 P
   P '	 P % P( P,C
- ,	C
 	C
++$
&	CY/0!# NN6:$M 1 N
   N '	 N % N+ N+	C
- -G
,$
&	CY/0 !itSTU
 1 V  	 V ' V$ V,G
+c5
 5
FA
A
$C
	C
G
s3   0A(].+];?/^
2^
2A^
8^%^A^c                :    U H  nU R                   Ul        M     U$ rJ   )r*  r   )r{   r   new_num_stagesr   s       r$   _filter_configs#ROCmConfigHeuristic._filter_configs  s     
 A22AL r#   c           
   #  h  #    [        5       n[        R                  R                  nU GH  n[	        UR
                  UR                  UR                  -  S-  5      Ul        [        USS5      n[        USS5      n[        USS5      nUS:w  a(  UR                  U-  S:w  d  UR                  U-  S:w  a  M  UR                  UR                  UR                  UR                  UR
                  UUU4n[        USS	5      n	U	b  X4-  nUS:w  a  [        S
UR
                  -  5      nX;  d  GM  Ub  [        U5      U:  d  GM  UR                  U5        UR                  UR                  UR                  UR                  UR
                  UUUS.n
U	b  XS'   U R                  " S0 U
D6v   GM     g	7f)r   ra   r4   r3   r5   r   r7   r6   r)   Nr(   )r|   r}   r~   r   r   r4   r5   r7   r   r   )r	   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r{   r   r   r   r   r4   r5   r7   r   r)   rO   s              r$   r   (ROCmConfigHeuristic._finalize_mm_configs  s     -7L,,;;D 1LPS1STDN $+41G#L "4;LD'1-E#q(33q8<<"66!;  $	$C dIt4G"z!q "1#67&#d)n*D#||#||#||"&//!%,@$0"	 &(/9%((2622i s   D'F2.F2A/F2c                v    U R                  U R                  U R                  5      n[        U R                  US9$ r   )r1  rl   r*  r   r   r{   filtered_configss     r$   r   (ROCmConfigHeuristic.get_extra_mm_configs   s:    //!!4#:#:
 t11;KLLr#   c                v    U R                  U R                  U R                  5      n[        U R                  US9$ r   )r1  rm   r*  r   r   r6  s     r$   r   'ROCmConfigHeuristic.get_int8_mm_configs&  s:    //  $"9"9
 t11;KLLr#   c                    [         R                  S:X  a  U R                  U R                  -   OU R                  nU R	                  XR
                  5      n[        U R                  US9$ r   )r   r   rh   rn   r1  r*  r   r   )r{   rh   r7  s      r$   r   (ROCmConfigHeuristic.get_mixed_mm_configs,  s_     44D OOd333 	
  //
<S<STt11;KLLr#   c                v    U R                  U R                  U R                  5      n[        U R                  US9$ r   )r1  ro   r*  r   r   r6  s     r$   r   -ROCmConfigHeuristic.get_persistent_mm_configs5  s:    //&&(?(?
 t11;KLLr#   c                v    U R                  U R                  U R                  5      n[        U R                  US9$ r   )r1  rp   r*  r   r   r6  s     r$   r   )ROCmConfigHeuristic.get_scaled_mm_configs;  s:    //""D$;$;
 t11;KLLr#   c                v    U R                  U R                  U R                  5      n[        U R                  US9$ r   )r1  rq   r*  r   r   r6  s     r$   r   4ROCmConfigHeuristic.get_scaled_persistent_mm_configsA  s<      //--t/F/F
 t11;KLLr#   c                b    U R                  U R                  S5      n[        U R                  US9$ )Nr
   r   )r1  rr   r   r   r6  s     r$   r   *ROCmConfigHeuristic.get_mm_plus_mm_configsI  s/    //0G0GKt00:JKKr#   c                v    U R                  U R                  U R                  5      n[        U R                  US9$ r   )r1  rt   r*  r   r   r6  s     r$   r   $ROCmConfigHeuristic.get_conv_configsM  s:    //t66
 t11;KLLr#   c                   / n[         R                  (       a.  [         R                  S:X  a  U R                  $ X0R                  -  nUS::  aO  U[
        R                  :X  a  [        SSSS5      nO[        SSSS5      nU R                  R                  X!4U5      nO1U[
        R                  :X  a  [        SS	SS5      nO[        SSSS5      nXC;  a  UR                  U5        U$ )
Nr   ra   r]   r
   r\   r[   r(   rZ   r3   )r   r   r   rx   ru   r   r   r>   r+  r   r   r   s        r$   r   -ROCmConfigHeuristic.get_flex_attn_fwd_configsS  s    2444D<<<!%H%HH!s?%!/B1!=!/RA!>!5599!>N %!/B1!=!/B1!=6!((8$$r#   c                   / n[         R                  (       a.  [         R                  S:X  a  U R                  $ X0R                  -  nU[
        R                  :X  a  [        SSSS5      nOMUS::  a9  US:X  a  [        SSSS5      nO2US:X  a  [        SSSS5      nO[        SSSS5      nO[        SSSS5      nXC;  a  UR                  U5        U$ )	Nr   r3   r
   r\   ra   r]   r[   r(   )	r   r   r   ry   rv   r   r   r>   r   r   s        r$   r   -ROCmConfigHeuristic.get_flex_attn_bwd_configsn  s    2444D<<<!%H%HH!EMM!+BAq9N_2~!/B1!=S!/CA!>!/B1!=+BAq9N6!((8$$r#   c                    / n[         R                  (       a.  [         R                  S:X  a  U R                  $ X0R                  -  n[        SSS5      nXC;  a  UR                  U5        U$ )Nr   r]   r
   r\   )r   r   r   rz   rw   rB   r   r  s        r$   r  +ROCmConfigHeuristic.get_flex_decode_configs  sd     7944D:::#D#DD-b!Q74&&~6""r#   )
r+  r*  rk   ry   rx   rz   rv   ru   rw   rh   r  )r   r  r0  r   rS   r  r  r  r  r  )r   r   r   r   r    r   r1  r   r   r   r   r   r   r   r   r   r   r   r  r"   rU   rV   s   @r$   r(  r(  8  s    W
r'9<	?3!?3 
-?3BMMMMMM	5MLM%6%2##$'#	# #r#   r(  c                      \ rS rSrSrSrg)XPUConfigHeuristici  z5
Placeholder child class for XPU specific overrides.
r   N)r   r   r   r   r    r"   r   r#   r$   rN  rN    s    r#   rN  ),
__future__r   r   ri   r   	functoolsr   	threadingr   typingr   r   r   r   torch.utils._ordered_setr	    r   utilsr   virtualizedr   collections.abcr   r   r   r   	dataclassr   r&   rs   r+   r.   r1   r;   r>   rB   typerE   rT   r  r  r(  rN  r   r#   r$   <module>rZ     s   "      / /  /  )  )- 	 	 	    
        Z   Z   Z   +  'T '&a#$: a#H	, 	~#- ~#B^#- ^#B, r#   