
    8hu                   r   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKJr  S SKJrJrJrJrJrJrJr  S SKJr  S SKrS SKrS SKrS SKJr  S SKJr  S SKJr  S S	KJ r   S S
K!J"r"J#r#J$r$  S SK%J&r&J'r'J(r(J)r)  SSK*J+r+  SSK,J-r-J.r.J/r/  SSK0J1r1  SSK2J3r3  SSK4J5r5J6r6J7r7  \(       a  SSK.J8r8  SSK9J:r:  SSK;J<r<J=r=  SSK/J>r>J?r?J@r@  SSKAJBrBJCrCJDrDJErEJFrFJGrGJHrHJIrIJJrJJKrKJLrL  SSKMJNrNJOrOJPrP  SSKQJRrR  SSKSJTrTJUrUJVrVJWrW  SSKXJYrY  SSKZJ[r[J\r\J]r]J^r^J_r_  \(       a  S SK`JaraJbrbJcrc  S SKJdrd  \R                  " \f5      rg\R                  R                  \fS 5      rj\R                  R                  \fS!5      rk\R                  R                  \fS"5      rl\W" 5       R                  rn\ " / S#Q5      roS8S9S$ jjrp\R                   " S% S&5      5       rr " S' S(\r5      rs " S) S*\r5      rtS:S+ jru\" S,\T\TS-9rv " S. S/\V\v   \\v   5      rw " S0 S1\?5      rx\R                  " S2S39 " S4 S55      5       ry " S6 S7\z5      r{g);    )annotationsN)Counter)AnyCallableGenericno_type_checkOptionalTYPE_CHECKINGUnion)TypeVar)analyze_memory_coalescing)free_unbacked_symbols)immutable_dict)
OrderedSet)FloorDivIdentityModularIndexing)free_symbol_is_type
prefix_strsymbol_is_typeSymT   )counters   )configir	scheduler)prologue_preserves_zero_mask)	code_hash)	MemoryDepStarDepWeakDep)IRNode)!indexing_dtype_strength_reduction)
green_textyellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)cache_on_selfexpr_fits_within_32bitget_dtype_sizeIndentedBufferPlaceholderprefix_is_reduction'set_kernel_post_grad_provenance_tracingsympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )BlockPatternMatcher)CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernel)DisableReductionEnableReductionNodeScheduleEntryNodeScheduleMarkerSIMDKernelFeatures)IterableIteratorSequence)CoalesceVarAnalysis
perf_hintsschedulefusion)zyxr0_r1_c                l    [         R                  R                  R                  R                  nUb  U$ U $ N)torch	_inductorr   triton	max_tiles)defaultrU   s     V/var/www/fran/franai/venv/lib/python3.13/site-packages/torch/_inductor/codegen/simd.pyget_max_tilesrX   Y   s-    &&--77I!-9:7:    c                    ^  \ rS rSrSr\R                  R                  \R                  R                  S.               S	U 4S jjjr\	\
\S
S j5       5       5       rSS jr\	\
\SS j5       5       5       rSrU =r$ )IterationRanges^   a  
Each range tree represents multiple sets of iteration indexing
in a single tiled dimension in the output kernel.

If you have two loops ranges one (4, 3, 2) and another (4, 6),
then the range tree will be:
        4 (i0)
    3 (i1)  6 (i3)
    2 (i2)
Where i0 is shared between both loops, but then the split into
different indexing vars.  All loop ranges must iterate over
the same number of elements.
)divisorlengthc                  > [         T
U ]  5         Xl        X l        X0l        X@l        XPl        Xpl        Xl        X`l	        Xl
        g rQ   )super__init__namevar_list
var_rangesnumelprefixr]   r^   kernelroot)selfrb   rc   rd   re   rf   rg   r]   r^   rh   	__class__s             rW   ra   IterationRanges.__init__n   s=     		 $
	rY   c                ,    [        U R                  5      $ rQ   )r/   rf   ri   s    rW   is_reductionIterationRanges.is_reduction   s     #4;;//rY   c                ,    [        U R                  5      $ rQ   )r1   rb   rm   s    rW   symbolIterationRanges.symbol   s    !$)),,rY   c                |    [         R                  " 5        VVs0 s H  u  pX!_M	     nnnX0R                     $ s  snnf rQ   )r   itemsrf   )ri   symtrf   prefix_to_symts       rW   ru   IterationRanges.symt   s;     <F;K;K;MN;M<4&,;MNkk** Os   8)	r]   rg   r^   rb   re   rf   rh   rc   rd   )rb   strrc   list[sympy.Symbol]rd   dict[sympy.Symbol, sympy.Expr]re   
sympy.Exprrf   rx   rg   
SIMDKernelrh   IterationRangesRootreturnNoner~   boolr~   zsympy.Symbol)r~   r   )__name__
__module____qualname____firstlineno____doc__sympySOnera   propertyr*   r   rn   rq   ru   __static_attributes____classcell__rj   s   @rW   r[   r[   ^   s    . ww{{ % 3	
    " 
 0 0   0- +   +rY   r[   c                     ^  \ rS rSrSr S                     SU 4S jjjrSS jrSS jrSS jrSS jr	    SS jr
SS	 jr    SS
 jrSrU =r$ )r}      z
Root of a iteration range tree that represents a single
tiled dimension in the output kernel. It contains multiple
sets of iteration represented with IterationRangesEntry.
c          
        > Uc  0 n[         TU ]  U/ 0 UUUU S9  X@l        0 U l        X`l        U(       a  U R
                  (       a  U	b   eXpl        Xl        Xl        Xl	        g )N)rb   rc   rd   re   rf   rg   rh   )
r`   ra   indexnodes	pid_cachern   is_loop
tensor_dimgrid_dimhas_zdim)ri   rb   re   rf   r   rg   r   r   r   r   r   rj   s              rW   ra   IterationRangesRoot.__init__   sx     I 	 	
 
=?
 *3 t00X5EFF$  rY   c                >    SU R                   < SU R                   S3$ )NzIterationRangesRoot(, z, ...))rb   re   rm   s    rW   __repr__IterationRangesRoot.__repr__   s    %dii]"TZZLGGrY   c                f    U R                   R                  5        H  nUR                  5         M     g rQ   )r   valuescache_clear)ri   nodes     rW   r   IterationRangesRoot.cache_clear   s%    JJ%%'D (rY   c                2    [        U R                   S35      $ )Nr   )r1   rf   rm   s    rW   	index_symIterationRangesRoot.index_sym   s    !T[[M"788rY   c                   [         R                  R                  R                  X-  U R                  5      (       a  [        U R                  5       U5      nO[        U R                  5       X5      nX0R                  ;  a  [        U R                   [        [         R                  R                  5       3UUUU 5      nU[         R                  R                  UR                  5       '   U R                   R#                  UR                  5       5        X R$                  UR                  5       '   X@R                  U'   U R                  U   $ )z6
Lookup a given RangeTreeEntry, creating it if needed
)r7   graphsizevarsstatically_known_equalsre   r   r   r   r   IterationRangesEntryrf   nextrg   iter_vars_countrange_tree_nodesrq   rc   appendrd   )ri   r]   r^   exprr   s        rW   lookupIterationRangesRoot.lookup   s     7733G4DdjjQQDNN,g6D"4>>#3WEDzz!';;-QXX%=%= >?@D 8<AHH%%dkkm4MM  /-3OODKKM*#JJtzz$rY   c                    [         R                  R                  n/ n[        U5       H'  nUR	                  U R                  X$5      5        X$-  nM)     / [        U5      Q$ rQ   )r   r   r   reversedr   r   )ri   lengthsr]   itervarsr^   s        rW   construct_entries%IterationRangesRoot.construct_entries   sT     ''++w'FOODKK89&G ( %(#$$rY   c                j    U R                  U5       Vs/ s H  o"R                  5       PM     sn$ s  snf rQ   )r   rq   )ri   r   es      rW   	constructIterationRangesRoot.construct   s+    $($:$:7$CD$Cq
$CDDDs   0c           
       ^^^	^
 SS jmUR                    Vs/ s H,  n[        R                  R                  R	                  U5      PM.     nnU Vs/ s H)  oD(       d  M  UR
                  U R
                  :X  d  M'  UPM+     nnUR                  U4S jS9  [        R                  R                  m/ m	/ m
UU	U
4S jnU H|  n[        R                  R                  R                  UR                  T5      (       d8  U" U R                  T[        UR                  T5      5      5        UR                  mU" U5        M~     [        R                  R                  R                  U R                   T5      (       d,  U" U R                  T[        U R                   T5      5      5        / [#        T	5      Q/ [#        T
5      Q4$ s  snf s  snf )z,Figure out vars from this tree used in indexc                   [         R                  R                  R                  U R                  [
        R                  S9n[         R                  R                  R                  U R                  [
        R                  S9S:H  nX(       + 4$ )z
Gets the key for sorting nodes. When two nodes have the
same divisor, the node with length as 1 should be handled
first so the current divisor is not changed after multiplied
node.length. Returns `not length_is_one_hint` for ascending
sort.
fallbackr8   )r7   r   r   	size_hintr]   r   unbacked_symint_fallbackr^   )rM   divisor_hintlength_is_one_hints      rW   get_sort_key8IterationRangesRoot.vars_and_sizes.<locals>.get_sort_key   s}     77++55		F$C$C 6 L   **HHv'F'F +    !"899rY   c                   > T" U 5      $ rQ    )rM   r   s    rW   <lambda>4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>  s	    arY   keyc                   > TR                  U R                  5       5        TR                  U R                  5        TU R                  -  mg rQ   )r   rq   r^   )r   r]   
index_varssizess    rW   add/IterationRangesRoot.vars_and_sizes.<locals>.add  s5    dkkm,LL%+GrY   )rM   r   r~   ztuple[int, bool])free_symbolsr7   rg   r   getrf   sortr   r   r   r   r   r   r]   r   r   re   r   )ri   r   sr   nr   r   r]   r   r   r   s          @@@@rW   vars_and_sizes"IterationRangesRoot.vars_and_sizes   sR   
	:& <A;M;MN;Ma**..q1;MN!CEqQ188t{{+BEC

0
1''++
	, D77##;;DLL'RRDKK$,,)HIJ,,I  ww77

GLLGXdjj'%BCD&*%&(:(5/(:::/ OCs   3F=
GG;G)r   r   r   r   r   r   r   rQ   )rb   rx   re   r{   rf   rx   r   intrg   r|   r   Optional[dict[str, str]]r   r   r   Optional[int]r   r   r   r   r~   r   r~   rx   r~   r   r   )r]   r{   r^   r{   r~   r   )r   list[sympy.Expr]r~   zlist[IterationRangesEntry])r   r   r~   ry   )r   r{   r~   z+tuple[list[sympy.Symbol], list[sympy.Expr]])r   r   r   r   r   ra   r   r   r   r   r   r   r   r   r   r   s   @rW   r}   r}      s     /3(!(! (! 	(!
 (! (! ,(! (! "(!  (! (! 
(! (!TH9 .%'%	#%E/;/;	4/; /;rY   r}   c                     ^  \ rS rSr            SU 4S jjrSS jrSS jrSS jrSS jrSS jr	SS jr
SS	 jrS
rU =r$ )r   i(  c                  > [         TU ]  UUR                  U-  UR                  UR                  UR
                  UUUR                  UR                  S9	  XPl        [        R                  " S 5      " U R                  5      U l        X@l        g )N)	rb   re   rc   rd   rf   r]   r^   rg   rh   )r`   ra   re   rc   rd   rf   rg   rh   parent	functools	lru_cache_codegencodegenr   )ri   rb   r]   r^   r   r   rj   s         rW   ra   IterationRangesEntry.__init__)  sx     	,,'__((==== 	 
	
  **40?	rY   c                    SU R                    SU R                   SU R                   SU R                   SU R                   S3$ )NzIterationRangesEntry(r   ))rb   r]   r^   r   rd   rm   s    rW   r   IterationRangesEntry.__repr__@  sH    &tyykDLL>DKK=PRSWS\S\R]]_`d`o`o_ppqrrrY   c                N   ^ U4S jU l         S U R                   l        TU l        g )Nc                    > T $ rQ   r   )rb   s   rW   r   /IterationRangesEntry.set_name.<locals>.<lambda>D  s    trY   c                     g rQ   r   r   rY   rW   r   r   E  s    4rY   )r   r   rb   )ri   rb   s    `rW   set_nameIterationRangesEntry.set_nameC  s    ##/ 	rY   c                8    U R                   R                  5         g rQ   )r   r   rm   s    rW   r    IterationRangesEntry.cache_clearH  s      "rY   c                X    [         R                  R                  U 5        U R                  $ rQ   )r7   rg   codegen_iteration_ranges_entryrb   rm   s    rW   r   IterationRangesEntry._codegenK  s    	//5yyrY   c                   / n[        U R                  [        R                  5      (       a  U$ [        U R                  [        [
        45      (       d   [        U R                  5      5       eU R                  R                  SS   H{  n[        U[        R                  [        R                  45      (       a  M4  UR                  n[        U5      S:  d  MQ  [        S U 5       5      (       d  Mj  UR                  U5        M}     U$ )Nr8   r   c              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7frQ   )r   r   SIZE.0r   s     rW   	<genexpr>8IterationRangesEntry.precomputed_args.<locals>.<genexpr>X  s!      ,:AQN1dii00's   '))
isinstancer   r   Symbolr   r   typeargsIntegerr   lenallr   )ri   precomputed_argsargsymbolss       rW   r  %IterationRangesEntry.precomputed_argsO  s    -/dii..##$))h%@AAR4		?RA99>>!"%CcEMM5<<#@AA**w<!# ,:A, ) ) %++C0 &  rY   c                ,    [        U R                  5      $ rQ   )hashrb   rm   s    rW   __hash__IterationRangesEntry.__hash__^  s    DIIrY   c                b    [        U[        5      (       d   eU R                  UR                  :H  $ rQ   )r   r   rb   )ri   others     rW   __eq__IterationRangesEntry.__eq__a  s)    %!56666yyEJJ&&rY   )r   r   rb   r   )rb   rx   r]   r{   r^   r{   r   r{   r   r[   r~   r   r   )rb   rx   r~   r   r   )r~   r   r~   r   )r  objectr~   r   )r   r   r   r   ra   r   r   r   r   r  r  r  r   r   r   s   @rW   r   r   (  sk      	
    
.s
# ' 'rY   r   c                    U [        S5      :X  a  gU [        S5      :X  a  g[        R                  " U 5      (       a  g[        U 5      $ )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)values    rW   constant_reprr  f  s<    e	%-		E		;rY   CSEVariableType)boundrV   c                  |  ^  \ rS rSr% Sr\rS\S'   S\S'   SrS\S'   S	\S
'       S9             S:U 4S jjjr	\
\\S;S j5       5       5       rS<S jrS=S jr\
S>S j5       rS?S jr            S@S jrSAS jrSBS jrSCS jrS?S jrS?S jrSDS jrS;S jrSES jrSFS jrS>S jrSGS jr      SHS jr      SHS jrSIS jr SJS  jr!\"      SKS! j5       r#\$\%RL                  RN                  4       SLS" jj5       r(\$\%RL                  RN                  4       SMS# jj5       r)    SNS$ jr*\$      SOS% j5       r+SPS& jr,SPS' jr-SQS( jr.    SGS) jr/SRS* jr0SSS+ jr1STS, jr2SUSVS- jjr3\4Rj                        SWS. j5       r6SXS/ jr7\"S0 5       r8S1 r9S2 r:S3 r;S4 r<S5 r=S6 r>SYS7 jr?S8r@U =rA$ )Zr|   is  zg
Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
zCallable[[sympy.Expr], str]sexprkexprFr   allow_block_ptrrx   kernel_namec                  >^  Uc  0 n[         T
T ]  5         UT l        UR                  5       T l        [        5       T l        [        5       T l        UR                  5        VVs0 s H/  u  pxU[        R                  R                  R                  U5      _M1     snnT l        / T l        0 T l        [         R"                  " 5       T l        UR'                  5       T l        Ub  UOT R+                  5       T l        UT l        Ub  UOT R1                  5       T l        T R5                  5       T l        S T l        [:        R<                  SU 4S jj5       n	U	T l        T RA                  U5        g s  snnf )Nc                   > [         R                  R                  R                  U TR	                  5       5      n TR
                   H  nTR                  X5      n M     TR                  U 5      $ rQ   )r7   r   r   simplify_with_rangesrd   range_treescombine_contiguous_dimscombine_modular_indexing_pairs)r   treeri   s     rW   simplify_indexing.SIMDKernel.__init__.<locals>.simplify_indexing  sY    GG$$99%ARSE((44UA ) 66u==rY   )r   r{   )!r`   ra   featuresget_mutations	mutationsr-   bodyindexing_codert   r7   r   r   simplifynumelsr"  r   	itertoolscountr   rn   inside_reduction should_use_cooperative_reductioncooperative_reductiontiling_scoresshould_use_persistent_reductionpersistent_reductionwant_no_x_dimno_x_dimr   r   cacher&  initialize_range_tree)ri   tilingr(  r   override_persistent_reductionoverride_cooperative_reductionr4  rf   valr&  rj   s   `         rW   ra   SIMDKernel.__init__}  sQ    I !//1"$	+-FLlln
FT{vFAGG$$--c22n
 79JL(0 ( 5 5 7 .9 +668 	"
 ?L -8 *557 	!
 **,(, 
	> 
	> "3""9-?
s   #6E*c                :    [        S U R                   5       5      $ )Nc              3  8   #    U  H  n[        U5      v   M     g 7frQ   )r/   )r   rf   s     rW   r   0SIMDKernel.num_reduction_dims.<locals>.<genexpr>  s     I[6&v..[   )sumr.  rm   s    rW   num_reduction_dimsSIMDKernel.num_reduction_dims  s     IT[[IIIrY   c                    [         erQ   NotImplementedError)ri   dtypes     rW   dtype_to_strSIMDKernel.dtype_to_str      !!rY   c                6    U R                   R                  5       $ rQ   )r(  select_index_dtyperm   s    rW   get_index_dtype_as_torch_dtype)SIMDKernel.get_index_dtype_as_torch_dtype  s    }}//11rY   c                @    U R                  U R                  5       5      $ rQ   )rK  rP  rm   s    rW   index_dtypeSIMDKernel.index_dtype  s      !D!D!FGGrY   c                    gNFr   rm   s    rW   r7  SIMDKernel.want_no_x_dim      rY   c                  ^ [        U4S j[         5       5      nU(       + =(       d    U(       + nS	S jn/ SQn	[        [        U	5      5      n
SS/nU(       a  UnOU(       a  U
nOX-   nU" X5      nU" U	[        5      n/ n[	        U5       H|  u  nn[        U5      nUR                  U5      nUR                  U5      nUc  UOUnUR                  [        U S3TU   UUU UU=(       a    U R                  (       + UUST;   S9
5        M~     U$ )
Nc              3  6   >#    U  H  oT;   d  M
  Uv   M     g 7frQ   r   )r   rf   r.  s     rW   r   3SIMDKernel.construct_range_trees.<locals>.<genexpr>  s      %
!-v61AFF   		c                d   ^ [        U4S jU  5       5       VVs0 s H  u  p#X2_M	     snn$ s  snnf )Nc              3  6   >#    U  H  oT;   d  M
  Uv   M     g 7frQ   r   )r   r>  masks     rW   r   OSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<genexpr>  s     2U#3PT33#r\  )	enumerate)seqr_  idxr>  s    `  rW   filtered_index_map<SIMDKernel.construct_range_trees.<locals>.filtered_index_map  s4    )22U#2U)U)UXS)U  s   ,)rM   rL   rK   rN   rO   r   rK   )r   r   r   r   r   )r~   zdict[Any, int])
r   all_prefixeslistr   ra  r/   r   r   r}   r6  )ri   r   r1  rn   r.  r8  active_prefixesno_r_dimrd  	grid_dimspointwise_tensor_dimsreduction_dimstensor_dimstensor_dim_mapgrid_dim_mapr"  irf   r   r   r   s       `                rW   construct_range_trees SIMDKernel.construct_range_trees  s*    % %
!-%
 
 (';|+;	
 $	 $Xi%8 9(K/K/@K ,KI))\B"?3IAv.v6L'++F3J#''/H!)AxE#he$6N'(J1J1J-J)% F] 4& rY   c                    U R                  UU R                  U R                  R                  5       U R                  U R
                  5      nU R                  R                  U5        g rQ   )rq  r1  r(  rn   r.  r8  r"  extend)ri   r   r"  s      rW   r:   SIMDKernel.initialize_range_tree  sR    00!!MM&&(KKMM
 	,rY   c                    g)zZ
Hook called right before codegen with every index that will be
used in the fused kernel.
Nr   )ri   indicess     rW   finalize_indexingSIMDKernel.finalize_indexing  s    rY   c                p    U R                   nSU l          U R                  XU5      X@l         $ ! X@l         f = frV  )r1  store)ri   rb   r   r  priors        rW   store_reductionSIMDKernel.store_reduction  s5    %% %	*::d51$)!E!s   - 5c                    grV  r   rm   s    rW   r2  +SIMDKernel.should_use_cooperative_reduction  rX  rY   c                    grV  r   rm   s    rW   r5  *SIMDKernel.should_use_persistent_reduction  rX  rY   c                t    [        [        R                  R                  S U R                   5       5      5      $ )Nc              3  T   #    U  H  oR                   R                  5       v   M      g 7frQ   )rd   rt   r   r%  s     rW   r   (SIMDKernel.var_ranges.<locals>.<genexpr>  s"      *4DD%%''4Ds   &()dictr/  chainfrom_iterabler"  rm   s    rW   rd   SIMDKernel.var_ranges  s4    OO)) *484D4D* 
 	
rY   c                :    [        S U R                   5       5      $ )Nc              3  P   #    U  H  n[        UR                  S L5      v   M     g 7frQ   )r   r   r  s     rW   r   0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>  s#     Q@P3td233@Ps   $&)rD  r"  rm   s    rW   triton_tensor_ndimSIMDKernel.triton_tensor_ndim  s    Q@P@PQQQrY   c                \    S/U R                  5       -  nSX!'   SSR                  U5       S3$ )Nr   :[r   ])r  join)ri   rp  r   s      rW   indexing_size_strSIMDKernel.indexing_size_str   s7    42244499U#$A&&rY   c                   S/U R                  5       -  nU R                   H_  nUR                  c  M  UR                  (       a  U R                  (       d  M6  UR
                  R                  5        S3XR                  '   Ma     U$ )N1BLOCK)r  r"  r   rn   r1  rf   upper)ri   r   r%  s      rW   dense_size_listSIMDKernel.dense_size_list%  sp    //11$$D&$$(=(=(=,0KK,=,=,?+@)Foo& % rY   c                L    U R                  5       nSSR                  U5       S3$ )Nr  r   r  )r  r  ri   r   s     rW   dense_size_strSIMDKernel.dense_size_str/  s)    $$&499U#$A&&rY   c                   [        U[        5      (       d  U$ UR                  S   nU R                  R	                  U5      =nc  U$ [        XUR                  05      n[        R                  R                  R                  U5      n[        UUR                  R                  5       UR                  R                  [        R                  R                   UR                  R"                  5      R%                  5       05      $ Nr   )r   r   r   r   r   r3   r   r7   r   r   r$  rh   r   r   r   r   r   re   rq   )ri   r   rM   	tree_node	new_indexs        rW   r$  )SIMDKernel.combine_modular_indexing_pairs3  s    %11LJJqM..22155I>Lu)..&9:	GG$$CCIN	((*INN,A,AGGKK!5!5-&(
 	
rY   c                    [         R                  R                  R                  U5      =n(       a  Uu  pE[	        U R                  XB5      U5      $ U R                  X5      $ rQ   )r7   r   r   expand_floor_divr   _combine_contiguous_dims)ri   r   r%  
expand_resr  denominators         rW   r#  "SIMDKernel.combine_contiguous_dimsE  sU     ))::5AA:A%/"ID99)JKXX00==rY   c                   [        U[        R                  [        R                  45      (       a  U$ UR	                  U5      u  p4[        U5      S::  a  U$ [        R                  R                  R                  X4[        U/X45      5      u  pVnXT:X  a  U$ UR                  U5      n[        U[        [        X6" U5      5      5      5      n	U	$ )z9
More aggressive simplification to merge contiguous dims
r8   )r   r   r   r   r   r   r7   r   r   _simplify_loopsr;   r   r3   r  zip)
ri   r   r%  r   r   	new_sizesreindex_prunenew_index_varsr  s
             rW   r  #SIMDKernel._combine_contiguous_dimsN  s     eemmU\\:;;L //6
u:?L%&WW%5%5%E%E7S&
"	F L	2ud3z7>;R+S&TU	rY   c                   ^ ^ T R                   S   R                  =(       d    T R                  m[        R                  U U4S j5       nU" 5       $ )Nc               3    >#    T R                   R                  5       (       d  T R                  (       a   eS v   g T(       a  T R                  5         ST l         S v   T(       a  T R                  5         ST l        g ! ST l        f = f7f)NFT)r(  rn   r1  codegen_body)ri   should_flushs   rW   ctx)SIMDKernel.disable_reduction.<locals>.ctxe  sn     ==--//0000 !!#$)D!-%%'(,%%s   AB	A= 5B	=	BB	)r"  r   r3  
contextlibcontextmanager)ri   r  r  s   ` @rW   disable_reductionSIMDKernel.disable_reductionb  sE    ''+33Qt7Q7Q		"	"	- 
#	-$ urY   c                    [        U5      [        U R                  5      :X  d   e[        XR                  5       VVs/ s H  u  p#UR                  U5      PM     snn$ s  snnf rQ   )r   r"  r  r   )ri   r   r^   rangess       rW   
set_rangesSIMDKernel.set_rangesz  s^    7|s4#3#34444 #&g/?/?"@
"@ V$"@
 	
 
s   Ac                  ^^^^ [        S U 5       5      (       a  U  Vs/ s H  n/ PM     sn/ 4$ [        R                  R                  mU  Vs/ s H  n/ PM     snmU  Vs/ s H  nTR	                  U5      PM     snm[
        R                  " 5       mS
UUUU4S jjn        SS jn/ nSnU GHo  n	/ n
U	 GHQ  nTR                  US5      (       a  U
R                  S 5        M/  U[        T5      :  aJ  TR                  TU   S5      (       a0  US-  nU[        T5      :  a  TR                  TU   S5      (       a  M0  US-   [        T5      :  az  TR                  UTU   5      (       a`  TR                  UTU   5      (       d  [        eTU   n[        UTU   5      nU
R                  U" UU" X5      U" US-   U5      5      5        GM  U[        T5      :  d  GM&  U
R                  [        R                  " U" X5      5      5        GMT     UR                  U
5        GMr     [        S T 5       5      (       d   ST S	U 35       eTU4$ s  snf s  snf s  snf )Nc              3  >   #    U  H  n[        U5      S :H  v   M     g7fr   N)r   )r   r^   s     rW   r   5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>  s     6gFs6{ags   c                   > TR                  U5      nTR                  TU    U5      (       d  [        e[        TU    U5      TU '   TU    R	                  U5        [        T5      $ rQ   )r-  statically_known_multiple_of	CantSplitr   r   r   )rp  r   
new_ranges	remainingsv	var_counts     rW   	add_range5SIMDKernel._split_iteration_ranges.<locals>.add_range  s]    ;;t$D229Q<FF#IaL$7IaLqM  &	?"rY   c                    ^ ^^ SUUU 4S jjnU$ )Nc                    > TU T   -  U T   -   $ rQ   r   )	flat_varsidx1idx2sizes    rW   getterISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getter  s    io-	$??rY   )r  r   r~   r{   r   )r  r  r  r  s   ``` rW   make_combined9SIMDKernel._split_iteration_ranges.<locals>.make_combined  s    @ @ MrY   r   r8   c                6    [         R                  R                  $ rQ   )r   r   Zero)_s    rW   r   4SIMDKernel._split_iteration_ranges.<locals>.<lambda>  s    EGGLLrY   c              3  z   #    U  H1  n[         R                  R                  R                  U5      S :H  v   M3     g7f)r8   Nr7   r   r   r   r   s     rW   r   r    s*     Iy!177##--a0A5ys   9;zfailed to set ranges  )rp  r   r   r{   r~   r   )r  r{   r  r   r  r   r~   z(Callable[[list[sympy.Expr]], sympy.Expr])r  r7   r   r   r-  r/  r0  r   r   r   statically_known_gtr  r  r   operator
itemgetter)groupsr   groupr  gr  r  return_getters_groupscurrent_grouplength_groupreturn_gettersr  size1size2r  r  r  r  s                 @@@@rW   _split_iteration_ranges"SIMDKernel._split_iteration_ranges  s    6g666$*+F5BF+R//WW:@-A&Qb&-A
-34VR[[^V4	OO%		# 	#		$'	/2	5	 !##LN$--dA66"))*@A#c)n49S9Sm,: :
 "Q&M $c)n49S9Sm,: : !1$s9~5":P:P)M2; ; ::i6  (%m4E$T9]+CDE"))%!%m;%ma&7? %s9~5&--$//	-0NOA %F "((8K $N IyIII 	
#I;ay9	
I 000K , .B4s   II"Ic                *   [         R                  R                  n[        US   5      S:X  af  UR	                  U[
        R                  R                  5      (       d7  UR	                  [        U5      [        US   5      U-  5      (       a  US   U/4$ U$ )z1Fill in the reduction numel of lengths if missingr8   r   )	r7   r   r   r   r   r   r   r   r2   )clsr  r   reduction_numelr   s        rW   prepare_split_iteration_lengths*SIMDKernel.prepare_split_iteration_lengths  s     77##wqz?a00%''++NN00f%gaj)O; 
 AJ 122rY   c                l    U R                  XU5      n U R                  X5        g! [         a     gf = fNTF)r  r  r  )r  r  r   r  s       rW   is_compatibleSIMDKernel.is_compatible  s>     55fW	''8 		s   & 
33c                X   U R                    Vs0 s H  o"R                  UR                  _M     nnU R                  (       d7  U H1  n[	        U5      (       d  M  [
        R                  R                  X4'   M3     / UR                  5       QnU R                  XQU R                  5      $ s  snf rQ   )r"  rf   re   r1  r/   r   r   r   r   map_kernel_groups_to_node_sizesr  )ri   r   rtr;  rf   r  s         rW   split_and_set_rangesSIMDKernel.split_and_set_ranges  s     150@0@A0@"))RXX%0@A$$ &v..%*WW[[FN ! $6==?#33FT__UU Bs    B'c           
     T   [        U5      [        U5      :X  a%  [        S [        X!5       5       5      (       a  U" U6 $ U R                  X5      u  pE/ [        R
                  R                  U" U6 5      QnU VVs/ s H  ow Vs/ s H
  o" U5      PM     snPM     snn$ s  snf s  snnf )aY  
We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

To do this we need to split up the iteration space of i0 into something like:
    for i1 in s0:
      for i2 in s1:
        i0 = i1*s1 + i2
        ....

This function matches and resplits lengths to the groups of
this kernel to enable tiled + non-tiled fusions.
c              3     #    U  H?  u  p[         R                  R                  R                  [	        U5      U-
  5      S :H  v   MA     g7fr  r7   r   r   r-  r2   )r   rM   r  s      rW   r   =SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<genexpr>  s=      /
, GG%%mA&6&:;q@,s   AA	)r   r  r  r  r/  r  r  )	r  r  r   r  r  r  r   fnsfns	            rW   r  *SIMDKernel.map_kernel_groups_to_node_sizes  s    & w<3v;&3 /
G,/
 ,
 ,
 w'',/,G,G,X)
LY__22:z3JKL8MN8M,"H,8MNN,Ns   :	B$BB$B$c                6    [        U[        R                  5      $ rQ   )r   r   TMPri   r   s     rW   is_indirect_indexingSIMDKernel.is_indirect_indexing  s    "5$((33rY   c                  ^ U R                  U5      (       a  gS/[        U R                  5      -  nUR                   Hn  nX0R                  ;  a  M  U R                  U   n[        UR                  [        5      (       d   eX$R                  R                  ==   UR                  -  ss'   Mp     [        R                  R                  R                  m[        U4S j[        X R                  R!                  5       5       5       5      $ )NFr8   c              3  J   >#    U  H  u  pT" U5      T" U5      :g  v   M     g 7frQ   r   )r   	idx_range
iter_ranger-  s      rW   r   ,SIMDKernel.is_broadcasted.<locals>.<genexpr>1  s*      
)P%	 Y8J#77)Ps    #)r  r   r.  r   r   r   r   r}   r   r^   r7   r   r   r-  anyr  r   )ri   r   index_numelsrq   entryr-  s        @rW   is_broadcastedSIMDKernel.is_broadcasted   s    $$U++sS--((F222))&1Eell,?@@@@++,<, ) 77##,, 
),\;;;M;M;O)P
 
 	
rY   c                    [        U[        5      (       a)  SSR                  [        U R                  U5      5       S3$ U R                  U R                  U5      5      $ )a`  
Convert an index expr to a string that can be used in output code.
e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

Index expressions often need to be passed in as arguments to the triton kernel.
Rename_indexing and codegen_indexing keep track of the needed indices and add
new parameters to the function signature.
r  r   r  )r   rg  r  mapindex_to_strr  rename_indexingr  s     rW   r  SIMDKernel.index_to_str6  sQ     eT""tyyT%6%6!>?@BBzz$..u566rY   c                   U R                  U5      n[        U[        R                  R                  R
                  5      n[        UR                  [        R                  5      5      (       d-  [        UR                  [        R                  5      5      (       a3  UR                  [        R                  R                  R
                  5      n[        UR                  [        R                  5      5      (       a  UR                  [        R                  5       Ho  nUR                  n[        U5      S:  d  M   [        S U 5       5      (       d  M9  U[        R                  R                  R                  U5      0n[        X5      nMq     U R                  U5      n[        U[         5      (       d  UOUR"                  S   nU R%                  U5      $ )Nr   c              3  v   #    U  H/  n[        U[        R                  [        R                  45      v   M1     g 7frQ   )r   r   r   PRECOMPUTED_SIZEr   s     rW   r   .SIMDKernel.prepare_indexing.<locals>.<genexpr>W  s0      ,$ #1tyy$2G2G&HII$s   79)r&  r3   r7   r   r   precomputed_replacementsr   atomsr   floorceilingsubsr   r  lookup_precomputed_sizer   r   r   codegen_indexing)ri   r   ar  replacements
simp_indexs         rW   prepare_indexingSIMDKernel.prepare_indexingC  sQ    &&u-5!''"2"2"K"KLu{{5;;'((CEMM0J,K,KJJqww//HHIE u{{5==)**[[/ ..w<!# ,$, ) ) %&qww'7'7'O'OPQ'R#SL&u;E 0 ++E2
 )X>>JJOOTUDV 	 $$Z00rY   c                    U R                    Vs/ s H(  oR                  (       a  U R                  (       d  M&  UPM*     sn$ s  snf rQ   )r"  rn   r1  )ri   ts     rW   active_range_treesSIMDKernel.active_range_treesi  s6    ''
'!~~AVAVA'
 	
 
s
   %AAc                8   [         R                  R                  R                  XR	                  5       5      n[        UR                  [        S9 H  nX R                  ;   d  M  0 nU R                  U   R                  5        H.  n[         R                  R                  R                  U5      X4'   M0     [        U5      S:  a5  [        U R                  U   R                  U5      U R                  U   l        U R                  U   R                  5         M     U$ )Nr   r   )r7   r   r   r!  rd   sortedr   rx   r   r  r!  r   r3   r   r   )ri   r   symr$  pss        rW   r"  SIMDKernel.codegen_indexingn  s    ww44T??;LM$++5C+++  "//4EEGB'(ww'7'7'O'OPR'SL$ H|$q(6@--c277$7D))#.3 %%c*224 6 rY   c                    [        S5      e)NzNYI: codegen_nan_checkrH  rm   s    rW   codegen_nan_checkSIMDKernel.codegen_nan_check  s    !":;;rY   c                    [        S5      e)NzNYI: call_kernelrH  )ri   rb   r   s      rW   call_kernelSIMDKernel.call_kernel  s    !"455rY   c              #     #    U R                   nU R                  nU(       a  [        R                  " X5      n[        R
                  " U5      nXl         X l         Uv   X0l         X@l        g! X0l         X@l        f = f7f)z:Context manager to add an additional mask to tl.load/storeN)
_load_mask_load_otherr5   logical_andr6   _unwrap)ri   r_  r  r|  	prior_vals        rW   
mask_loadsSIMDKernel.mask_loads  sj     
 $$	??4/D!!$' 	)J#O( $O(s   AA=A, A=,A::A=c                &   U R                   R                  5        VVs0 s H  u  p#X#R                  _M     nnn[        X5      n0 nU R                   H5  n[        UR                  5      n[        XXS05      [        XXS05      -
  Xh'   M7     U$ s  snnf )a  
This gets the stride of the index for each of the tiling variables
(technically, it does it at index 0)

For example, if
xindex = x0 + 512*x1 + 1024*r0
x0 = (xindex//512)
x1 = (xindex % 512)
r0 = rindex // 1024

this function would return
{xindex: 512, rindex: 1024}
r8   r   )r   rt   r   r3   r"  r1   rb   )	ri   r   kvindex_to_tile_indexesindex_in_tile_varsstrides
range_treer   s	            rW   get_strides_of_loadSIMDKernel.get_strides_of_load  s     8<7L7L7R7R7T U7TtqFF7T U'E**J":??3A#$6A?*"FC GJ +
  !Vs   Bc                d    [        U[        5      (       a  [        [        X5      5      $ U " U5      $ rQ   )r   tupler  )r  r  s     rW   _map_tuple_or_scalarSIMDKernel._map_tuple_or_scalar  s(    eU##R((%yrY   c           	     "   / n[        [        U R                  R                  R	                  5       5      5      nU R                  R                  5       u  p4  nU R                  R                  5       n[        R                  R                  R                  [        U R                  R	                  5       5      5      n[        U5       GH;  u  pxX;  a  UR                  S5        M  [        R                  R!                  U5      n	[        R                  R                  R                  U	5      n
X:  a  ["        [$           " 5       nSnXX    HT  n['        U[(        [*        45      (       a  UR-                  SU 35        US-  nM9  UR-                  UR.                  5        MV     [        U5      U-  nOU
n[        R                  R1                  U5      n[3        U5      nUR                  UU-  S[5        Xr:  5      -   -  5        GM>     [7        U5      $ )a  
Try the best to estimate the total size (in bytes) of the
kernel's inputs and outputs, which is used for estimating the memory
throughput of this kernel. This information is used for checking how
far we are from the peak memory bandwidth. It's important that
we want to avoid overestimating the sizes of the inputs and outputs,
because it can wrongfully give us a very large memory traffic value,
which may be even larger than the theoretical bandwidth and thus
become very misleading. This is particularly problematic for cases
where we slice some inputs. In those cases, we should only count
the size of the "slices" instead of the original inputs, because
only the slices contribute to the real memory traffic.
r   no_index_dep_r8   )r   r4   r   inplace_buffersr   python_argdefsr(  buf_accessesr7   r   r   r   r2   r.  ra  r   	get_numelr   r   r   r!   r"   r   r   	get_dtyper,   r   rD  )ri   nbytesninplace_argsr  	call_argsrP  	out_numelrp  r  	arg_numelbuf_sizerw  no_index_dep_countdepre   rJ  
dtype_sizes                    rW   estimate_kernel_num_bytes$SIMDKernel.estimate_kernel_num_bytes  s    F499#<#<#C#C#EFG!YY557a}}113 GG$$..}T[[=O=O=Q/RS		*FA &a ))#.Iww''11)<H# %S/+%&"',C!#'9::m4F3G$HI*a/*CII. - Gy0 GG%%c*E'.JMM%*,C8I4J0JKL9 +: 6{rY   c           	        [        U R                  R                  5      S:X  aG  [        U R                  R                  5      S:X  a$  [        U R                  R                  5      S:X  a  gU R                  R                  5       u  p#pESnU GHr  n[        R                  R                  U5      nU(       d  M,  UR                  5       n	[        U	R                  5      S:X  d  MW  [        U	R                   V
s/ s H  oS:X  d  M
  U
PM     sn
5      S:X  a  M  [        R                  " U	R                  5      nUc  UnM  Xk:w  d  M  [        SU S3SU S	U 3-   5      n[        R!                  U5        U Vs/ s Ht  n[        R                  R                  U5      (       aK  [        R                  " [        R                  R#                  U5      R                  5       R                  5      OSPMv     nnU Vs/ s H`  n[        R                  R                  U5      (       a7  [        R                  R#                  U5      R                  5       R                  OSPMb     nnU Vs/ s HE  nU[        R                  R$                  ;   a  S
O!U[        R                  R&                  ;   a  SOSPMG     nnU V
s/ s H  oR(                  PM     nn
[        SU SU SU 3SU SU S3-   5      n[        R!                  U5          g   [+        SU S35      n[        R!                  U5        gs  sn
f s  snf s  snf s  snf s  sn
f )zZ
Print message if the kernel have mixed layout inputs.
Only care about 4D tensor for now.
r8   r   N   r   zExpected stride order z, but found stride orderr  z for kernel 
GraphInputIntermediateBufferz  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r   r   input_buffersoutput_buffersrN  rO  r7   r   try_get_buffer
get_layoutr  r   get_stride_orderstrider&   logwarning
get_buffergraph_inputsname_to_bufferrb   r%   )ri   r  argdefsrU  
_signaturer  uniform_stride_orderarg_namebuflayoutrM   stride_ordermsgrb   stride_order_list	size_listsource_listargdef_namess                     rW   warn_mix_layoutSIMDKernel.warn_mix_layout  s    		''(A-DII,,-2DII--.!3
 ,0II,D,D,F)J#!H''((2C^^%F6;;1$6;;9;aq&;9:a?!226==A'/+7()9%01E0FF^_l^<}EFC KK$ %.) %.D 7711$77 ++GG..t4??AHH "	"
 %. & ) %.	! %.D 7711$77 **40;;=BB!" %.	  ! %.# %.D	  177#7#77 %  177#9#99 2!	"
 %.   # 5<#<GqFFGL#<%(nYK|\m[no&ykk]"MNC KK$a "b 3K=@TU
 	C[ :)!# $=s'   6	L(
L(
5A;L-6A'L2#AL75L<c                   [         R                  " XSU5      nSU l        [         R                  " U R                  R
                  U5      n[         R                  " X45      nSU l        [         R                  " X%5      n[         R                  " Xf5      n[         R                  " XSU5      n[        R                  " XXU45      $ )NrD  FT)r5   	reductionr1  
index_exprr(  r  truedivsubmulr6   r;  )	ri   rJ  r  sum_rnumelmeandxdx2m2s	            rW   welford_reduce_fallback"SIMDKernel.welford_reduce_fallback=  s    }}U5%8 % = =uE{{4( $WWU!ggbo]]54!!4V"455rY   c                    [         R                  " XSU5      n[         R                  " X#5      n[         R                  " U5      n[         R                  " XSU5      n[        R
                  " X645      $ )NmaxrD  )r5   r}  r  expr6   r;  )ri   rJ  r  vmaxr  r  vsums          rW    prepare_softmax_twopass_fallback+SIMDKernel.prepare_softmax_twopass_fallbackI  sT    }}U5%8gge"ggcl}}U5#6!!4,//rY   c                    [         erQ   rH  rm   s    rW   codegen_kernelSIMDKernel.codegen_kernelP  rM  rY   c                    g rQ   r   rm   s    rW   r  SIMDKernel.codegen_bodyS      rY   c                    g rQ   r   )ri   r  s     rW   r   )SIMDKernel.codegen_iteration_ranges_entryV  r  rY   )r8  r9  r+  r   r3  r(  r,  r1  r   r*  r8  r.  r6  r   r"  r&  r4  )NNNN)r;  dict[str, sympy.Expr]r(  rC   r   r   r<  Optional[bool]r=  r  r4  Optional[dict[str, sympy.Expr]]r~   r   r  )rJ  torch.dtyper~   rx   )r~   r  r   r   )r   r   r1  r   rn   r   r.  r  r8  r   r~   list[IterationRangesRoot])r   zdict[str, str]r~   r   )rw  Sequence[sympy.Expr]r~   r   )rb   rx   r   r{   r  r:   r~   r   )r~   rz   )rp  r   r~   rx   )r~   z	list[str])r   r{   r~   r{   )r   r{   r%  r}   r~   r{   )r~   z'contextlib.AbstractContextManager[None])r   r{   r~   ry   )r  Iterable[sympy.Expr]r   Sequence[Sequence[sympy.Expr]]r~   zStuple[list[list[sympy.Expr]], list[list[Callable[[list[sympy.Expr]], sympy.Expr]]]])r  r  r   r  r  r{   r~   r  )r  r  r   r  r  r{   r~   r   )r   r  r~   list[list[sympy.Expr]])r  r  r   r  r~   r  )r   r{   r~   r   )r   r{   r~   rx   )r~   r  )r   r{   r~   r{   r   rQ   )rb   rx   r   zOptional[IRNode]r~   r   )r_  zUnion[str, OpsWrapper]r  Union[int, float]r~   zIterator[str])r   r{   r~   rz   )r  r   )Br   r   r   r   r   pexprr  __annotations__r  ra   r   r*   r   rE  rK  rP  rS  r7  rq  r:  rx  r}  r2  r5  rd   r  r  r  r  r$  r#  r  r  r  staticmethodr  classmethodr   r   r   r  r  r  r  r  r  r  r&  r*  r"  r2  r5  r  r  r=  rF  rJ  r\  rz  r  r  r  r  r   r   r   r   s   @rW   r|   r|   s  sO    */E&.&&!OT! /38<9=9=/.%/. %/. ,	/.
 (6/. )7/. 7/. 
/. /.b J   J"2 H H5+5 5 	5
 &5 5 
#5n-*
R'
'
$>>':>	>':	(0
 L1$L1/ML1
L1 L1\ 
 ',ggkk	$ 0 $	
 
( & 
 ',ggkk	$ 0 $	
 
 
V5
V	
V O$O 0O
 
 O O84
,7$1$1 
$1L

"<6 )*)3D)	) )&0  
=~EN
60" rY   r|   c                     \ rS rSr% Sr\rS\S'   S rS r	\	r
\	rS r  S%S jr\      S&S	 j5       rS'S
 jr    S(S jrS rSS. S)S jjrS r S*           S+S jjrS r\\R2                  " S5      S,S j5       5       r\      S-S j5       r\      S.S j5       r\        S/S j5       r\  S0S j5       r\          S1S j5       r\        S2S j5       r \        S3S j5       r!\\"RF                  RH                  S4   S4S jj5       r%\\"RF                  RH                  S4   S5S jj5       r&S r'S6S  jr(S*S! jr)S" r*S# r+S$r,g)7SIMDSchedulingiZ  zc
Single Instruction Multiple Data parent class used for fusion across
multiple different backends.
z	type[Any]kernel_typec                &    [        S U 5       5      $ )Nc              3     #    U  H7  n[         R                  R                  R                  [	        U5      5      v   M9     g 7frQ   r  r   s     rW   r   *SIMDScheduling.group_fn.<locals>.<genexpr>c  s-     P%QQWW%%..}Q/?@@%s   ?A)rI  r  s     rW   group_fnSIMDScheduling.group_fnb  s    P%PPPrY   c                D	  ^^ [        U[        R                  5      (       d  [        U[        R                  5      (       a  [        R                  R                  X5      $ UR                  u  nu  pEUR                  u  nu  mm[        X5      nUR                  5       (       a3  UR                  5       (       d  UR                  5       (       a  U" S5        OGUR                  5       (       a2  UR                  5       (       d  UR                  5       (       a  U" S5        UR                  5       (       a;  UR                  5       (       a&  UT:H  =(       a    UT:H  nU(       d  U" SUTUT5        U$ UR                  5       (       Gd  UR                  5       (       Gd  UT:X  a  UT:X  d  UR                  5       (       d  U" SUTUT5        gUR                  5        Hk  nUR                  5       (       a    OUUR                  5       UR                  5       -  (       d  MB  UR                  u  nu  pXI:X  a  XZ:X  a  M_  U" SUU	UU
5          g   X4 H  nUR                  5       (       d  M    g   U R                  UR                  5       XE5      nU R                  UR                  5       XE5      nU R                  UR                  5       UR                  5       -   XE5      n[        R                  R                  (       a`  Sn[!        U5      S:  a)  [!        U5      S:  a  Xs=:H  =(       a    U:H  Os  nOX:H  nO[!        U5      S:  a  X:H  nU(       d  U" SUUU5        ggUR                  5       (       d  UR                  5       (       a  US	:X  a  TS	:w  d   eUTT-  :X  a  [#        UU4S
 jUR                  5        5       5      (       d	  U" S5        g[        R                  R$                  (       ag  UR                  5       (       dR  ['        U R                  UR                  5       U5      R)                  5       5      US	4TTS	44;   nU(       d  U" S5        U$ gUT:w  a  U" S5        UT:H  $ UR                  5       (       a  UR                  5       (       a   eU R+                  X!5      $ )z
Hook called by Scheduler to determine if the Triton backend
can fuse node1 and node2.  These nodes might already be
FusedSchedulerNodes.
z&Split scan cannot fuse with reductionsz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)z5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)Fz:numel/rnumel mismatch prologue mismatch (%s, %s), (%s, %s)Tr   ztiling mismatch (%s, %s, %s)r8   c              3  p   >#    U  H+  n[         R                  TT4UR                  5       5      v   M-     g 7frQ   )r|   r  
get_ranges)r   r   numel2rnumel2s     rW   r   *SIMDScheduling.can_fuse.<locals>.<genexpr>  s3      . ,,fg->OO.s   36z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ForeachKernelSchedulerNodecan_fuser  r)   is_split_scanrn   is_template	get_nodesused_buffer_namesget_buffer_namesselect_tilingr   rT    tiling_prevents_pointwise_fusionr   r   tiling_prevents_reduction_fusionrI  r   can_fuse_horizontal)ri   node1node2r  numel1rnumel1whyreduction_can_fuser   	pro_numel
pro_rnumelr   tiling1tiling2tiling3condis_reduction_tiling_validr  r  s                    @@rW   r  SIMDScheduling.can_fusee  s0    eYAABBj977G
 G
 77@@NN${{F${{FG%  )<)<)>)>!!##<=  ""5+>+>+@+@!!##<=E$6$6$8$8!'6!1!Hg6H%G &%!!##E,>,>,@,@f$G);((**O ! !& 1++--!  $557%:P:P:RR$59ZZ22I & 38M \ & ) ' * $)# !2& ^==?? $
 (():FLG(():FLG((!EOO$55vG }}==w<!#7|a'&<<W<&1\A%"-D6	 !!!##(:(:(<(<a<GqL00')) "__.   <= MMBB!--//05**5??+<fELLN1  !,1- 5:;4412V##!!##E,>,>,@,@@@''55rY   c           
       ^^^^^^^ / m[         [        R                     " 5       m[        5       m[        5       mS mUU4S jnUU4S jnU4S jnUUUU4S jn[        R                  UUUU4S j5       nUU4S jn	U H  n
U
T;   a  M  TR                  U
5        U" U
5      (       aT  U	" U
T5      (       a  U" 5           S S S 5        T(       a"  U" U
5      (       d  T=(       d    [        T5      mOS mU" U
5        M}  U" U
5      (       a#  U" 5          TR                  U
5        S S S 5        M  [        ST ST S	U
R                  S
    35      e   T$ ! , (       d  f       N= f! , (       d  f       M  = f)Nc                ~   > U R                   u  nu  p#UT:H  =(       a    UT:H  =(       d    UTT-  :H  =(       a    US:H  $ Nr8   r  r   r  
node_numelnode_rnumelre   r  s       rW   fits_in_main_body@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_body  sF    +,77(A(
%'AK6,A efn,A1ArY   c                `   > U R                   u  nu  p#UT:H  =(       a    US:H  =(       a    TS:g  $ r  r  r  s       rW   fits_outside_reductionESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reduction  s2    +,77(A(
&K;!+;K!KrY   c                d   > U R                   R                   H  nUR                  T;   d  M    g   gr  )read_writesreadsrb   )r   readcurrent_loop_buffer_usages     rW   expect_improved_memory_usageKSIMDScheduling.generate_node_schedule.<locals>.expect_improved_memory_usage  s,    ++99 99 , rY   c                  > TR                  U 5        TR                  U 5        TR                  U R                  R                   Vs/ s H  oR
                  PM     sn5        U R                  5       (       a  [        U [        R                  5      (       a|  [        U R                  [        R                  5      (       aS  [        U R                  R                  [        R                  5      (       d   TR                  U R                  5       5        g TR                  U R                  R                    Vs/ s H  oR
                  PM     sn5        g s  snf s  snf rQ   )r   r   updater  r  rb   rn   r   r   SchedulerNoder   r   ComputedBufferdataScanget_namewrites)r   rM   r  donenode_schedulenot_ready_yet_nodess     rW   schedule_node_in_loopDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop  s    HHQK  #%,,amm>Q>Q-R>Qff>Q-RS
   q)"9"9::qvvr'8'899"166;;88#''

5)00!--BVBV1WBVQ&&BV1WX .S 2Xs   E6Ec               3  b  >#    T(       a  TS   [         L a  TR                  5         OTR                  [        5        T(       a1  TR	                  T[        5        TR	                  TS-   [         5        S mS v   TR                  [         5        TR                  5         T R                  5         g 7f)Nr  r8   )r@   popr   r?   insertclear)r  maybe_split_indexr  r  s   rW   end_current_reduction_loopISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loop  s      r!2o!E!!#$$%56 $$%68HI$$%6%:OL$(!  1%%'%++-s   B,B/c                   > TS:X  a  gTU R                   -  (       d  gU(       a  [        US   [        [        45      (       a   e[	        T5      $ )Nr8   Fr  )	ancestorsr   r@   r?   r   )r   r  r  r  s     rW   #requires_closing_previous_reductionRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reduction#  sS    {&7 b!O5E#F* *   +,,rY   zunexpected group: (r   z) != r8   )
r   r   r'   r  r  r   r   r   rI  r  )ri   r   re   r  r  r  r  r  r  r  r   r  r  r  r  r  s     ``       @@@@@rW   generate_node_schedule%SIMDScheduling.generate_node_schedule  sT   #%)5568 0:|5?\!+/		L		Y 	Y" 
	"	"	. 
#	.	- Dt|HHTN &&6t]KK35 6 -5QRV5W5W(9(OS=O% )-%%d+'--/1!((. 21 *)%6(%

1O - 4 ' 65 21s   <EE!
E	!
E0	c                ^   UR                  5       n[        R                  R                  R                  R
                  (       a  [        U5      nOSn[        US S9R                  u  nu  pVU R                  X%U5      n[        R                  SU5        U R                  [        XuXc5      5      $ )z;
Given a set of pre-fused nodes, generate a Triton kernel.
Nc                4    [        U R                  5       5      $ rQ   r   rn   rM   s    rW   r   -SIMDScheduling.codegen_node.<locals>.<lambda>V  s    c!..:J6KrY   r   zSchedule:
 %s)r  rR   rS   r   rT   coalesce_tiling_analysisr   r  r  r  schedule_logdebugcodegen_node_schedulerC   )ri   r   r   coalesce_analysisr  re   r  r  s           rW   codegen_nodeSIMDScheduling.codegen_nodeI  s     04~~/???!!((AA 9$ ? $ ,KLRR?E33E&I+];))}VO
 	
rY   c                   [         R                  " [         R                  5      R                  n[	        U 5      (       d  gU Vs/ s H8  nUR                  5       (       d  M  UR                  5       R                  5       PM:     nn[        S U 5       5      (       d  g[        R                  R                  R                  X5        U H,  n[        R                  R                  R                  XR5        M.     gs  snf )NFc              3  8   #    U  H  n[        U5      v   M     g 7frQ   )r+   )r   r  s     rW   r   8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr>s  s     FID)$//IrC  T)rR   iinfoint32r  r+   has_tensor_outputrf  storage_sizer  r7   r   r   	guard_leq)re   buffersint_maxrr  	buf_sizesr  s         rW   can_use_32bit_indexing%SIMDScheduling.can_use_32bit_indexing_  s     ++ekk*..%e,, 
$$& ,CNN))+ 	 
 FIFFF 	
""52DGG&&t5 
s   C9!"C9c                N   UR                   nU R                  UUR                  UR                  UR                  5      u  p4U R                  UU/XS.5      nU H  nU R                  X&5        M     [        R                  " U5        U H  n[        R                  " U5         UR                  5       nS S S 5        U R                  WX&5      n[        R                  R                  (       a  [!        UU5        ["        R%                  SU5        Xl        [)        U5      Ul        M     A[+        U5      S:  a  [        U5      n	OUu  n	[        R                  " U	5         UR-                  5        H  n
U
R/                  5         M     S S S 5        U R1                  U5        U	R3                  U	R&                  5        [        R4                  (       a  U	R7                  5         [        R8                  (       a  U	R9                  US   R&                  5        [        R:                  =R<                  U	R<                  -  sl        [        R:                  =R>                  U	R>                  -  sl        [        R:                  R@                  RB                  (       a  [        RD                  (       a  US   RF                  RI                  5       nUR-                  5        H  n
U
RK                  5       nX;  a  M  U
RL                  c   eU
RL                  RO                  5       nUc  MH  [P        S   S==   S-  ss'   [        R:                  R@                  RS                  SURT                  < SU S	35        M     U RW                  5         g ! , (       d  f       GN= f! , (       d  f       GN= f)
N)r(  r4  z+Generating kernel code with kernel_name: %sr8   r   inductorintermediate_hookszrun_intermediate_hooks(r   r   ),r  get_tiling_and_scoresre   r  r  create_kernel_choices!codegen_node_schedule_with_kernelr>   merge_workspaces_inplacer7   set_kernel_handlerr  define_kernelr   traceenabledr0   ri  r  r  r   r   scheduler_nodesmark_runcodegen_commentr5  nan_assertsr2  rz  r   removed_buffersinplaced_to_removewrapper_codesupports_intermediate_hooksgenerate_intermediate_hooksr   live_output_buffersr  r   get_origin_noder   	writelinerb   free_buffers_in_scheduler)ri   kernel_featuresr  r;  tiling_scorekernelsrg   src_coder  final_kernelr   	live_outsrb   origin_nodes                 rW   r  $SIMDScheduling.codegen_node_schedule}  s   '55#99!!++--	 
 ,,H(H

 F22=I ,,W5F%%f-!002 .,,X}MK||##7! IIC[Q!,(2F   w<!&w/L%O\!!,/'779 : 0 	]+  !9!9:**,!!(()?)?@	<#?#??	""l&E&EE" GG  <<22  
;;=I'779}}(yy,,,"ii779*Z()=>!C>GG((221+2B2B1ERvQO : 	&&(k .-& 0/s   "N(N
N	
N$c                (    U R                   " U0 UD6/$ rQ   )r  )ri   r,  kernel_argskernel_kwargss       rW   r  $SIMDScheduling.create_kernel_choices  s'     
 	
rY   c           	     ^   U   [         R                  " 5       n0 nU H  nU[        L a!  UR                  UR	                  5       5        M-  U[
        L a  UR                  5         MH  UR                  5         UR                  UR                  5       5      nUR                  [        R                  UR                  R                  U5      R                  5       5      5        M     UR!                  UR#                  5       5        U H  nU[        L a!  UR                  UR	                  5       5        M-  U[
        L a  UR                  5         MH  [%        UR                  5        UR                  UR                  5       5      nUR'                  U5        M     S S S 5        g ! , (       d  f       g = frQ   )r  	ExitStackr?   enter_contextr  r@   closedecide_inplace_updater  r  r  r  fromkeys_bodyindexing_from_argsr   rx  keysr$   r   )ri   r  rg   stackall_indexingr   r   s          rW   r  0SIMDScheduling.codegen_node_schedule_with_kernel  s>   ((*EL &++''(@(@(BC_,KKM..0!'!<!<T__=N!OJ '' JJ99*ELLN & $$\%6%6%89 &++''(@(@(BC_,KKM 6djjA!'!<!<T__=N!OJLL, &- VVs   FF
F,Fonly_gen_src_codec                  UR                   u  nu  pgUS:X  d   eUR                  R                  UR                  5      u  p0 n
UR                  5       n/ nU H  nUR	                  5       nUR                  U5        X-  (       d  M/  [        U5      S:X  d   eX[        [        U5      5      '   UR                  R                  [        [        U5      5      5        / nM     [        U5      S:X  d   eU   U(       d  U/UQ H  nUR                  5         M     U	" 5       nUR                  S5         U H1  nUR                  UR                  UR                  5       5      5        M3     UR                   R#                  [%        5       5        SSS5        UR&                  R)                  5        GH@  u  nnSU S3nU
R+                  UR-                  5       / 5      =n(       d  M6  [/        S U 5       5      n[0        R2                  " SU(       + 5         UR                  U5         U H  n[        UR	                  5       5      S:X  aB  [        U5      S:X  a3  [5        U5      (       a#  U=R6                  UR	                  5       -  sl        UR                  UR                  UR                  5       5      5        M     UR                   R#                  [%        5       5        SSS5        SSS5        GMC     SSS5        [9        W[:        5      (       d!  UR=                  S	5        UR=                  S
SS9  [>        R@                  " U5         UR&                  RC                  5        H  nSU S3nUR=                  USS9  M     UR                  S5         [9        U[:        5      (       a  UnOUR=                  S5        URD                  nSSS5        / UQUPUQn[0        RF                  (       aH  URI                  5       S-  nURK                  5        SW SURM                  U5      RO                  5        3nU(       a  WsSSS5        $ U RQ                  WUU5      n[0        RR                  RT                  (       a  [W        UU5        SSS5        U RY                  W5        UR[                  WUR                  5        [>        R\                  =R^                  UR^                  -  sl/        [>        R\                  =R`                  UR`                  -  sl0        U Rc                  5         g! , (       d  f       GN= f! , (       d  f       GN^= f! , (       d  f       GM  = f! , (       d  f       GNn= f! , (       d  f       GN= f! , (       d  f       GN= f)zw
Codegen a triton template

If `only_gen_src_code` the src code will be returned instead of codegen'd into the wrapper
r8   r   z<STORE_OUTPUT>Nz<LOAD_INPUT_>c              3  @   #    U  H  oR                  5       v   M     g 7frQ   )can_codegen_without_upcasts)r   p_ns     rW   r   2SIMDScheduling.codegen_template.<locals>.<genexpr>!  s      5ESc7799^   ztriton.codegen_upcast_to_fp32z<DEF_KERNEL>z	<ARGDEFS>F)strictg    eArb  )2r  r   make_kernel_renderr  r  r   r   r   iterprologue_fused_inputsr   r   set_subgraph_bodyr   r  r  cse
invalidater   named_input_nodesrt   r   r  r  r   patchr   #prologue_fused_inputs_preserve_zeror   rx   finalize_hookr7   r  r@  codebenchmark_kernelr\  imports_for_benchmark_kernelcodegen_kernel_benchmarkgetvaluer  r  r  r0   r!  r5  r   r#  r$  r+  )ri   template_nodeepilogue_nodesprologue_nodesrE  r  _numelr  rg   renderbuf_name_to_prologue_grouptemplate_readsprologue_groupprologuenamesr   partial_code
input_namebuffersubgraph_namecan_codegen_without_upcastprologue_noder/  r  num_gbr  s                             rW   codegen_templateSIMDScheduling.codegen_template  s    ,11F{{&++>>}?Q?QR%'"&88:&H--/E!!(+%%5zQ&@N4U+<=,,00d5k1BC!# ' >"a'''$ +<^<DMMO = "8L))*:;*DLL!<!<T__=N!OP +

%%jl3 <
 '-&>&>&D&D&F"
F".zl! <%?%C%COO%r& >  25 5ES5 2.  7=W9W $55mD1?$'(F(F(H$IQ$N(+N(;q(@'CM'R'R(.(R(R,9,J,J,L)*(R !. 5 5$*$?$?(5(@(@(B%&!" 2@ #JJ11*,?! E  'G \ ,,,&&~6&&{5&A !!&) %66;;=
".zl! <**=*G > ))*:;lC00+H ../?@+00H < NnMmMnMM&&99;cA::<=Rj66v>GGIJL  !1 *)4 ,,X}fMK||##7{S; *> 	]+;(:(:;	6#9#99	""f&?&??"&&(_ <;& ED 3 Vt <; *)s   3<U/AT!
AU".UU"B:T3	U$UA	U<6U*A5U<>U<!
T0	+U3
U=U
U	U
U'*
U9	4U<<
Vc                    [         R                  R                  R                  [         R                  R                  R                  5       5        g rQ   )r7   r   r%  r*  
device_opssynchronizerm   s    rW   codegen_syncSIMDScheduling.codegen_synch  s-    	&&qww'9'9'E'E'GHrY   c           
        SSK Jn  U Vs/ s H  owR                  5       PM     nn0 0 p[        X5       Hl  u  p[	        US S9R
                  u  nu  pU R                  XU5      nU R                  UX5      nUUX4X'   UR                  U[        UX5      U(       + S9X'   Mn     UR                  UU UU	U
S9n[        R                  S[        U5      U Vs/ s H  n[        U5      PM     sn5        / nU GHC  nU Vs/ s H  owR                  5       PM     nnU" UUS9n[        UU5       H  u  pU R                  X   S	   UR                  X   5      5        X   nX   S	   nU(       dL  [         R"                  " U5         [$        R&                  " U5       H  nUR)                  5         M     S S S 5        [         R*                  =R,                  UR,                  -  sl        [         R*                  =R.                  UR.                  -  sl        M     UR1                  5       nUR3                  UUU45        GMF     U$ s  snf s  snf s  snf ! , (       d  f       N= f)
Nr8   )ComboKernelc                4    [        U R                  5       5      $ rQ   r  r  s    rW   r   ;SIMDScheduling.generate_combo_kernel_code.<locals>.<lambda>x      #ann>N:OrY   r   )r(  optimize_mask)r   triton_schedulingcustom_algorithm
kernel_mapnode_info_mapz1ComboKernels: %d nodes partitioned into %s groups)enable_autotunemixed_sizesr   )triton_combo_kernelrv  r  r  r  r  r  r  create_triton_kernelrC   horizontal_partitionri  r  r   r  create_sub_kernelr7   r  rB   
only_nodesr   r   r#  r$  r  r   )ri   subkernel_nodescustom_part_algorithmr  r  rE  rv  r   fused_node_listssubkernel_mapnode_schedule_mappnr   r  re   r  r  r;  
partitionspkernel_code_list
node_grouprg   	subkernelr/  s                            rW   generate_combo_kernel_code)SIMDScheduling.generate_combo_kernel_codek  sQ    	59HINN,I+-r(_?IB!$U0O!P!V!VA 77fMM''uEF$165$H! + @ @+M5I"-o !A !M @ !55!"2$+ 6 

 			? '(ZSVZ(	

 $J=GHZT 0ZH  /'F
 !-=>	66%)!,,,]->? *-	 1 5a 8(--i8$6$A$A-$PD MMO %Q 9 ''9+D+DD'**i.J.JJ* ? ,,.H##Xvz$BC- %.  c J. )  I 98s   H?I=I	 .I
Ic                P   UR                  5       nUR                  nUR                  n[        R                  S:  =(       d    [        R                  S:H  =(       a    UnU R                  X#XE5      nU H  u  pxn	U R                  Xq/U5      n
[        R                  R                  (       a  [        UR                  U
5        U R                  U/5        [        R                  SU
5        UR                  [        R                   R"                  U
5        M     U R%                  5         g )Nr8   z"ComboKernels: generated kernel %s.)get_subkernel_nodesuse_custom_partition_algor  r   combo_kernel_allow_mixed_sizesr  r  r  r  r0   snodesr!  ri  r  r5  r7   r   r%  r+  )ri   combo_kernel_noder  r  r  r  r  r/  rg   r  r  s              rW   codegen_combo_kernel#SIMDScheduling.codegen_combo_kernel  s    +??A 1 K K+;;;;a? 
11Q6P;P 	  ::O
 $4Ha,,X7JFSK||##7%,,k   "3!45II:KHqww33[A $4 	&&(rY       c           
       ^ ^^
 TS:H  nSU UU
4S jjnUR                  5       u  nm
[        U5      S::  a  [        T
5      S::  d  [        UT
-   5      (       a  / $ UR                  5       u  nm
U" UU(       a  UOT
UR                  U5      5      nU Vs/ s H=  n[	        T R                  UR                  UT5      UR                  UR                  S9PM?     n	nU	$ s  snf )Nr8   c                  > [        UR                  5      [        U5      :X  d   SUR                  < SU< 35       eUR                  UR                  /n[	        S [
        R                  R                  U5       5       5      (       d   e[
        R                  R                  U5       Vs/ s HF  nUR                  [        R                  R                  ;  d  M-  [        U[        5      (       d  MD  UPMH     nn[        UR                   Vs/ s H  oDR                  PM     sn5      nSS jn[        TR!                  U" U5      /U 5      SSS9/nU GH  n[        R                  R"                  R%                  UR&                  UR                  5      n	[        U	5      [        U5      :X  d   e U	R'                  S5      S-   n
U
[        U5      :X  a  M  [	        S	 XS
  5       5      (       a  M   U" US
U
 5      U" XS
 5      4n[        R                  R"                  R+                  [-        S [/        X5       5       5      5      nUR                  U;   a  US-  n[        R1                  US   5      (       a  US-  n[        R1                  US   5      (       a  US-  n[        R                  R"                  R+                  U[-        [
        R                  " UT5      5      -
  5      S:  d  GM  UR3                  [        TR!                  U" US
U
 5      U" XS
 5      /T5      UUR                  S95        GM     U$ s  snf s  snf ! [(         a     GM  f = f)z@
Compute tiling candidates by dividing up the iteration ranges.
zrw.range_vars=z ranges=c              3  N   #    U  H  n[        U[        [        45      v   M     g 7frQ   )r   r    r!   )r   rZ  s     rW   r   HSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>  s&      EC 3G 455Es   #%c                f    [         R                  R                  R                  [	        U 5      5      $ rQ   r  )r  s    rW   collapse_rangesNSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.collapse_ranges  s"    ww''00v1FGGrY   noner   )r;  rb   scorer8   c              3  *   #    U  H	  oS :H  v   M     g7fr  r   r   s     rW   r   r    s     ;?a6?s   Nc              3  :   #    U  H  u  pUS :w  d  M  Uv   M     g7fr  r   )r   r  rh  s      rW   r   r    s      "1EST1Es   	r   r;  r  rb   )r  r  r~   r{   )r   
range_varsr  r  r  r/  r  r  rb   r7   r   r#  r   r    r   CandidateTilingcreate_partial_tilingr   stride_hintsr   
ValueErrorr   r2   r  is_good_sizer   )is_pointwiser  rwdep_sourcesrZ  depswrite_namesr  tilingsrD  splittiled_groupsr  r  r  reduction_rangess                rW   tile_ranges5SIMDScheduling.candidate_tilings.<locals>.tile_ranges  s#    r}}%V4S8H	&6SS4 88RYY/K $??88E     %??88EEC88177#:#::  sI. E   %"))%D)3hh)%DEKH
  44(01<  G ''**77		2==Q7|s6{222
#MM!,q0EF+ ;76?;;; ! < $F6EN3#F6N3  ((22! "14V1E" 
 88{*QJE"//Q@@QJE"//Q@@QJE GG$$..ioofFV.W XX 
 NN'#&#<#<$3F6EN$C$3F6N$C!" !0$ #(!$
Q l N[ &E: " s0   ,,MM3MM1#MM
MMr  )r  r   r~   list[CandidateTiling])	r  r   r   "pointwise_or_reduction_read_writesr  complete_partial_tilingr;  r  rb   )r  r   re   r  r  r  pointwise_rangespartial_tilingsr;  full_tilingsr  s   `  `      @rW   candidate_tilings SIMDScheduling.candidate_tilings  s     '!+\	 \	| .2__->** !Q&$%*$%58H%HIII .2__->**% ,2B33LA
 *	
 * 22MM5/ ll[[ * 	 	
 	
s   ACc                    / SQ[        U5      * S nSS/S[        U5       n[        / [        X15      Q[        XB5      Q5      $ )z;
Create a tiling dict from pointwise and reduction splits.
)rK   rL   rM   NrN   rO   )r   r   r  )r  	pw_tilingreduction_tilingpw_prefixesreduction_prefixess        rW   create_tilingSIMDScheduling.create_tilingB  sT     &s9~o&78#U^,Cc2B.CDVc+)VC0B,UV
 	
rY   c                R    U R                  U(       a  UO/ U(       d  U5      $ / 5      $ rQ   )r  )r  r;  r  s      rW   r  $SIMDScheduling.create_partial_tilingO  s0       "F&F
 	
,.
 	
rY   c                    [        UR                  5       5      nSU;   nX#-  nU[        U5      -  /nU(       a  XG4OXt4nU R                  " U6 $ )zR
Given a tiling for only pointwise or reduction dimensions, adds the missing one.
rM   )rg  r   r2   r  )	r  r;  re   r  splitsr  total_numelmissing_tilingtiling_argss	            rW   r  &SIMDScheduling.complete_partial_tilingZ  s^     fmmo&f}-%f(==> )5V$>:R 	   +..rY   c           
        US:H  n[         [        [        [        R                  4      " 5       n[
        R                  " U5       GH  n[        U[        R                  5      (       d  M%  UR                  5       nU(       d  [        US   5      S:X  a  MP  Xt(       a  SOS   nU/n	UR                  R                  5        V
s/ s H7  n
[        U
[        5      (       d  M  [        U
R                  5      S:  d  M5  U
PM9     nn
U GH  n
/ U
R                  R!                  5       Qn[        R"                  R$                  n[&        R(                  R*                  n[-        U5       H&  u  nu  nnUU-  nUR/                  X5      (       d  M&    O   UR1                  X5      (       d  M  WS-   nU(       a  USU OUUS n/ nU H  u  nn[2        R4                  " U
R6                  U5      n[9        SUR;                  [<        5      UR;                  [>        5      -   [        U5      5      n[2        R@                  " UUUU5      nUb  US   OU/nURC                  U5        M     U Vs/ s HN  n[&        R(                  R*                  R1                  U[        R"                  R$                  5      (       a  ML  UPMP     nn[        U5      S:  d  GM  U	RE                  U5        GM     U	 H{  n[9        S[        U5      [G        S5      -
  5      nUS-   n[I        USU 5      nU4[K        UUS 5      -   nURM                  U RO                  U RQ                  UU5      UU5      5        M}     GM     [S        U[        SS9nU$ s  sn
f s  snf )z
Creates N-dimensional tiling candidates, attempting to simplify loads/stores
by tiling the kernel into higher dimensions.

Returns a list of tilings ranked by dimensionality.
r8   r   Nr   T)r   reverse)*r   r  rx   r   Exprr@   filterr   r   r  r  r   r  reads_and_writesr    r  rt   r   r   r7   r   r   ra  statically_known_geqr   r9   get_subexpr_involving_symbolr   r  r0  r   r   match_mod_div_block_exprrt  r   rX   r2   rI  r   r  r  r-  ) r  r  pointwise_numelr  r  r  r   node_rangesranges_to_tilenode_tilingsrZ  memory_depsall_var_rangespointwise_vars_numelr   pointwise_end_idxvarre   reduction_start_idxrd   index_tilingr   num_dimsmatch_resultdimsdimnode_tilingnum_leading_dimsfirst_trailing_dimcollapsed_leading_dimcollapsed_splitsranked_tilingss                                    rW   get_nd_tilingsSIMDScheduling.get_nd_tilingso  st    '!+T#uzz/235#**=9DdI$;$;<< //+KCA$71$< )lBN*+L  ++<<>>Cc9- 25cjj/A2E >  
 # "73::#3#3#5!6',ww{{$77++7@7P3%|U(E1(44,   8Q  77(   '8!&;# $ ##7$78'(;(<=   "",JC/LL		3E
  #H-O0LLN+ H $7#O#OsE8$L /;.F<?UGD ''-% #-.  , +77++CCCU +    |$q( ''5s #x  ,#&q#k*:]1=M*M#N %5%9"(5kBUCU6V(W%$9#;e 2 34? $  //112BLQ''  ,e :F  
 qn s   M7M79M7!AM<0M<c                  ^^^^^^^^^^^^ TR                   (       d  SOTR                   R                  mTR                  R                  mTR                  R                  mTR                  R
                  nT Vs/ s H  oeU   PM	     snmT Vs/ s H  oeU   PM	     snm[        R                  " [        T5      T:H  UUU4S j5        [        R                  " [        T5      T:H  UUU4S j5        0 m/ n   S       SUUUUUUUUU4	S jjjnUR                  U" SS9U" SS945        T(       a  UR                  U" T4SSS9U" SS945        TTR                  R                  5       -  n	U	 H   nUR                  U" U4SS9U" SS945        M"     [        S	S
9S	:X  a@  TS:X  a:  [        R                  " U	S5       H  n
UR                  U" U
SS9U" SS945        M!     / nU H\  u  u  pu  p[        U R!                  X5      [#        U5      [#        U5      -   S9nU R!                  X5      nUR                  UU45        M^     U R!                  T/T/5      nSmSmUU4S jn[%        UUS9 H  u  nnU R'                  TTTUR(                  5      (       a  [+        UR(                  5      TS:X  a  SOS-
  nU[        S	S
9:  aE  [,        R/                  SU[        R0                  R2                  R4                  R6                  5        M  UR(                  U4s  $ UR(                  U:X  d  M  UR(                  U4s  $    US4$ s  snf s  snf )zb
Generates a tiling, and a score of each tile according to each tile's coalesced memory accesses.
Nc                    > T ST ST  3$ Nr   r   )r  r  	pw_rangess   rW   r   8SIMDScheduling.compute_tiling_strategy.<locals>.<lambda>  s    ykO#4B}oFrY   c                    > T ST ST  3$ r  r   )r  
red_rangesr  s   rW   r   r    s    zl"_$5RGrY   Fc                  >	 U(       a  TOTnU(       a  TOTnU(       d  U(       a  U// 4$ / / 4$ [        U 5      X4nTR                  US5      =n(       a  U$ U(       a  TOTn/ n/ n	Sn
Sn[        Xs5       GH  u  pX;  a"  X-  n
TR                  R                  US5      nM-  U(       a  UT:X  a  TR                  nUc   eUR
                  n[        XR
                  5      nUR                  U
U-  5        U	R                  UR                  5        UR                  U5        U	R                  TR                  R                  US5      5        Sn
SnM  X-  n
UR                  U
5        U	R                  TR                  R                  US5      5        Sn
GM"     U
S:w  d  U(       a1  [        U5      S:X  a"  UR                  U
5        U	R                  U5        [        [        U5      5       HQ  n[        R                  R                  R                  UU   SS9n[        US5      n[!        U	U   U-  S-  5      U	U'   MS     X4TU'   X4$ )zE
Generate a tiling, and a tiling score, given vars to use as splits.
Nr8   r   r  r      )r  r   r  coalesced_by_varsuggested_splittiling_factorr   r   r  r   ranger7   r   r   r   minr   )vars_to_useuse_split_varr  r  target_numelr   outsplitting_varsr  split_scoresprodprev_var_coalesced_scorerA  v_range
var_tilingtile	remainderrp  r   all_iter_varsall_red_varsr  r  r  r  r  scored_sub_split
tiling_vars                      rW   process_node_varsASIMDScheduling.compute_tiling_strategy.<locals>.process_node_vars  s>    #/YJF.:?L)NB//8O$mBC&**355s5
.:]NFLD'($ ".9
'OD/@/Q/Q/U/U10,  Q*_!2!B!BJ%111%33D (2J2J KIMM$"23 ''
(8(89MM$' ''(9(J(J(N(NqRS(TUD/0,d###$5$F$F$J$J1a$PQ; :> qy\c&kQ.>d###$<= 3v;'GG$$..vay2.F1I"%l1o&9A&=">Q (
 &,$:S!))rY   T)r  )r  r  r   rV   r8   r   )r  gffffff?gGz?c                   > SnU S   R                   R                  5        H)  n[        R                  U5      (       d  UT-  nM$  UT-  nM+     U S   R                  * U-  $ )Ng      ?r   )r;  r   r  r  r  )r)  score_factor	tile_size"bad_size_additional_tiling_penaltygood_size_tiling_penaltys      rW   	score_mod9SIMDScheduling.compute_tiling_strategy.<locals>.score_mod  sa    LqT[[//1	&33I>>#/2T#TL#/2J#JL	 2 aDJJ;--rY   r   r   zmFound optimal tiling with %s tiles but torch._inductor.config.triton.max_tiles set to %s. Consider increasing)r   FF)r  ztuple[sympy.Expr, ...]r  r   r  r   r~   ztuple[list[int], list[int]])r  r  norm_read_writesr   reduce_varsrd   rR   _checkr2   r   r  r@  rX   r/  combinationsr  r  rD  r-  tiling_is_compatibler;  r   perf_hint_loginforS   r   rT   rU   )r  r  r  r  r  r  rA  score_splitr  overlapping_iter_varsr  r  pw_splitpw_score	red_split	red_score	candidater-  default_tilingr  cand
tiling_lenr	  r
  r  r  r  r  r  r  s    ````                 @@@@@@@@rW   compute_tiling_strategy&SIMDScheduling.compute_tiling_strategy  s    %44 "2266 	 *::EE(99EE"33>>(561AY6	)56AQi6
)$7F	
 	*%8G	
 DF  	
 35"'!&K	*/K	*K	* K	* )	K	* K	* K	*\ 	!t4!u5	
 %#T &59	 ->>CCEE 	 'A%qd>%59 ' #q(_-A(556KQO"")+DI)u=  P HJ<G8 X"89'!!(6(mc)n4I ,,XALNNI|45 =H **O+<>OP .3*#( 	. #)i"@D,''  !-o6JPQR
a 88!&&9"..55??	 {{L00 {{n,{{L00) #A, t##s 76s   <MM	c                `   ^^ [        T[        5      (       d   e[        UU4S jU 5       5      $ )Nc              3     >#    U  HW  n[        U[        R                  5      (       d  M$  [        R	                  TR                  5       UR                  5       TS 9v   MY     g7f))r  N)r   r   r  r|   r  r   r  )r   r   r  r;  s     rW   r   6SIMDScheduling.tiling_is_compatible.<locals>.<genexpr>  sR      
 &$	 7 78	J$$!2O %  &s
   #A"8A")r   r  r  )r  r  re   r  r;  s      ``rW   r  #SIMDScheduling.tiling_is_compatible  s4     &$'''' 
 &	
 
 	
rY   c                L    U H  nU R                  XX55      (       d  M  Us  $    g rQ   )r  )r  r  re   r  r  r;  s         rW   get_first_compatible_tiling*SIMDScheduling.get_first_compatible_tiling  s+     %F''oVV % rY   Nc                ,    U R                  XX45      S   $ r  )r  )r  r  re   r  r  s        rW   r  SIMDScheduling.select_tiling  s$     ((/

 	rY   c                    US:H  nU R                  U/U/5      n[        R                  R                  R                  R
                  (       a8  U(       a1  [        R                  R                  (       d  U R                  XX45      $ U(       d  [        R                  R                  (       a  [        SS9S::  a  [        R                  [        R                  ::  a  [        R                  " U5       Hq  n[        R                  R                  (       a  M$  [!        U R#                  XrU5      5      S:  d  ME  [        R%                  [&        R(                  " S5      5          US4$    US4$ [+        5       n[,        R.                  " 5       n	[        R                  " U5       Hl  nU R#                  XrU5       HS  n
U
R0                  U;   a  M  U
R0                  b  UR3                  U
R0                  5        X==   U
R4                  -  ss'   MU     Mn     U	R7                  5        V
Vs/ s H  u  pU
R8                  PM     nn
n[        SS9S:  aG  U(       a@        S
S jn[;        S[!        U5      5       H  nU" US   X   5      nUc  M  U/U-   n  O   [!        U5      S:  a  [        R%                  S	U5        [        R                  R                  (       a  U R=                  XU5      U-   nU R?                  XX<5      =n(       a  US4$ US4$ s  snn
f )z
Heuristics to decide how to tile kernels.
Currently, we tile based on stride-1 dimensions.

Returns:
    `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

r8   r   r  r   z
                                Reduction over non-contiguous dims.
                                Consider setting config.triton.tile_reductions to True.
                                Nr   c                :   U S   U R                  SS5      p2US   UR                  SS5      pT[        X5/5      (       d/  [        R                  R                  R                  X5-
  5      S:X  a  g [        R                  R                  R                  X5-
  5      S:  a  XE4X#4su  p#u  pE[        R                  R                  R                  X5-
  5      S:  d   e[        R                  R                  R                  X55      (       d  g U[        X55      UU S   S.nU$ )NrM   rL   r8   r   rN   )rK   rL   rM   rN   )r   r   r7   r   r   r   r  r   )tiling0r  a0a1b0b1
new_tilings          rW   convert_tiling_to_3dBSIMDScheduling.get_tiling_and_scores.<locals>.convert_tiling_to_3d0	  s     !w{{3':B w{{3':B *2(33ww''11"':a?77##--bg6:*,B8&HRhrww''11"':Q>>>ww''DDRLL !")"5>	
 "!rY   zpossibly bad tiling: %s)r5  r  r  r  r~   r  ) r  rR   rS   r   rT   r   prefer_nd_tilingr(  tile_reductionsrX   r  levelloggingWARNINGr@   r  r   r  r  textwrapdedentr   collectionsr   rb   r   r  most_commonr;  r  r  r/  )r  r  re   r  r  r  r%  r   
seen_namescandidate_tilescandidate_tilingr  r  r;  rp  new_3d_tilingr;  s                    rW   r  $SIMDScheduling.get_tiling_and_scores  s   " '!+ **E7_4EF OO""))BB!MM22..o  V]]%B%B}H
H ""goo5+22=AD"MM999 5 5d? STWXX%**$OO!$ !4'' B "4''&0l
4?4G4G4I#**=9D$'$9$9$$W #((J6%**6NN#3#8#8915E5K5KK1 %X : ,;+F+F+H7
+H'  ##+H 	 7

 #q(\"."9N"0"8 1c.12 4"1%~'8! !,&3_~%EN 3 ~"8.I ==))""=I ! 
 44/
 
6 
 4<t##7
s   !L
c                    g rQ   r   rm   s    rW   flushSIMDScheduling.flushe	  r  rY   c                    grV  r   rm   s    rW   ready_to_flushSIMDScheduling.ready_to_flushh	  rX  rY   c                   [        S U 5       5      (       d  [        US S9R                  u  nu  pEU R                  XU5      nU R	                  XdU5      nU R                  U[        XdU5      S9nU R                  Xh5        [        R                  " SU5         [        R                  " U5         UR                  5       n	S S S 5        S S S 5        OIUS   R                  U5      u  pn[        R                  " SU5         U R                  UUU
SS9n	S S S 5        W	R                  [!        ["        R$                  5      S	5      n	U	$ ! , (       d  f       N= f! , (       d  f       NJ= f! , (       d  f       N[= f)
Nc              3  @   #    U  H  oR                  5       v   M     g 7frQ   )r  )r   r   s     rW   r   ASIMDScheduling.generate_kernel_code_from_nodes.<locals>.<genexpr>l	  s     2Eq==??ErL  c                4    [        U R                  5       5      $ rQ   r  r  s    rW   r   @SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>m	  ry  rY   r   )r(  rY  r   TrD  triton_)r  r  r  r  r  r  rC   r  r   rU  r7   r  r  get_prologue_template_epiloguern  replacerx   r.   KERNEL_NAME)ri   r   rY  r  re   r  r  r;  rg   r/  re  templateepilogues                rW   generate_kernel_code_from_nodes.SIMDScheduling.generate_kernel_code_from_nodesk	  sU   2E222!$U0O!P!V!VA 77fMM''fEF%%+M&I & F 22=I/1AB$$V,!002 - CB
 ,18+R+R,(H 02BC00&*	 1  D ##C(?(?$@)L! -, CB DCs0   E/E E?E'
E	E
E$'
E5c                    g rQ   r   )ri   r  s     rW   r!  SIMDScheduling.codegen_comment	  r  rY   c                    [         erQ   rH  )ri   r/  r  rg   s       rW   r  SIMDScheduling.define_kernel	  rM  rY   r   )r   z<Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode])re   r{   r  zGIterable[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject, ir.IRNode]]r~   r   )r,  rC   )r,  rC   r~   zlist[SIMDKernel])r~   Optional[str])F)r  zlist[BaseSchedulerNode]r  r   r  r   r  r   rE  r   r~   zlist[tuple[str, Any, Any]])r~   r  )r  r  r  r  r~   r  )r;  r  r  r   r~   r  )r;  r  re   r{   r  r{   r~   r  )r~   z"list[dict[str, tuple[sympy.Expr]]])
r  list[NodeScheduleEntry]r  r{   r  r{   r  rG   r~   =tuple[dict[str, sympy.Expr], Optional[dict[str, sympy.Expr]]])r  rc  re   r{   r  r{   r;  r  )r  rc  re   r{   r  r{   r  zlist[dict[str, sympy.Expr]])r  Optional[CoalesceVarAnalysis]r~   r  )r  re  r~   rd  r   )-r   r   r   r   r   r|   r  r  r  r  can_fuse_verticalr  r  r  r  r  r  r  r  rn  rs  r  r  r  r   r   r  r  r  r  r  r(  r  r/  r   r   r   r  r  rL  rO  r\  r!  r  r   r   rY   rW   r  r  Z  s-   
 (K'Q6B !"^@
P
, 

 
 :G)R
1
	
 -F SXt	tlI #(; 0;   $;  	; 
 ;   ;  
$; z)2 }  }~ 

,

@T

	

 

 
$
 
 
	
 
 /%/ / $	/
 
/ /( w
 
,w wr M$.M$ $M$ $	M$
 /M$ 
GM$ M$^ 
.
 
 $	

 &
 
  .  $	
 4  
 ;?	
 9	 
	 	 
 ;?~$
 9~$ 
G~$ ~$@<"rY   r  T)frozenc                  H    \ rS rSr% S\S'   S\S'   SrS\S'   \S	 5       rS
rg)r  i	  r  r;  r   r  Nrb  rb   c                |    [         R                  R                  R                  U 5      n U S:  =(       a    U S-  S:H  $ )z@Somewhat arbitrary heuristic used to boost scores for some sizesr  r   r  )r   s    rW   r  CandidateTiling.is_good_size	  s5     GG&&q)Bw(AFaK(rY   r   )	r   r   r   r   r  rb   r  r  r   r   rY   rW   r  r  	  s)    !!JD-) )rY   r  c                      \ rS rSrSrg)r  i	  r   N)r   r   r   r   r   r   rY   rW   r  r  	  s    rY   r  )r   )rV   r   r~   r   )r  r  r~   rx   )|
__future__r   rD  r  dataclassesr   r/  r@  r  r  rB  r   typingr   r   r   r   r	   r
   r   typing_extensionsr   r   rR   torch._loggingtorch._inductor.tiling_utilsr   %torch.fx.experimental.symbolic_shapesr   torch.fx.immutable_collectionsr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   r   _dynamo.utilsr    r   r   r   analyze_preserves_zero_maskr   	codecacher   dependenciesr    r!   r"   r#   optimize_indexingr$   runtime.runtime_utilsr%   r&   r'   r(   r)   utilsr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   virtualizedr5   r6   r7   block_analysisr9   commonr:   r;   r<   r=   multi_kernelr>   simd_kernel_featuresr?   r@   rA   rB   rC   collections.abcrD   rE   rF   rG   	getLoggerr   ri  _logginggetArtifactLoggerr  r  
fusion_logdoprintr  rf  rX   	dataclassr[   r}   r   r  r  r|   r  r  	Exceptionr  r   rY   rW   <module>r     s   "           X X X %    B G 9 / L L  & $ $ F ! 6 6  A ; D D    - , / P P %  <<@ !00<H~~//*E^^--hA
 	78;
 5+ 5+ 5+pN;/ N;b;'? ;'| +;Td('/*B dNs"^ s"l) d#	) 	) $	)		 	rY   