
    8ha                      S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKJrJr  S SKJr  S SKJrJrJrJrJrJr  S SKrS SKJr  S SKrS SKrS SKJs  Jr  S SK J!r!  S SK"J#r#J$r$  S S	K%J&r&  S S
K'J(r(  S SK)J*r*J+r+J,r,  S SK-J.r.  SSK/J0r0J1r1J2r2J3r3  SSK4J5r5  SSK6J7r7J8r8J9r9  SSK:J;r;  SSK<J=r=J>r>J?r?J@r@  SSKAJBrB  SSKCJDrD  SSKEJFrF  SSKGJHrHJIrIJJrJJKrK  SSKLJMrMJNrN  SSKOJPrPJQrQJRrRJSrS  SSKJTrTJUrUJVrVJWrWJXrXJYrYJZrZJ[r[J\r\J]r]J^r^J_r_J`r`Jara  SSKbJcrdJereJfrfJgrg  SSKhJiri  SSKjJkrk  SSKlJmrmJnrnJoroJprpJqrqJrrrJsrsJtrtJuruJvrvJwrwJxrxJyryJzrzJ{r{  SS K|J}r}J~r~JrJrJrJr  SS!KJrJrJrJrJr  SS"KJr  \(       a&  S S#KJr  S S$KJr  S S%KJr  SS&K8Jr  SS'KJr  \" S(5      r\GR(                  " \5      r\GR.                  GR1                  \S)5      r\GR.                  GR1                  \S*5      r\GR.                  GR1                  \S+5      r\;" 5       r: " S, S-5      r\" S5      SWS. j5       r\" S5      SWS/ j5       r " S0 S15      r\GR@                   " S2 S35      5       r\GR@                   " S4 S55      5       r        SXS6 jr " S7 S8\v5      r\" 5       GRJ                  rSYS9 jrSYS: jrSZS; jrSYS< jrS[S= jrS\S> jr " S? S@\q5      rS]SA jrS^S_SB jjr " SC SD\u5      r\GRc                  SE5         " SF SG\5      r " SH SI5      r\GR@                   " SJ SK5      5       r " SL SM5      r\GR@                   " SN SO5      5       r " SP SQ\p\\\\\\4   4   4   5      r " SR SS\\   5      r " ST SU\5      rS`SV jrg)a    )annotationsN)IterableSequence)	lru_cache)AnyCallablecastOptionalTYPE_CHECKINGUnion)
PRECEDENCE)get_interface_for_device)identitypreserve_rng_state)is_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)has_triton_package   )free_symbol_is_type
prefix_strsymbol_is_typeSymT)ValueRanges   )configirmetrics)AsyncCompile)	code_hashget_pathPyCodeCachewrite_atomic)DefaultHandler)triton_heuristics)benchmarker)AutotuneHintDevicePropertiesTRITON_MAX_BLOCKTRITON_MAX_RSPLIT)get_max_y_gridnext_power_of_2)BaseSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfDelayReplaceLineget_bounds_index_exprget_fused_kernel_nameget_kernel_metadatais_welford_reductionPlaceholderprefix_is_reduction	sympy_dotsympy_product
sympy_substriton_typetriton_version_uses_attrs_dictupcast_compute_type)_opsReductionType	StoreModeV)"get_kernel_category_by_source_code   )BlockPatternMatcher)ArgNameBackendFeatureConstexprArgCSECSEVariableDeferredLineIndentedBufferInplacedBufferOpOverridesPythonPrinter
RemovedArgSizeArg	TensorArgWorkspaceArgWorkspaceZeroMode)constant_reprIterationRangesIterationRangesEntryIterationRangesRoot
SIMDKernelSIMDScheduling)	config_ofequal_1_arg_indicesnon_constexpr_signatureshould_unwrap_unspec_argsignature_to_meta)SymbolicCallArg)
ModuleType)TypeVarDtypePropagationOpsHandler)IRNode)SIMDKernelFeatures_T
perf_hintsschedulefusionc                  J    \ rS rSr% Sr0 rS\S'   0 rS\S'   \S
S j5       r	Sr
g	)OpDtypeSupportw   z
Some Triton ops such as libdevice and tl.math only support float32 and float64.
This class records which dtypes are supported by specific IR ops.
z"dict[str, OrderedSet[torch.dtype]]supported_dtypeszdict[str, bool]convert_outputsc                    UR                   n[        [        R                  [        R                  /5      U R
                  U'   X R                  U'   g N)__name__r   torchfloat32float64rp   rq   )clsfuncconvert_outputop_names       X/var/www/fran/franai/venv/lib/python3.13/site-packages/torch/_inductor/codegen/triton.pyregister_upcastOpDtypeSupport.register_upcast   s;    --(2EMM5==3Q(RW%'5G$     N)ry   zCallable[..., str]rz   boolreturnNone)rt   
__module____qualname____firstlineno____doc__rp   __annotations__rq   classmethodr}   __static_attributes__r   r   r|   rn   rn   w   s1    
 <>8=')O_)6 6r   rn   c                 x    [        5       (       d  gSSKn [        U R                  R                  S5      (       a  gg)zX
import AttrsDescriptor if the triton version is new enough to have this
class defined.
 r   NAttrsDescriptorz4from triton.compiler.compiler import AttrsDescriptor)r   triton.compiler.compilerhasattrcompiler)tritons    r|   gen_attr_descriptor_importr      s3     # v''):;;Er   c                     [        5       n U R                  S5        [        5       =n(       a  U R                  U5        U R                  S5        U R	                  5       $ )NzD
        import triton
        import triton.language as tl
        a  
        from torch._inductor.runtime import triton_helpers, triton_heuristics
        from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
        )rN   splicer   	writelinegetvalue)imports	attr_descs     r|   gen_common_triton_importsr      s[    GNN	 /00y0)$NN	 r   c                     \ rS rSrSr\" \R                  \R                  /5      r	\" \R                  \R                  \R                  /\	Q5      r\ V VVVs0 s H$  nU[        R                  " [         U    S3SSS9_M&     snnnn r\ V VVVs0 s H2  nU[        R                  " [         U   R%                  5        S3SSS9_M4     snnnn r\SS j5       r\SS	 j5       rS
rgs  snnnn f s  snnnn f )TritonSymbols   zM
Stores sympy.Symbol instances and constants associated with triton codegen.
offsetTintegernonnegativeBLOCKr   positivec                4    U R                   UR                     $ rs   )block_sizessymtrx   trees     r|   get_block_sizeTritonSymbols.get_block_size   s    tyy))r   c                4    U R                   UR                     $ rs   )block_offsetsr   r   s     r|   get_block_offsetTritonSymbols.get_block_offset   s      ++r   r   N)r   rX   r   zsympy.Symbol)rt   r   r   r   r   r   r   R0_INDEXR1_INDEXreduction_typesXBLOCKYBLOCKZBLOCKblock_typessympySymbolr   r   upperr   r   r   r   r   ).0r   r   r   s   0000r|   r   r      s    !$--!?@Odkk4;;VoVWK  D 	ellj./v6RVWWM  	  D 	ell$%%'(.t
 	
  	K * * , ,#
s   +C'
9C/
r   c                      \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S
\S'   SS jrSS jrSS jrSS jrSS jr	\
SS j5       rSrg)IndexingOptions   str	index_strOrderedSet[str]	mask_varsOptional[str]
expand_strr   _has_rindex
sympy.Exprindexc                ,    [        U R                  5      $ rs   )r   r   selfs    r|   has_maskIndexingOptions.has_mask   s    DNN##r   c                J    [        U R                  [        R                  5      $ rs   )r   r   r   TMPr   s    r|   has_indirectIndexingOptions.has_indirect   s    "4::txx88r   c                    U R                   $ rs   )r   r   s    r|   
has_rindexIndexingOptions.has_rindex   s    r   c                :    [        S U R                   5       5      $ )Nc              3  V   #    U  H  n[        U5      R                  S 5      v   M!     g7f)tmpNr   
startswithr   masks     r|   	<genexpr>.IndexingOptions.has_tmpmask.<locals>.<genexpr>   s"     J>43t9''..>   ')anyr   r   s    r|   has_tmpmaskIndexingOptions.has_tmpmask   s    J4>>JJJr   c                :    [        S U R                   5       5      $ )Nc              3  V   #    U  H  n[        U5      R                  S 5      v   M!     g7f)rNr   r   s     r|   r   ,IndexingOptions.has_rmask.<locals>.<genexpr>   s"     H3t9'',,r   r   r   s    r|   	has_rmaskIndexingOptions.has_rmask   s    HHHHr   c                    U R                   (       a2  SR                  [        [        [        U R                   5      5      5      $ S$ )N & r   )r   joinsortedmapr   r   s    r|   mask_strIndexingOptions.mask_str   s4     =ANNEJJvc#t~~678	
PV	
r   r   Nr   r   r   r   )rt   r   r   r   r   r   r   r   r   r   propertyr   r   r   r   r|   r   r      sN    N$9 KI 
 
r   r   c                     \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S
\S'   S\S'   S
\S'   SrS\S'   \S#S j5       r\S#S j5       r\S#S j5       r	\S#S j5       r
          S$S jr\            S%S j5       r        S&S jrS'S(S jjr      S)S jrS*S jrS+S jrS,S jrS,S jrS,S jrS,S  jrS,S! jrS"rg)-BlockPtrOptions   BlockParametersparamsr   constant_offset	list[int]orderr   r   Sequence[sympy.Expr]broadcast_shapez
list[bool]broadcasting_dimsfinal_shapeNzOptional[list[int]]_boundary_checkc                .    U R                   R                  $ rs   )r   shaper   s    r|   r   BlockPtrOptions.shape   s    {{   r   c                .    U R                   R                  $ rs   )r   block_shaper   s    r|   r  BlockPtrOptions.block_shape   s    {{&&&r   c                .    U R                   R                  $ rs   )r   stridesr   s    r|   r  BlockPtrOptions.strides      {{"""r   c                .    U R                   R                  $ rs   )r   offsetsr   s    r|   r  BlockPtrOptions.offsets  r  r   c                b  ^	 [        U R                  U R                  5       VVs/ s H(  u  pVU(       a  [        R                  R
                  OUPM*     nnn[        XU5      n[        R                  R                  m	U=(       a<    [        U5      [        U5      :H  =(       a    [        U	4S j[        Xs5       5       5      n[        U R                  5      (       a7  U(       d0  SU S[        R                  R                  U R                  5       S3n[        XR                  U5      nU$ s  snnf )z
Generate a broadcast and a reshape for the block pointer.
This restores stride-0 dimensions which were removed from the block pointer.
c              3  |   >#    U  H1  u  pTR                  US 5      =(       d    TR                  X5      v   M3     g7frF   N)statically_known_equals)r   pre_dimpost_dimsizevarss      r|   r   @BlockPtrOptions.codegen_broadcast_and_reshape.<locals>.<genexpr>%  sE       *O%G 00!< G33GFG)Ns   9<tl.broadcast_to(, ))zipr   r   r   SOnetriton_reshaperD   graphr  lenallr   kernelindex_to_str)
r   valueinitial_shaper   allow_implicitdimis_broadcastingpre_broadcast_shapesupports_implicit_broadcastr  s
            @r|   codegen_broadcast_and_reshape-BlockPtrOptions.codegen_broadcast_and_reshape	  s    ),$$d&<&<)
)$ +EGGKK3) 	 
 u5HI 77##&4 '
#$K(88   *--@)N  	$ t%%&&/J&ugR0E0EdFZFZ0[/\\]^E u&:&:KH9
s   /D+c                V  ^^ [         R                  R                  mS	U4S jjnU" U R                  5      U l        U" U R                  5      U l        U R                   Vs/ s H  nTR                  US5      PM     nnU R                   Vs/ s H  nTR                  US5      PM     n	n[        U	5      (       a  SU	S'   [        U R                  U	5       VV
s/ s H  u  pU
(       a  M  UPM     nnn
[        X5       Vs/ s H  n[        U5      PM     snmU4S jn[        S
0 [        R                  " U 5      R                  5        VVs0 s H  u  pX" U5      _M     snnD6n U Vs/ s H  n[        R                  U5      PM     nn[         R                   R"                  (       a&  US   R$                  S:X  d   eUR'                  S5        [         R                   R(                  n[         R                   R*                  (       d  [-        U R                  5      [-        [         R                   R.                  5      U-
  :X  aN  [         R                   R0                  R3                  5       (       a!  U[4        R6                  R8                  /U-  -  n[;        U [         R                  R                  R=                  U5      [?        [A        [C        [-        U R                  5      5      5      5      UUUUS9nURE                  XB5        U$ s  snf s  snf s  sn
nf s  snf s  snnf s  snf )z,Helper to create a  BlockPtrOptions instancec                R   > U  Vs/ s H  nTR                  U5      PM     sn$ s  snf rs   )lookup_precomputed_size)exprsexprr  s     r|   lookup_size+BlockPtrOptions.create.<locals>.lookup_sizeA  s&    GLMutH44T:uMMMs   $r   rF   Fc                d   > [        U T5       VVs/ s H  u  pU(       a  M  UPM     snn$ s  snnf )z@Removes any broadcasting or singleton dims from a given sequence)r  )ititemis_removableremovable_dimss      r|   remove_dims+BlockPtrOptions.create.<locals>.remove_dimsc  s7     +.b.*A*A&D# *A  s   ,,x)r   r   r   r   r   r   r   )r*  zIterable[sympy.Expr]r   list[sympy.Expr]r   )#rD   r  r  r   r  r  r  r  r  r   r   dataclassesasdictitemsr   r   r  no_x_dimprefixpopnum_reduction_dimsinside_reductionr  numelsfeaturesis_reductionr   r  r  r   r)  listreversedrangecompute_boundary_check)r   r   range_treesr   get_max_blockr,  strider   r!  singleton_dimsis_singletonr   dimsr4  keyvalr   r   reduction_ndimresultr3  r  s                       @@r|   createBlockPtrOptions.create4  s    77##	N #6<<0$V^^4
 GMnn
FTFH,,VQ7n 	 
 AG@R@R
@RH,,S!4@R 	 
 ~!&N2 &)););^%L
%L! %L 	 
 14N0VW0V#d)0VW	 ! 
5@5G5G5O5U5U5WX5WsK$$5WX

 GRRkd}33D9kR88q>((C///OOA44))FNN#s188??';n'LL!!..00 EGGKK=>99K GG,,DD_Uxc&,,&7 89:#+/
 	%%mA


 X Y Ss*   !LLL*L LL *L&c                B    [         R                  U   n[        XU05      $ )z>
Replaces instances of {symt}_offset with the new expression.
)r   r   r=   )r   r+  replacementr   roffsets        r|   replace_offsetBlockPtrOptions.replace_offset  s$      --d3$+ 677r   c           	       ^  SU 4S jjn[         R                  R                  n/ T R                  QnU(       d  U Vs/ s H
  oc" U5      PM     nnT R                  S:w  a  U SU" T R                  5       S3OUSU" T R
                  5       3SU" T R                  5       3SU" T R                  5       3SU" T R                  5       3S	U" U5       3/nS
SR                  U5       S3$ s  snf )z
Codegen a call to tl.make_block_ptr()

Args:
    name: variable name for pointer
    roffset: should rn_offset be included in offsets=..., for use with tl.advance()

Returns:
    "tl.make_block_ptr(...)"
c                   > [         R                   H*  nTR                  U [        R                  " S5      U5      n M,     U $ Nr   )r   r   rV  r   Integer)r+  r   r   s     r|   remove_roffsets/BlockPtrOptions.format.<locals>.remove_roffsets  s5    %55**4q1A4H 6Kr   r    + (r  zshape=zstrides=zblock_shape=zorder=zoffsets=ztl.make_block_ptr(r  )r+  r   r   r   )
rD   r  r  r  r   r   r  r  r   r   )r   namerU  r\  fr  r   argss   `       r|   formatBlockPtrOptions.format  s    	
 HH!!!DLL/=DEW6v.WGE ''1, &Qt3345Q7Qtzz]O$q'(1T--./0Qtzz]O$qzl#
 $DIIdO#4A66 Fs   C*c           
     &   [         R                  R                  nU Vs0 s H8  n[        R                  UR
                     U" [        UR
                     5      _M:     nn[        [        [         R                  R                  U5      5      n[        [        U R                  5      5       Vs/ s GHK  nUR                  U R                  U   [         R"                  R$                  5      (       a  MB  U(       a<  [        R                  [&        R(                     U R*                  U   R,                  ;   dp  UR/                  U R                  U   U R*                  U   5      (       a  M  UR/                  U R                  U   [1        U R*                  U   U5      5      (       a  M  [         R                  R2                  (       a5  U R*                  U   [        R                  [&        R4                     :X  a  GMI  UPGMN     snU l        gs  snf s  snf )z6List of indices to pass to tl.load(boundary_check=...)N)rD   r  r  r   r   r   r   r   r   r  needs_yz_grid_overflowrE  r  r   r  r  r   r  Zeror   r   r  free_symbolsstatically_known_multiple_ofr=   r;  r   r   )r   rH  rG  r  tblock_to_maxneeds_overflow_grididxs           r|   rF  &BlockPtrOptions.compute_boundary_check  s    77## !/
  %%aff-}Z=O/PP  	 /
 "#ahh&E&E{"ST S_- 
-44T\\#5FU  ,)55dkkB++C0==> %AA JJsOT-=-=c-B  !) E E JJsO&t'7'7'<lK! * HH%%((-1J1J4;;1WW- C- 
/
 
s&   ?H	-AH2A2H(9H%AH9Hc                8    U R                   c   eU R                   $ rs   r   r   s    r|   boundary_checkBlockPtrOptions.boundary_check  s     ##///###r   c           	         [         R                  U   nU R                   Vs/ s HA  nU R                  X2U5      U R                  U[        R
                  R                  U5      -
  PMC     nnU$ s  snf )aF  
Codegen string to pass to tl.advance(name, ...).

Advance is the difference between offsets in each loop iteration.
To compute it, we replace rN_offset with multiples of RN_BLOCK.
Since we expect rN_offset to vary in range(0, rN_numel, RN_BLOCK), the first
iteration has rN_offset=0, while the second has rN_offset=RN_BLOCK.
)r   r   r  rV  r   r  rf  )r   r   rblockr   advances        r|   advance_roffsetBlockPtrOptions.advance_roffset  sw     **40 ,,

 ' ##FD9%%feggllDAB ' 	 
 
s   AA.c                    gNFr   r   s    r|   r   BlockPtrOptions.has_indirect      r   c                :    [        S U R                   5       5      $ )Nc              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7frs   )r   r   r   )r   r+  s     r|   r   -BlockPtrOptions.has_rindex.<locals>.<genexpr>  s'      
(  m&C&CDD(r   )r   r  r   s    r|   r   BlockPtrOptions.has_rindex  s"     
((
 
 	
r   c                "    U R                  5       $ rs   )r   r   s    r|   r   BlockPtrOptions.has_rmask  s      r   c                    grx  r   r   s    r|   r   BlockPtrOptions.has_tmpmask  rz  r   c                4    [        U R                  5       5      $ rs   )r   rp  r   s    r|   r   BlockPtrOptions.has_mask  s    D'')**r   ro  r   r7  )
r  r   r  r   r   r   r   r   r   r   )r   r   r   r   rG  list[IterationRangesRoot]r   r   rH  Callable[[str], int]r   r   )r+  r   rT  r   r   r   r   r   T)r_  r   r   r   )rH  r  rG  r  r   r   )r   r   )r   r   r   r   r   )rt   r   r   r   r   r   r   r   r  r  r  r%  staticmethodrQ  rV  rb  rF  rp  ru  r   r   r   r   r   r   r   r   r|   r   r      s   ))!!%%+/O(/! ! ' ' # # # #)) ,) *	)
 ) 
)V TT $T /	T
 #T ,T 
T Tl88-78?C8	8!7F2
+2
 /2
 
	2
h$&
!+r   r   c                   [        U[        5      (       a  [        U[        5      (       d   eU Vs/ s H"  n[        R                  R	                  U5      PM$     nnU Vs/ s H"  n[        R                  R	                  U5      PM$     nnXE:X  a  U $ U Vs/ s H  ofS:w  d  M
  UPM     snU:w  a  SU  SSR                  U5       S3$ Sn/ nU HK  n	U[        U5      :  a   XU   :X  a  UR                  S5        US-  nM2  U	S:X  d   eUR                  S	5        MM     U[        U5      :X  d   eU  S
SR                  U5       S3$ s  snf s  snf s  snf )z<Workaround https://github.com/triton-lang/triton/issues/28361ztl.reshape(z, [r  z])r   :rF   r   [])
isinstancerC  rD   r  r  r   r  append)
r  	old_shape	new_shaper   old_shape_strnew_shape_strsrl  expandsizes
             r|   r  r    sF    i&&:i+F+FFF?HIyeQXX**51yMI?HIyeQXX**51yMI% -=aH=->UG3tyy'?&@CC
CF]##c0B(BMM#1HC3;;MM&!  #m$$$$WAdii'(**% JI .s   )E )E
	E#Ec                  @   \ rS rSrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jr	S"S jr
S"S	 jrS"S
 jrS"S jrS"S jrS"S jrS"S jrS"S jrS#S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jrS"S jr S"S jr!S r"g!)$TritonPrinteri4  c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ )NrF   libdevice.trunc(r   ).to(r  r  ra  _printrD   r  index_dtyper   r+  s     r|   _print_TruncToIntTritonPrinter._print_TruncToInt5  M    499~"""t{{499Q<89qxx?S?S>TTUV	
r   c                    [         R                  " 5       (       a$  [        R                  R                  (       a  U nU$ SU S3nU$ )Nztl.full([], z, tl.float64))r   	is_fbcoderu   versionhip)r   r+  rets      r|   _print_FloatTritonPrinter._print_Float;  s@    %--"3"3FC 
 !m4C
r   c                    [        UR                  5      S:X  d   eU R                  UR                  S   [        S   S-
  5      nU S3$ )NrF   r   Atom      ?z.to(tl.float64))r  ra  parenthesizer   )r   r+  r  s      r|   _print_ToFloatTritonPrinter._print_ToFloatB  sI    499~"""diilJv,>,DEO$$r   c                   UR                   u  p#UR                  (       a8  UR                  (       a'  U R                  UR                   S[        S   S-
  5      $ U R	                  U5      nU R	                  U5      nSU SU S3$ )N % r  r  z!triton_helpers.remainder_integer(r  r  )ra  is_nonnegative	stringifyr   r  r   r+  quotdivquot_sdiv_ss         r|   _print_PythonModTritonPrinter._print_PythonModG  sp    II	3#5#5>>$))UJv4F4LMMT"C 26("UG1EEr   c                ,   UR                   (       d   eUR                  u  p#UR                  (       a8  UR                  (       a'  U R                  UR                  S[        S   S-
  5      $ U R                  U5      nU R                  U5      nSU SU S3$ )N // r  r  z!triton_helpers.div_floor_integer(z,  r  )
is_integerra  r  r  r   r  r  s         r|   _print_FloorDivTritonPrinter._print_FloorDivO  s|    II	3#5#5>>$))VZ5G#5MNNT"C 26(#eWAFFr   c                P    U R                  UR                  S[        S   S-
  5      $ )N / r  r  )r  ra  r   r  s     r|   _print_IntTrueDivTritonPrinter._print_IntTrueDivZ  s#    ~~dii
60BS0HIIr   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ NrF   libdevice.floor(r   r  r  r  r  s     r|   _print_floorTritonPrinter._print_floor_  r  r   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ r  r  r  s     r|   _print_FloorToIntTritonPrinter._print_FloorToInte  r  r   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ NrF   libdevice.ceil(r   r  r  r  r  s     r|   _print_ceilingTritonPrinter._print_ceilingk  K    499~""" TYYq\!: ;5AUAU@VVWXXr   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ r  r  r  s     r|   _print_CeilToIntTritonPrinter._print_CeilToInto  r  r   c                ,    SU R                  U5       S3$ )Nzlibdevice.sqrt(().to(tl.float32)))r  r  s     r|   _helper_sqrtTritonPrinter._helper_sqrts  s    !$++d"3!44EFFr   c                    SU R                  UR                  S   5       SU R                  UR                  S   5       S3$ )Nlibdevice.pow(r   r  rF   r  )r  ra  r  s     r|   _print_FloatPowTritonPrinter._print_FloatPowv  s?    T[[167r$++diiPQl:S9TTUV	
r   c                6   UR                   S   R                  (       a;  S[        UR                   S   5       SU R                  UR                   S   5       S3$ SU R                  UR                   S   5       SU R                  UR                   S   5       S3$ )Nr   r  r  rF   r  )ra  
is_Integerfloatr  r  s     r|   _print_PowByNatural!TritonPrinter._print_PowByNatural{  s    99Q<""#E$))A,$7#84;;tyyQR|;T:UUVWWT[[167r$++diiPQl:S9TTUV	
r   c                    U R                  UR                  S   5      nU R                  UR                  S   5      nU R                  UR                  S   5      nSU SU SU S3$ )Nr   rF   r   	tl.where(r  r  )doprintra  )r   r+  cpqs        r|   _print_WhereTritonPrinter._print_Where  s_    LL1&LL1&LL1&1#Rs"QCq))r   c                   [        UR                  5      S:X  a  U R                  UR                  S   5      $ [        UR                  5      S-  n[        U5      nU R                  U" UR                  SU 6 5      nU R                  U" UR                  US 6 5      n[	        S XV4 5       5      u  pVUS;   d   SU S35       eS	U S
U SU SU SU S
U SU SU S3$ )z1
Helper for max/min code generation.
cmp: > or <
rF   r   r   Nc              3  .   #    U  H  nS U S3v   M     g7f)(r  Nr   r   r6  s     r|   r   6TritonPrinter._print_min_max_helper.<locals>.<genexpr>  s     .v!q1Xvs   )><zUnexpected comparator: ''r  z * ( z= z) + )))r  ra  r  typetuple)r   r+  cmpmidrx   abs          r|   _print_min_max_helper#TritonPrinter._print_min_max_helper  s    
 tyy>Q;;tyy|,,$))n!4jKKTYYt_-.KKTYYst_-. .v..j C$<SE"CC 1#T!AcU"QCtA3d1#Qse1QCrBBr   c                &    U R                  US5      $ )Nr  r  r  s     r|   
_print_MinTritonPrinter._print_Min      ))$44r   c                &    U R                  US5      $ )Nr  r  r  s     r|   
_print_MaxTritonPrinter._print_Max  r  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrF   tl_math.abs(r   r  r  ra  r  r  s     r|   
_print_AbsTritonPrinter._print_Abs  s9    499~"""dkk$))A,78::r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrF   zlibdevice.cos((r   r  r  r  s     r|   _print_OpaqueUnaryFn_cos&TritonPrinter._print_OpaqueUnaryFn_cos  :    499~""" TYYq\!: ;;LMMr   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrF   zlibdevice.cosh((r   r  r  r  s     r|   _print_OpaqueUnaryFn_cosh'TritonPrinter._print_OpaqueUnaryFn_cosh  :    499~"""!$++diil";!<<MNNr   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrF   zlibdevice.acos((r   r  r  r  s     r|   _print_OpaqueUnaryFn_acos'TritonPrinter._print_OpaqueUnaryFn_acos  r  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrF   zlibdevice.sin((r   r  r  r  s     r|   _print_OpaqueUnaryFn_sin&TritonPrinter._print_OpaqueUnaryFn_sin  r  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrF   zlibdevice.sinh((r   r  r  r  s     r|   _print_OpaqueUnaryFn_sinh'TritonPrinter._print_OpaqueUnaryFn_sinh  r  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrF   zlibdevice.asin((r   r  r  r  s     r|   _print_OpaqueUnaryFn_asin'TritonPrinter._print_OpaqueUnaryFn_asin  r  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrF   zlibdevice.tan((r   r  r  r  s     r|   _print_OpaqueUnaryFn_tan&TritonPrinter._print_OpaqueUnaryFn_tan  r  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrF   zlibdevice.tanh((r   r  r  r  s     r|   _print_OpaqueUnaryFn_tanh'TritonPrinter._print_OpaqueUnaryFn_tanh  r  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrF   zlibdevice.atan((r   r  r  r  s     r|   _print_OpaqueUnaryFn_atan'TritonPrinter._print_OpaqueUnaryFn_atan  r  r   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )NrF   zlibdevice.log2((r   r  r  r  s     r|   _print_OpaqueUnaryFn_log2'TritonPrinter._print_OpaqueUnaryFn_log2  r  r   c                    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S[        R                  R
                   S3$ )NrF   zlibdevice.llrint(r   r  r  r  r  s     r|   _print_RoundToIntTritonPrinter._print_RoundToInt  sM    499~"""DIIaL 9:%@T@T?UUVW	
r   c                    [        UR                  5      S:X  d   eUR                  u  p#UR                  (       a  US:  d   e[        SU S35      eU R	                  U[
        S   5      nSU SU SU*  3$ )	Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulzlibdevice.nearbyint(1e * z) * 1e)r  ra  r  
ValueErrorr  r   )r   r+  numberndigits
number_strs        r|   _print_RoundDecimal!TritonPrinter._print_RoundDecimal  s    499~"""))Q;;abiajjkl  &&vz%/@A
'yJ<vwhZPPr   r   N)r+  r   r   r   )r+  r   r  r   r   r   )#rt   r   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r"  r%  r(  r+  r5  r   r   r   r|   r  r  4  s    
%
FGJ


YYG


*C&55;NOONOONOOO
Qr   r  c                *    [        [        U 5      5      $ )zCConvert torch.dtype to triton type and upcast [b]float16 to float32)r>   r@   dtypes    r|   triton_compute_typer:    s    *5122r   c                `    U [         R                  :X  a  [         R                  n [        U 5      $ )z@Convert torch.dtype to triton type, with fix for storing tl.bool)ru   r   int8r>   r8  s    r|   triton_store_typer=    s"    



ur   c                    [        U 5      (       a1  U R                  (       a   U R                  S::  a  [        R                  $ [        U 5      $ )z0Implicit upcasts used for Triton reduction types   )r   	is_signeditemsizeru   int32r@   r8  s    r|   upcast_acc_dtyperC    s3    5??u~~7J{{u%%r   c                *    [        [        U 5      5      $ )z:Convert torch.dtype to triton type, with reduction upcasts)r:  rC  r8  s    r|   triton_acc_typerE    s    /677r   c                F    U R                   S:*  =(       a    U R                  $ )Nr   )rA  is_floating_pointr8  s    r|   low_precision_fprH    s    >>Q:5#:#::r   c                    [        U [        5      (       d  gU R                  n[        U[        R                  5      (       a  [	        U5      $ S$ rx  )r  rL   r9  ru   rH  )varr9  s     r|   low_precision_fp_varrK     s<    c;''IIE&0&D&DE"O%Or   c                  2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )TritonCSEVariablei  c                X   > [         TU ]  XU5        [        5       U l        Uc   S5       eg )Nz!TritonCSEVariable must have dtype)super__init__r   r   )r   r_  boundsr9  	__class__s       r|   rP  TritonCSEVariable.__init__	  s-    u-*4, E"EE r   c                p   U H  n[        U[        5      (       a'  U R                  R                  UR                  5        M?  [        U[        R
                  5      (       d  M`  [        R                   H<  n[        XE5      (       d  M  U R                  R                  [        U    S3/5          M     M     g )Nr   )
r  rM  r   updater   r   r   r   r   r   )r   r_  ra  kwargsargr   s         r|   update_on_args TritonCSEVariable.update_on_args  s    C#011%%cmm4C.. *55D%c00--*T2B1C4/H.IJ 6 r   )r   )rQ  zValueRanges[Any]r9  torch.dtyper   r   )rt   r   r   r   rP  rX  r   __classcell__rR  s   @r|   rM  rM    s    F r   rM  c                     SSK Jn   U " 5       $ )Nr   re   )!torch._inductor.dtype_propagationrf   re   s    r|   get_dtype_handlerr_    s    L%''r   c                :   ^ ^^ SS jmSU4S jjmSU UU4S jjnU$ )z
Codegen helper to upcast arguments to float32, depending on the config and dtype.
This decorates tl.math/libdevice codegen functions.
c                    [         R                  R                  (       + =(       aD    [        U [        5      =(       a-    U R
                  [        R                  [        R                  4;   $ rs   )	r   r   codegen_upcast_to_fp32r  rL   r9  ru   float16bfloat16)rJ  s    r|   needs_upcast*maybe_upcast_float32.<locals>.needs_upcast)  sD    444 =3,=		emmU^^<<	
r   c                2   > T" U 5      (       a  SOSnU  U 3$ )N.to(tl.float32)r   r   )rJ  upcast_stringre  s     r|   maybe_upcast_arg.maybe_upcast_float32.<locals>.maybe_upcast_arg0  s$    -9#->->)B}o&&r   c                L   >^  [         R                  T T5        SUU UU4S jjnU$ )Nc                   > U  Vs/ s H  nT" U5      PM     nnUR                  5        VVs0 s H  u  pEUT" U5      _M     nnnT" U0 UD6nT=(       a7    [        U4S j[        R                  " XR	                  5       5       5       5      nU(       d  S O#[        [        5       TR                  5      " U 0 UD6n	U	[        R                  S 4;  n
U
(       a  U	b  S[        U	5       S3OSnU U 3$ s  snf s  snnf )Nc              3  4   >#    U  H  nT" U5      v   M     g 7frs   r   )r   rJ  re  s     r|   r   Kmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<genexpr>?  s      6-ScS!!-Ss   .to(r  r   )r:  r   	itertoolschainvaluesgetattrr_  rt   ru   rv   r>   )ra  rV  rW  upcast_argsrM  rN  upcast_kwargsrP  any_needs_upcastresult_dtypeneeds_downcastdowncast_stringrz   ry   rj  re  s               r|   wrapped8maybe_upcast_float32.<locals>.decorator.<locals>.wrapped8  s   <@ADS+C0DKAHNWHCS"23"77MW ;8-8F-  # 6-6__T==?-S6 3
 ( .0$--@$Q&Q 
 *%--1FFN "l&> {<013 
 Xo.//' BWs
   C0C5r   )rn   r}   )ry   r{  rz   rj  re  s   ` r|   	decorator'maybe_upcast_float32.<locals>.decorator4  s$    &&t^<	0 	0. r   r   r   )ry   Callable[..., Any]r   r  r   )rz   r}  rj  re  s   ` @@r|   maybe_upcast_float32r  #  s    
' : r   c                     \ rS rSrSr\R                  " \R                  5      r\	  SP   SQS jj5       r
\	SRS j5       r\	S 5       r\S 5       r\	\" 5       S	 5       5       r\	S
 5       r\	S 5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r\	S 5       r\	S 5       r\	S 5       r\	S 5       r\	S\R8                  SSS.S j5       r\	\" 5       S 5       5       r\	\" 5       S 5       5       r\S 5       r \	S 5       r!\	\" 5       S 5       5       r"\	\" 5       S 5       5       r#\	\" 5       S 5       5       r$\	\" 5       S 5       5       r%\	\" 5       S 5       5       r&\	\" 5       S  5       5       r'\	\" 5       S! 5       5       r(\	\" 5       S" 5       5       r)\	\" 5       S# 5       5       r*\	\" 5       S$ 5       5       r+\	\" 5       S% 5       5       r,\	\" 5       S& 5       5       r-\	\" 5       S' 5       5       r.\	\" 5       S( 5       5       r/\	\" 5       S) 5       5       r0\	\" 5       S* 5       5       r1\	\" 5       S+ 5       5       r\	\" 5       S, 5       5       r2\	S- 5       r3\	S. 5       r4\	S/ 5       r5\	S0 5       r6\	S1 5       r7\	S2 5       r8\	S3 5       r9\	S4 5       r:\	S5 5       r;\	S6 5       r<\	S7 5       r=\	S8 5       r>\	S9 5       r?\	S: 5       r@\	\" 5       S; 5       5       rA\	\" 5       S< 5       5       rB\	\" 5       S= 5       5       rC\	\" 5       S> 5       5       rD\	\" 5       S? 5       5       rE\	S@ 5       rF\	\" 5       SA 5       5       rG\	\" 5       SB 5       5       rH\	\" 5       SC 5       5       rI\	\" SDSE9SF 5       5       rJ\	\" SDSE9SG 5       5       rK\	\" 5       SH 5       5       rL\	\" 5       SI 5       5       rM\	SJ 5       rN\	SK 5       rO\	\" 5       SL 5       5       rP\	SM 5       rQ\	\" 5       SN 5       5       rRSOrSg)STritonOverridesiT  zMap element-wise ops to TritonNTc                J         SS jnUb=  [        U" X!5      [        R                  R                  5      [        R                  l        U[        R
                  :X  a  SU  S3$ U[        R                  :X  a  U  S3$ U(       a  [        U5      nO[        U5      nU  SU S3$ )Nc                    X:X  a  g[         R                  [         R                  4nX;   a  X;   a  X:w  a   S5       eU [         R                  :X  d  U[         R                  :X  a  gU [         R                  :X  d  U[         R                  :X  a  gg)Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!r?  r   )ru   float8_e4m3fnfloat8_e5m2)	src_dtype	dst_dtype
fp8_dtypess      r|   _get_min_elements_per_thread>TritonOverrides.to_dtype.<locals>._get_min_elements_per_thread`  s     % ##!!J '+*U U	U 
 E---e>O>O1OE///9@S@S3Sr   r  z != 0)z.to(tl.int8).to(tl.uint8)rp  r  )r  rZ  r  rZ  r   int)	maxrD   r  min_elem_per_threadru   r   uint8r:  r=  )r6  r9  r  use_compute_typesr  	out_dtypes         r|   to_dtypeTritonOverrides.to_dtypeY  s    	"	/:		6   ,/,Y>,,,AHH(
 EJJqc= ekk! S122+E2I)%0ID1%%r   c                    UR                   UR                   :X  d   eU R                  U:w  a  U  S[        U5       S3n U  S[        U5       S3n[        U5      U:w  a  U S[        [        U5      5       S3nU$ )Nrp  r  z, bitcast=True))rA  r9  r>   r@   )r6  r9  r  outs       r|   to_dtype_bitcast TritonOverrides.to_dtype_bitcast  s    !!U^^333 77i#T+i013A4E*+?;u%.Ek*=e*DEFaHC
r   c           	         [         R                  R                  U5      n[        U" U 5      5      n[	        U5      nUS:X  a  U$ U S:  a(  UR
                  (       d  SUSS   3nSU SU SU SU S3	$ SU SU SU S3$ )	Nz
tl.float32r   ztl.r?  tl.full(r  r  r  )ru   _prims_commondtype_to_typerW   r:  r@  )r  r9  r   type_
triton_valr>   triton_signed_types          r|   _shaped_constant TritonOverrides._shaped_constant  s    ##11%8"5<0
)%0,& 19U__#&{12&7!8eWBzl"5G4Hk]Z[\\eWBzl"[MCCr   c                "    U R                  X/ S9$ )Nr   )r  )rx   r  r9  s      r|   constantTritonOverrides.constant  s    ##E#;;r   c                    SU  S3$ )Nr  r  r   r6  s    r|   absTritonOverrides.abs       aS""r   c                    SU  SU S3n[        U 5      (       d  [        U5      (       aN  [        5       R                  X5      nU[        R                  [        R
                  4;   a  U S[        U5       S3nU$ )Nr  r  r  rp  )rK  r_  truedivru   rc  rv   r>   r6  yr  r  s       r|   r  TritonOverrides.truediv  sp    !Cs!n""&:1&=&=)+33A9IU]]EMM::T+i"8!9;
r   c                    SU  SU S3n[        U 5      (       d  [        U5      (       aN  [        5       R                  X5      nU[        R                  [        R
                  4;   a  U S[        U5       S3nU$ )Nr  r  r  rp  )rK  r_  modru   rc  rv   r>   r  s       r|   r  TritonOverrides.mod  sp    !Cs!n""&:1&=&=)+//5IU]]EMM::T+i"8!9;
r   c                f    [         R                  (       a  SU  S[        R                   S3$ SU  S3$ )z
When use_fast_math, use the ftz (flushing to zero) variant
of exponent computation.

Check https://github.com/triton-lang/triton/issues/5735 for
more details.
libdevice.exp2(r0  r  ztl_math.exp()r   use_fast_mathr  _LOG_2_Er  s    r|   expTritonOverrides.exp  s8     $QCs?+C+C*DAFF!!A&&r   c                    SU  S3$ )Nr  r  r   r  s    r|   exp2TritonOverrides.exp2       !1%%r   c                    SU  S3$ )Nzlibdevice.expm1(r  r   r  s    r|   expm1TritonOverrides.expm1       "!A&&r   c                    SU  S3$ )Nzlibdevice.sqrt(r  r   r  s    r|   sqrtTritonOverrides.sqrt  r  r   c                   [         R                  R                  nUS:X  a  gUS:X  a	  SU  SU  S3$ US:X  a  U  S3$ Uc:  [        R                  " [        R
                  " S	[        R                  5      U 5      $ [        S
U< 35      e)Ncompile_errorzcompile error!runtime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", r  accuracyz + 1r   z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )	r   r   inject_relu_bug_TESTING_ONLYopsmaximumr  ru   rB  AssertionError)r6  bugs     r|   reluTritonOverrides.relu  s    mm88/!#O# 8s:YZ[Y\\]^^JS:[;;s||Au{{;Q?? LSGT r   c                    SU  SU S3$ )Nztriton_helpers.minimum(r  r  r   r  r  s     r|   minimumTritonOverrides.minimum      (2aS22r   c                    SU  SU S3$ )Nztriton_helpers.maximum(r  r  r   r  s     r|   r  TritonOverrides.maximum  r  r   c                    SU  SU SU S3$ )Nr  r  r  r   )r  r  r  s      r|   whereTritonOverrides.where  s    1#Rs"QCq))r   rF   )constraintsr9  is_purepackc                    [        U5      nSR                  U Vs/ s H  n[        U5      PM     sn5      nUc&  SR                  S/U V	s/ s H  n	SPM     sn	-   5      nSU  SU SU SU SU S	U S
3$ s  snf s  sn	f )Nr  z=rr   ztl.inline_asm_elementwise('z', 'z', [z	], dtype=z
, is_pure=z, pack=r  )r:  r   r   )
asmr  r9  r  r  inputsr>   i
input_refs_s
             r|   inline_asm_elementwise&TritonOverrides.inline_asm_elementwise  s     *%0YY71A78
))TF6-B6ac6-B$BCK,SEk]$zlR[\g[hhrszr{  |C  DH  CI  IJ  K  	K  8-Bs   A5A:
c                    SU  S3$ )Nztl_math.cos(r  r   r  s    r|   cosTritonOverrides.cos  r  r   c                    SU  S3$ )Nztl_math.sin(r  r   r  s    r|   sinTritonOverrides.sin  r  r   c                    [        S5      e)Nz/ops.index_expr not implemented outside a kernelNotImplementedError)rx   r+  r9  s      r|   
index_exprTritonOverrides.index_expr$  s    !"STTr   c                    [        S5      e)Nz+ops.masked not implemented outside a kernelr  )r   bodyothers      r|   maskedTritonOverrides.masked(  s    !"OPPr   c                    SU  S3$ )Nzlibdevice.lgamma(r  r   r  s    r|   lgammaTritonOverrides.lgamma,       #1#Q''r   c                    SU  S3$ )Nzlibdevice.erf(r  r   r  s    r|   erfTritonOverrides.erf1        s!$$r   c                    SU  S3$ )Nzlibdevice.cosh(r  r   r  s    r|   coshTritonOverrides.cosh6  r  r   c                    SU  S3$ )Nzlibdevice.sinh(r  r   r  s    r|   sinhTritonOverrides.sinh;  r  r   c                    SU  S3$ )Nzlibdevice.acos(r  r   r  s    r|   acosTritonOverrides.acos@  r  r   c                    SU  S3$ )Nzlibdevice.acosh(r  r   r  s    r|   acoshTritonOverrides.acoshE  r  r   c                    SU  S3$ )Nzlibdevice.asin(r  r   r  s    r|   asinTritonOverrides.asinJ  r  r   c                    SU  S3$ )Nzlibdevice.asinh(r  r   r  s    r|   asinhTritonOverrides.asinhO  r  r   c                    SU  SU S3$ )Nzlibdevice.atan2(r  r  r   r6  r  s     r|   atan2TritonOverrides.atan2T       "!Bqc++r   c                    SU  S3$ )Nzlibdevice.atan(r  r   r  s    r|   atanTritonOverrides.atanY  r  r   c                    SU  S3$ )Nzlibdevice.atanh(r  r   r  s    r|   atanhTritonOverrides.atanh^  r  r   c                    SU  SU S3$ )Nzlibdevice.copysign(r  r  r   r  s     r|   copysignTritonOverrides.copysignc  s     %QCr!A..r   c                    SU  S3$ )Nzlibdevice.erfc(r  r   r  s    r|   erfcTritonOverrides.erfch  r  r   c                    SU  S3$ )Nzlibdevice.erfinv(r  r   r  s    r|   erfinvTritonOverrides.erfinvm  r  r   c                    SU  SU S3$ )Nzlibdevice.hypot(r  r  r   r  s     r|   hypotTritonOverrides.hypotr  r  r   c                    SU  S3$ )Nzlibdevice.log10(r  r   r  s    r|   log10TritonOverrides.log10w  r  r   c                    SU  S3$ )Nzlibdevice.log2(r  r   r  s    r|   log2TritonOverrides.log2|  r  r   c                    SU  SU S3$ )Nzlibdevice.nextafter(r  r  r   r  s     r|   	nextafterTritonOverrides.nextafter  s     &aS1#Q//r   c                    U  SU 3$ Nr   r   r  s     r|   logical_andTritonOverrides.logical_and      Cs|r   c                    U  S3$ )Nz == 0r   r  s    r|   logical_notTritonOverrides.logical_not  s    E{r   c                    U  SU 3$ Nz | r   r  s     r|   
logical_orTritonOverrides.logical_or  r1  r   c                    SU  SU S3$ )Nr   ^ r  r   r  s     r|   logical_xorTritonOverrides.logical_xor  s    1#S1~r   c                    U  SU 3$ r.  r   r  s     r|   bitwise_andTritonOverrides.bitwise_and  r1  r   c                    SU  3$ )N~r   r3  s    r|   bitwise_notTritonOverrides.bitwise_not  s    1#wr   c                    U  SU 3$ r7  r   r  s     r|   
bitwise_orTritonOverrides.bitwise_or  r1  r   c                    U  SU 3$ )Nr;  r   r  s     r|   bitwise_xorTritonOverrides.bitwise_xor  r1  r   c                    U  SU 3$ )Nz << r   r  s     r|   bitwise_left_shift"TritonOverrides.bitwise_left_shift      D}r   c                    U  SU 3$ )Nz >> r   r  s     r|   bitwise_right_shift#TritonOverrides.bitwise_right_shift  rN  r   c                     SU S3nSU  SU S3$ )Nr  ).to(tl.uint32)ztl.rand(r  r  r   seedr   s     r|   randTritonOverrides.rand  s%    VHO,$r&++r   c                     SU S3nSU  SU S3$ )Nr  rS  z	tl.randn(r  r  r   rT  s     r|   randnTritonOverrides.randn  s%    VHO,4&6(!,,r   c           	     ,    SU S3nSU  SU SU SU S3	$ )Nr  rS  ztriton_helpers.randint64(r  r  r   )rU  r   lowhighs       r|   	randint64TritonOverrides.randint64  s1    VHO,*4&6("SED6KKr   c                    [        S5      e)Nz.ops.load_seed not implemented outside a kernelr  )r_  r   s     r|   	load_seedTritonOverrides.load_seed  s    !"RSSr   c                    SU  S3$ )Nzlibdevice.rsqrt(r  r   r  s    r|   rsqrtTritonOverrides.rsqrt  r  r   c                    SU  S3$ )Nzlibdevice.log1p(r  r   r  s    r|   log1pTritonOverrides.log1p  r  r   c                    SU  S3$ )Nzlibdevice.tan(r  r   r  s    r|   tanTritonOverrides.tan  r  r   c                    SU  S3$ )Nzlibdevice.tanh(r  r   r  s    r|   tanhTritonOverrides.tanh  r  r   c                    SU  S3$ )Nztl.sigmoid(r  r   r  s    r|   sigmoidTritonOverrides.sigmoid  s     QCq!!r   c                    SU  SU  SU  S3$ )Nz(libdevice.signbit(z) != 0) if (z).dtype is tl.float32 else z < 0r   r  s    r|   signbitTritonOverrides.signbit  s#     "!L3NqcQUV	
r   c                    SU  SU S3$ )Nzlibdevice.fmod(r  r  r   r  s     r|   fmodTritonOverrides.fmod  s     !2aS**r   c                    SU  SU S3$ )Nr  r  r  r   r  s     r|   powTritonOverrides.pow  s      s"QCq))r   c                    SU  S3$ )Nztl_math.log(r  r   r  s    r|   logTritonOverrides.log  r  r   F)rz   c                    SU  S3$ )Nzlibdevice.isinf().to(tl.int1)r   r  s    r|   isinfTritonOverrides.isinf       "!M22r   c                    SU  S3$ )Nzlibdevice.isnan(r  r   r  s    r|   isnanTritonOverrides.isnan  r  r   c                    SU  S3$ )Nzlibdevice.nearbyint(r  r   r  s    r|   roundTritonOverrides.round  s     &aS**r   c                    SU  S3$ )Nr  r  r   r  s    r|   floorTritonOverrides.floor  r  r   c                H    U  SU 3nU  SU 3nSU  SU SU SU SU SU S	3$ )
Nr  r  z
tl.where((z
 < 0) != (z < 0), tl.where(z != 0, z - 1, ), r  r   )r  r  r  rems       r|   floordivTritonOverrides.floordiv  sV    
 D}3qclA3j+;C5vVTXSYY\]a\bbcddr   c                l   [         R                  " S[        R                  5      n[         R                  " [         R
                  " X5      [        R                  5      n[         R                  " [         R
                  " X5      [        R                  5      n[         R                  " X#5      nU SU  S3$ )Nr   rp  .dtype))r  r  ru   rB  r  ltr<  sub)r6  zleftrightr  s        r|   signTritonOverrides.sign  so    LLEKK(||SVVA\EJJ7cffQlUZZ8ggd"d1#W%%r   c                    SU  S3$ )Nr  r  r   r  s    r|   truncTritonOverrides.trunc  r  r   c                    U  SU 3$ )Nr  r   r  s     r|   truncdivTritonOverrides.truncdiv  s     D}r   c                    SU  S3$ )Nr  r  r   r  s    r|   ceilTritonOverrides.ceil   r  r   r   )NT)r9  rZ  r  zOptional[torch.dtype])r9  rZ  r  rZ  )Trt   r   r   r   r   mathr(  er  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  ru   rv   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r"  r%  r+  r/  r4  r8  r<  r?  rC  rF  rI  rL  rP  rV  rY  r^  ra  rd  rg  rj  rm  rp  rs  rv  ry  r|  r  r  r  r  r  r  r  r  r  r   r   r   r|   r  r  T  su   (yy H ,0	6&6& )6& 6&p    D D" < < #  #     '  ' &  & '  ' &  &  " 3 3 3 3 * * "&emmTPQK K #  # #  # U U Q Q (  ( %  % &  & &  & &  & '  ' &  & '  ' ,  , &  & '  ' /  / &  & (  ( ,  , '  ' &  & 0  0                     , , - - L L T T '  ' '  ' %  % &  & "  " 
 
 +  + *  * #  # /3 0 3 /3 0 3 +  + '  ' e e & & '  '  
 &  &r   r  r   c                     ^  \ rS rSrSrU 4S jr\\R                  S 5       5       r	\S 5       r
\S 5       r\S 5       r\S 5       r\S	 5       rS
rU =r$ )TritonKernelOverridesi)  a  Map element-wise ops to Triton within a TritonKernel

Unlike TritonOverrides, these assume the code is going to be inserted into
the body of the main triton kernel and so it may use indexing and mask
variables which are assumed to already be defined in the current scope.
c                F   > [         TU ]  " U0 UD6  U R                  5         g rs   )rO  rP  _setup_libdevice_routing)r   ra  rV  rR  s      r|   rP  TritonKernelOverrides.__init__1  s#    $)&) 	%%'r   c                  ^ SSK Jm  [        R                  R                  R
                   H  n[        X5      (       d   e[        X5      nU4S jnUS:X  aE  [        TS5      (       d   e[        R                  " X2US9nXl
        [        X[        U5      5        Mq  S n[        R                  " XRUS9nXl
        [        X[        U5      5        M     g)z<Set up routing to libdevice implementations for fp64 inputs.r   )OpDecompositionsc                   > U R                   [        R                  :w  a  U" U 5      $ [        TU5      " U 5      R                  $ rs   )r9  ru   rw   rt  r  )r6  _original_impl_fn_namer  s      r|   decomposition_routerLTritonKernelOverrides._setup_libdevice_routing.<locals>.decomposition_routerC  s7    77emm+)!,,"#3X>qAGGGr   rp  )r  r  c                `    U R                   [        R                  :X  a	  SU SU  S3$ U" U 5      $ )Nz
libdevice.r  r  )r9  ru   rw   )r6  r  r  s      r|   dtype_routerDTritonKernelOverrides._setup_libdevice_routing.<locals>.dtype_routerR  s2    77emm+'z1#Q77)!,,r   N)torch._inductor.codegen.commonr  ru   	_inductorutilsop_requires_libdevice_fp64r   rt  	functoolspartialrt   setattrr  )rx   fn_nameoriginal_implr  fnr  r  s         @r|   r  .TritonKernelOverrides._setup_libdevice_routing8  s    
 	D,,GGG3((((#C1MH )#/;;;;&&(QX &l2&67- ""WB "KC,r"23; Hr   c                j    [         R                  R                  5       nS/U-  nU R                  XUS9$ )NrF   r  )rD   r  triton_tensor_ndimr  )rx   r  r9  ndimr   s        r|   r  TritonKernelOverrides.constant^  s7    
 xx**,d
##E#>>r   c                8   [         R                  R                  USS9n[        U[        5      (       d   e[         R                  R                  5       nU[        R                  [        R                  4;  a  UOUn[        R                  R                  n S[        R                  l        [         R                  R                  R                  [         R                  R                  UR                  [!        U5      US9nU[        R                  l        U[        R                  [        R                  4;  aZ  [         R                  R                  R                  [         R                  R                  U R#                  Xb5      [%        U5      S9nOUnUR&                   Ht  n[)        U[*        R,                  5      (       d  M$  [        R.                  " U[         R                  R                  R0                  UR2                     R4                  5      nMv     X$:w  aP  [         R                  R                  R                  [         R                  R                  U R#                  Xd5      US9nUR6                  Ul        U$ ! U[        R                  l        f = f)NF	block_ptrrQ  r9  r8  )rD   r  indexingr  r   get_index_dtype_as_torch_dtyperu   rB  int64r   test_configsruntime_triton_dtype_assertcsegeneratecomputer   r5   r  r@   rg  r   r   r   promote_typesvarname_mapr_  r9  r   )rx   r+  r9  r  r  origrJ  	index_vars           r|   r   TritonKernelOverrides.index_exprg  s   88$$TU$;(O4444 hh==?u{{EKK&@@k "">>		C>CF;((,,''  "",T2	 ( C ?CF;ekk22((,,''  S()%0 ( C  E!..	!)TXX66!//qxx||77	GMME / #hhll++HH$$LL2% ,  !**
9 ?CF;s   A*J Jc           
        U bm  [         R                  R                  bR  [        R                  R
                  R                  [        R                  R                  U  S3[         R                  S9n UR                  R                  SS9nU(       d   S5       eSnU HH  nUR                   H5  nUR                  S:w  d  [        UR                  S   5      (       d  M2  S	n  MF     MJ     U(       a  S OUn[        R                  R                  XS
9 nU" 5       n	S S S 5        U(       a  W	R                  R                   (       a  [        U5      n[        R                  R
                  R                  [        R                  R                  SU	 S[#        U5       SU	 S3[$        R&                  " U5      U	R(                  S9n[*        R,                  " WX5      n
OW	n
U
R.                  R1                  W5        U
$ ! , (       d  f       N= f)N.to(tl.int1)r8  output)opz)graph for body does not contain an outputFloadrF   Tr  r  z.shape, r  r  r  )ru   r  r  rD   r  r  r  r  r   r  
find_nodesra  targetr`   
mask_loadsrQ  is_boolrW   r   wrapr9  r  r  r   discard)r   r  r  nodes
need_wherenoderW  r  new_maskrP  r  s              r|   r  TritonKernelOverrides.masked  s    1 1 =88<<((  &%jj ) D 

%%%2AAAu
 Dyy::'+CCHHQK+P+P!%J !  #XX   3xVF 4 }}$$UHHLL))  6((=+?*@6('R"''.ll	 * E ))Hf4CCh'
' 43s   G77
Hc                    [         R                  R                  R                  U 5      nSU S[         R                  R                  R	                  SU5       S3$ )Ntl.load( + load_seed_offsetr  )rD   r  ra  inputseed_offset)r_  r   rJ  s      r|   ra  TritonKernelOverrides.load_seed  sI    hhmm!!$'se3qxx}}889KVTUUVW	
r   c                   SU  S3n[         R                  R                  R                  U5      =n(       a  U$ [         R                  R                  R	                  U R
                  S9n[         R                  R                  R	                  [        R                  S9n[         R                  R                  R                  U SU SU  S35        [         R                  R                  R                  XU45        X44$ )Nzfrexp(r  r8  r  z = triton_helpers.frexp()rD   r  r  try_getnewvarr9  ru   rB  r  r   put)r6  	cache_keycse_valmantissaexponents        r|   frexpTritonKernelOverrides.frexp  s    QCqM	hhll**95575N88<<&&QWW&588<<&&U[[&9	""j8*$<QCqA	
 	
x$89##r   r   )rt   r   r   r   r   rP  r   r  cacher  r  r  r  r  ra  r  r   r[  r\  s   @r|   r  r  )  s    ( __"4  "4H ? ? 0 0d * *X 
 
 $ $r   r  c                  V    \ rS rSr% SrS\S'   S\S'   SS jrSS	.SS
 jjrS rS r	Sr
g)HelperFunctionsi  z#An ordered set of helper functions.zdict[str, str]_templates_seen	list[str]finalized_helpersc                     0 U l         / U l        g rs   r  r  r   s    r|   rP  HelperFunctions.__init__  s    !!#r   _triton_helper_fn	base_namec                   U R                   R                  U5      nUb  U$ U [        U R                  5       3nX@R                   U'   U R                  R	                  UR                  US95        U$ )a  This accepts a function definition with the function name
left as a format specifier e.g.

    @triton.jit
    def {name}(arg0, arg1):
        return arg0 + arg1

We add the templated code to the function set and return the name
assigned to that function.

)r_  )r  getr  r  r  rb  )r   template_coder  existing_namer_  s        r|   addHelperFunctions.add  su     ,,00?$  S!7!789:.2]+%%m&:&:&:&EFr   c                ,    [        U R                  5      $ rs   )iterr  r   s    r|   __iter__HelperFunctions.__iter__  s    D**++r   c                     U R                   U   $ rs   )r  )r   rl  s     r|   __getitem__HelperFunctions.__getitem__   s    %%c**r   r  Nr   r   )r  r   r   r   )rt   r   r   r   r   r   rP  r  r
  r  r   r   r   r|   r  r    s+    -##  $ 4G ,,+r   r  c                      \ rS rSr% Sr\R                  " \S9rS\	S'   \R                  " \S9r
S\	S'   \R                  " \S9rS\	S'   \R                  " \S9rS\	S'   SS	 jrS
rg)r   i  zE
Class representing ND block dimensions, for block pointer analysis.
)default_factoryr7  r   r  r  r  c                    [        U 5      n[        S X4 5       5      u  p4U" S0 U Vs0 s H  oUX5   XE   -   _M     snD6$ s  snf )z 
Concatenates block parameters.
c              3  N   #    U  H  n[         R                  " U5      v   M     g 7frs   )r8  r9  r  s     r|   r   *BlockParameters.__add__.<locals>.<genexpr>  s     BMq[''**Ms   #%r   )r  r  )r   r  rx   r  r  rM  s         r|   __add__BlockParameters.__add__  sL     4jBTMBB9a8as16AF?*a8998s   Ar   N)r  r   r   r   )rt   r   r   r   r   r8  fieldrC  r   r   r  r  r  r  r   r   r   r|   r   r     sf     *//EEE$/$5$5d$KK!K + 1 1$ GGG + 1 1$ GGG:r   r   c                  4    \ rS rSrSrS rS	S jrS rS rSr	g)
"CooperativeReductionWorkspaceCachei  z
The scratch space used for cooperative reductions can be reused
after two reduction loops.  This keeps track of what can be reused.
c                    Xl         / U l        / U l        [        R                  " [        R
                  5      U l        SU l        SU l        g rZ  )	ra  current_loop
prior_loopcollectionsdefaultdictdequeready_for_reuse
loop_countstore_count)r   ra  s     r|   rP  +CooperativeReductionWorkspaceCache.__init__  s>    	*66{7H7HIr   c                    U R                   R                  U5      nU(       a  UR                  5       $ U R                  R	                  US5      u  p4U R
                  R                  XU45        X44$ rx  )r   r  popleftra  	workspacer  r  )r   nbytescachedws_name	ws_offsets        r|   allocate+CooperativeReductionWorkspaceCache.allocate&  sc    %%))&1>>##!YY00?  &9!=>##r   c                    U R                    H%  u  pnU R                  U   R                  X#45        M'     U R                  U l         / U l        U =R                  S-  sl        g NrF   )r  r   r  r  r!  )r   r'  r)  r*  s       r|   on_loop_end.CooperativeReductionWorkspaceCache.on_loop_end.  sT    *.//&FY  (//0DE +:++1r   c                H    U R                   nU =R                   S-  sl         U$ r.  )r"  )r   priors     r|   increment_store_count8CooperativeReductionWorkspaceCache.increment_store_count6  s#      Ar   )ra  r  r!  r  r   r"  N)r'  r   )
rt   r   r   r   r   rP  r+  r/  r3  r   r   r   r|   r  r    s    
$r   r  c                  ,    \ rS rSr% S\S'   S rS rSrg)FixedTritonConfigi<  zdict[str, int]r   c                     U R                   U   $ rs   r   r   r1  s     r|   r  FixedTritonConfig.__getitem__@  s    {{4  r   c                    XR                   ;   $ rs   r8  r9  s     r|   __contains__FixedTritonConfig.__contains__C  s    {{""r   r   N)rt   r   r   r   r   r  r<  r   r   r   r|   r6  r6  <  s    !#r   r6  c                  "    \ rS rSrSrSS jrSrg)	TritonCSEiG  zy
Subclasses CSE to apply the current load mask to the cache key to avoid CSEing
variables across separate masked blocks.
c                b    [         R                  R                  =n(       a  XR                  4$ U$ rs   )rD   r  
_load_maskr_  )r   r  r   s      r|   augment_keyTritonCSE.augment_keyM  s*    88&&&4&yy))r   r   N)r  r   r   zUnion[str, tuple[str, str]])rt   r   r   r   r   rB  r   r   r   r|   r?  r?  G  s    
r   r?  c                    ^  \ rS rSr% \rS\S'   \rS\S'   Sr	   SK     SLU 4S jjjr
SMS	 jrSNS
 jrS rS rS rS rSNS jrS r\SOS j5       rSSSSS. SPS jjr SQ       SRS jjrSQS jr        SSS jrS rSTS jr SU         SVS jjrS r  SW               SXS jjrSOS jrSYS jr          SZS jr   S[S  jr!S[S! jr"S" r#S# r$S$ r%S% r&S& r'      S\S' jr(S]S( jr)        S^S) jr*          S_S* jr+S+ r,S`S, jr-S- r.S. r/S/ r0\1S0 5       r2SUS1 jr3\1S2 5       r4\1S3 5       r5S4 r6SaS5 jr7S6 r8SUSbS7 jjr9ScS8 jr:SdS9 jr;SeS: jr<SfS; jr=      SgS< jr>SfS= jr?ShS> jr@SiS? jrASjS@ jrBSNSA jrCSkSB jrD\ES`SC j5       rFSlSD jrGSmSE jrH\ESnSF j5       rISoSG jrJSlSH jrK      SpSI jrLSJrMU =rN$ )qTritonKerneliT  r  helper_functionszCallable[[sympy.Expr], str]kexprTNc                8  > X0l         X@l        [        TU ]  " U40 UD6  [	        U R
                  U R                  5      U l        [        5       U l	        [        5       U l
        [        [           " 5       U l        X l        [        R                   " 5       U l        [$        [&        [&        4   " 5       U l        [+        5       U l        [.        R0                  " [$        5      U l        [.        R4                  " 5       U l        [        [8           " 5       U l        S U l        U R>                  (       a  U RA                  U RB                  5        U RD                  (       a  U RG                  5         U RI                  5         U RD                  (       a  U RK                  5         g g rs   )&optimize_maskfixed_configrO  rP  r?  newvar_prefixsuffixr  rN   post_loop_combinepost_loop_storer   r   outside_loop_varsr  rq  countblock_ptr_iddictr   block_ptr_to_bufferr  rF  r  r  pointer_advancementsCounter_load_countsr)   autotune_hintstriton_metar?  codegen_reduction_numelsr  cooperative_reductioninit_cooperative_reductioncodegen_range_treeinit_cooperative_reduction_mask)r   tilingr  rI  rJ  rV  rR  s         r|   rP  TritonKernel.__init__Z  s-    $1(*6*T//=1?1A/=/?!+C!2#6 %OO-#'S>#3  / 1##D) 	! 7B6I6I6K )6859  ))$))4%%++-!%%002 &r   c                    [        U5      $ rs   )r>   )r   r9  s     r|   dtype_to_strTritonKernel.dtype_to_str  s    5!!r   c                z    U R                   =(       a)    [        R                  R                  U R                  5      $ rs   )r?  rD   choices should_use_cooperative_reductionrA  r   s    r|   re  -TritonKernel.should_use_cooperative_reduction  s-    $$ 
)S)SMM*
 	
r   c                6  ^  T R                   (       d   eT R                   H'  nUR                  c  M  U=R                  S-  sl        M)     T R                  S   nT R                  (       a  [        UT R                  S   5      nT R                  R                  U5      T l        [        T R                  5      T l
        T R                  R                  S5        [        U 4S jT R                   5       5      (       a  T R                  R                  S5        gg)z/One time setup code for cooperative reductions.NrF   r6  r   a              RSPLIT_NEXT_POWER_OF_2: tl.constexpr = triton_helpers.constexpr_next_power_of_2(RSPLIT)
            RSPLIT_IS_POWER_OF_2: tl.constexpr = RSPLIT == RSPLIT_NEXT_POWER_OF_2
            HAS_RSPLIT: tl.constexpr = RSPLIT > 1
            rsplit_id = tl.program_id(0)
            num_rblocks = (rnumel + RBLOCK - 1) // RBLOCK
            rsplit_chunk = (num_rblocks + RSPLIT - 1) // RSPLIT * RBLOCK
            rsplit_start = rsplit_chunk * rsplit_id
            rsplit_end = rsplit_chunk * (rsplit_id + 1)
            c              3  v   >#    U  H.  nUR                   (       d  M  TR                  U5      (       + v   M0     g 7frs   )rB  _has_constant_mask)r   r   r   s     r|   r   :TritonKernel.init_cooperative_reduction.<locals>.<genexpr>  s4      
(   .''---(s   99z>rsplit_end = tl.where(rsplit_end < rnumel, rsplit_end, rnumel))rZ  rG  grid_dimr@  rJ  r   ra  
semaphoressemaphores_namer  %cooperative_reduction_workspace_cacher  r   r   r   )r   r   	sem_counts   `  r|   r[  'TritonKernel.init_cooperative_reduction  s    )))) $$D}}(" % KK$		4+<+<X+FGI#yy33I>5WII6
2 					
  
((
 
 

 IIP
r   c                .   SnU R                   (       d  U S3nU R                  R                  SU 35        U R                  5       (       a  U R                  R	                  S5        g U R                   (       a   eU R                  R                  S5        g )Nz$tl.arange(0, RSPLIT_NEXT_POWER_OF_2)z	[None, :]zrsplit_arange = z                if RSPLIT_IS_POWER_OF_2:
                    rsplit_mask: tl.constexpr = None
                else:
                    rsplit_mask = rsplit_arange < RSPLIT
                zSrsplit_mask = xmask if RSPLIT_IS_POWER_OF_2 else ((rsplit_arange < RSPLIT) & xmask))r;  r  r   _has_constant_xmaskr   )r   rsplit_aranges     r|   r]  ,TritonKernel.init_cooperative_reduction_mask  s~    >}},oY7M		.}o>?##%%II }}$$IIer   c                Z   U R                    H}  nUR                  (       d  U R                  XR                  5        M1  U R                  (       d  MD  U R                  R                  UR                   SU R                  U5       35        M     U R                  (       a  [        S U R                    5       5      (       aP  U R                  SSSS9nU R                  U5      nU R                  R                  SU R                  U5       35        g U R                  U R                  5        g g )Nzbase = c              3  8   #    U  H  oR                   v   M     g 7frs   )is_loopr   r   s     r|   r   2TritonKernel.codegen_range_tree.<locals>.<genexpr>  s     =,<D<<,<   baseTr   zrbase = )rG  rw  iteration_ranges_codegen_headerr  r?  r   r<  iteration_ranges_ranges_coder   _get_reduction_symbols_flatten_reduction_indicesr   r  codegen_reduction_indices)r   r   rn_basesrbases       r|   r\  TritonKernel.codegen_range_tree  s    $$D<<44T99E&&& 		##{{m74+L+LT+R*ST %   =D,<,<===66Dd 7  77A		  8D,=,=e,D+E!FG ..tyy9 !r   c                    g)z
Indicate whether we need provide numel as arguments for the generated
kernel calls in the benchmark.

Should be true for pointwise/reduction kernels but false for triton
matmul kernels.
Tr   r   s    r|   need_numel_argsTritonKernel.need_numel_args  s     r   c                    U R                   =(       a4    [        R                  R                  U R                  U R
                  5      $ rs   )r?  rD   rd  should_use_persistent_reductionrA  rZ  r   s    r|   r  ,TritonKernel.should_use_persistent_reduction  s5    $$ 
)R)RMM455*
 	
r   c                
   U R                   (       ar  [        U R                  5      U R                  S-   :X  aL  U R                  (       a  U R                  S   S:H  $ [
        R                  R                  U R                  5      $ g)NrF   r   F)	persistent_reductionr  r@  r>  rJ  rD   rd  want_no_x_dimrA  r   s    r|   r  TritonKernel.want_no_x_dim  sb    %%DKK D$;$;a$??  ((2a7799**4==99r   c                    g)Nztl.device_assertr   r   s    r|   assert_functionTritonKernel.assert_function  s    !r   F)
copy_shapedense_indexingoverride_maskr  c          
     	  ^ ^^^^^ T R                  T5      mTR                  nSn[        5       m[        U[        R
                  " S5      S9 GH  n[        U[        R                  5      (       d   eU=(       d    [        U[        R                  5      nU(       a  MQ  [        U[        R                  5      (       a@  T R                  R                  UR                      n	TR#                  U	R$                  5        M  [        U[        R&                  [        R(                  [        R*                  [        R,                  [        R.                  [        R0                  45      (       a  GM  [        R2                   V
s/ s H  n
[        X5      (       d  M  [4        U
   PM      nn
[7        U5      S:X  d   SUR                    35       eTR9                  US    S35        GM     [:        R<                  R>                  =(       d    U=(       d    T R@                  SL=(       a    TS:g  nS	nSn[        5       nT RC                  5        HF  nURE                  URF                  5      (       a  S	nOSnUR9                  URH                   S35        MH     U(       a  T RJ                  (       a  [:        R<                  RL                  (       a  U(       d  T R@                  (       d~  [7        TU-
  5      S:X  al  T RO                  T5      (       dV  U(       aO  T RP                  S
:X  a?        SS jm      SU 4S jjm      SUU4S jjmSUUUU 4S jjnU" 5       nUb  U$ SnT RS                  T5      n[        T[        RT                  5      (       a  U(       a  U S3OT RW                  5       nSU SU S3nT RX                  (       a"  T R[                  5       (       d  [        S/5      mO
[        5       mT R@                  (       a  TR9                  T R@                  5        []        UTUUT5      $ U(       a/  U(       d(  U(       a  U S3OT RW                  5       nSU SU S3nUmOU(       d  U(       a  SU SU S3nUmU(       a  [        U/5      mT R@                  (       a  TR9                  T R@                  5        T R_                  T5        []        UTUUT5      $ s  sn
f )z?
Compute the index and mask to pass to tl.load() or tl.store()
Fr_  rM  rF   zAmbiguous type: r   r   NTtl.int32c                    [         R                  " XR                  5       5      nUc  g[        UR                  /[
        R                  U5      /U/[
        R                  U5      /S9$ )zg
Matches expressions of the form:
    idx = s * xindex

This implies stride (s,), and shape (XBLOCK,).
Nr   r  r  r  )rG   match_affine_block_exprsymbolr   numelr   r   r   )r   
range_treerI  s      r|   match_affine_block1TritonKernel.indexing.<locals>.match_affine_blockH  sj     -DD,,. >&%++,!.!=!=j!I J#H*;;JGH	 r   c                  >^^ UR                  5       n[        R                  " S[        R                  " [        R
                  U/S9S9u  p4[        S[        TR                  5      U R                  [        X#5      5      U R                  [        X#U5      5      -   5      n[        R                  " XUR                  U5      nUc  gUu  nnn	[        R                  " U5      n
[         R"                  R$                  mTR'                  UR(                  5      m[+        UU4S jU
 5       5      (       a  g[,        R/                  U5      n[1        XS   5      /[3        U
SS USS 5       VVs/ s H%  u  p[        R4                  " [1        X5      U5      PM'     snn-   nU	 Vs/ s H#  n[7        X[,        R9                  U5      05      PM%     nn[;        UUUUS	9$ s  snnf s  snf )
a  
Matches higher-dimensional blocks coming from FloorDiv and ModularIndexing.

Example expression to match:
   sN * ((rindex//(d1 * ... * d(N-1))))
       + s1 * ModularIndexing(rindex, 1, d1)
       + ...
       + s(N-1) * ModularIndexing(rindex, d1 * ... * d(N-2), d(N-1))

This iterates over a block of shape (dN, ..., d1) and stride
(sN, ..., s1). (d1,...,d(N-1)) and (s1,...,sN) are
wildcards that we match.

Note that dN does not appear in the expression, but we solve for it
using range tree numels and the other dims.
zdenom modulo)exclude)rx   r   Nc              3     >#    U  H9  nTR                  UT5      (       + =(       a    TR                  U5      (       + v   M;     g 7frs   )rh  statically_known_power_of_2)r   r  	max_blockr  s     r|   r   ETritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<genexpr>  sG       ". !==eYOO H$@@GGH!-s   AAr   rF   r  )r  r   symbolsr  r  Wildr  r  range_tree_nodesrP  r   r   rG   match_mod_div_block_exprr  get_slice_numelsrD   r  r  r  r<  r   r   r   r   r  Minr=   r   r   )r   r  r  denommodulonum_dimsmatch_resultrL  r  block_index_exprsslice_numelslinear_block_sizer  r!  r  r+  r   r  r  r   s                    @@r|   match_mod_div_block2TritonKernel.indexing.<locals>.match_mod_div_block^  s   ( '--/	 !&"!))%**ykJ! --.HY$>?++oi&OPQ	  3KKj&6&6   ' !	%2CCDI 77++ NN:+<+<=	  ".  
   %2$@$@$L!-A?1 '*,qr*:DH&E&E
 IIg&7?E&E1 !2	3 !2 -*H*H*TU !2	  3 ' +#)	 3s   ,,G"*G c                :   > TT4 H  nU" X5      nUc  M  Us  $    g)zE
Match a block indexing subexpression involving a single range tree.
Nr   )r+  r  
match_funcmatchr  r  s       r|   match_block_pointer_subexpr:TritonKernel.indexing.<locals>.match_block_pointer_subexpr  s3     ''#J 't8E($# r   c            	     ~  > [        TTR                  R                  5        V Vs0 s H  u  pXR                  _M     snn 5      nTR	                  5       nU Vs/ s H'  n[
        R                  " X$R                  5       5      PM)     nn[        S U 5       5      n[        5       n[        X55       H@  u  pH[        UR                  UR                  5      5      S:  a    g T" X5      n	U	c    g Xy-  nMB     U[        U5      -
  n
TR                  T5        [         R#                  UU
UTTR$                  S9$ s  snn f s  snf )Nc              3  @   #    U  H  oR                  5       v   M     g 7frs   )r  rx  s     r|   r   ETritonKernel.indexing.<locals>.match_block_pointer.<locals>.<genexpr>  s     *Q[T;;==[   rF   )r   r   rG  r   rH  )r=   r  r:  r+  active_range_treesrG   get_subexpr_involving_symbolr  r   r   r  r  intersectionrg  sumfilter_masksr   rQ  r  )vri  index_relative_to_xyr_indexrG  r   index_subexprsrange_symbolsblock_paramssubexprr   r   r   r   r  r   s              r|   match_block_pointer2TritonKernel.indexing.<locals>.match_block_pointer  sH   .8$2G2G2M2M2OP2O$!AvvI2OP/+ #557 !,	" !, (DD3[[] !,	  " !+*Q[*Q Q.0%(%EMD =55g6J6JKLqP# 9GF~# *L &F 5s>7JJ !!),&--'$* +'"&.. .  E Q"s   D4.D:z.shaper  r  z, tl.int32)xmaskr  r  .shape))r   r   r  rZ   r   Optional[BlockParameters])r+  r   r  rZ   r   r  )r   zOptional[BlockPtrOptions])0prepare_indexingrg  r   r   operator
attrgetterr  r   r   r   r   r   r   r   r  r  r_  rU  r   UNBACKED_INTSIZEPRECOMPUTED_SIZEINDEXFLOATUNBACKED_FLOATr   r   r  r  r   r   r  rA  r  r  var_listr<  allow_block_ptruse_block_ptris_indirect_indexingr  r  r[  dense_size_strrJ  rr  r   r  )r   r   r  r  r  r  
index_varsr   rJ  cse_varr   prefix_matches
need_dense
have_densehave_loop_varsdense_mask_varsr   r  optionsr   r   r   r  r  r  s   ``                   @@@@r|   r  TritonKernel.indexing  s?    %%e,''

%/\	*(*=*=f*EFCc5<<0000# ~]22(J TXX..((..sxx8  !2!23%%II))JJJJ''
 
 
 !. 9 9" 9%c0 %Jt$ 9  "
 >*a/N3CCHH:1NN/!2 3489? GD MM(( ++d* qj	 	 
+5<++-D&&t}}55!%"
4;;-t 45 . $$++!OOI/0A5--e44  J.!/B*,`!`/B`*`D .A*  * *Z *+G"
%%e,	eU]]++2<J<v.$BUBUBWJ":,b;GI  )A)A)C)C&y1	&L	doo."9iZQVWWj2<J<v.$BUBUBWJ*9+R
|1EI'IJ*9+R
|7KI'I"M?3I??MM$//*)$y)ZUSSy"s   >SSc                   UR                  5       nU(       d  SnOU(       a  US:X  d   eSU< S3nOSU< 3nU R                  (       a  U R                  S   R                  (       a  UR	                  5       (       a  S[        U R                  5       3nU R                  R                  [        X SUR                  USS	9 35      5        XR                  U'   [        R                   HL  nUR                  U5      n[        S
 U 5       5      (       a  M-  U R                   U   n	Xi;  d   S5       eXU'   MN     Xd4$ UR                  U5      nXd4$ )Nr   , other=0.0, boundary_check=z, padding_option='zero'r.  r   = F)rU  c              3     #    U  HC  n[         R                  R                  R                  U[        R
                  " S 5      5      v   ME     g7fr   N)rD   r  r  r  r   r[  )r   r   s     r|   r   1TritonKernel.codegen_block_ptr.<locals>.<genexpr>@  s;      "1 GG$$<<VU]]STEUVV"1s   AAz@duplicate advancement for pointer '{block_ptr}' at type '{symt}')rp  r?  rG  rw  r   nextrQ  r  r   rM   rb  rS  r   r   ru  r  rT  )
r   r_  rJ  r  r  checkr  r   advance_offsetsadvancementss
             r|   codegen_block_ptrTritonKernel.codegen_block_ptr   sg    '')EM)))'y0GHE'y1E!!  $,,##%%#D):):$;#<=IIIKs8??3?+N*OP 37$$Y/ &55"*":":4"@  "1   #88> 4 V4 +:Y' 6"  !,Ir   c                   SU SUR                    S3n[        [        UR                   UR                  5      5       HG  u  nu  px[        R
                  R                  R                  Xx5      (       d  M8  SUR                  U'   MI     UR                  XBR                   UR                  S5      nU S[        [        R
                  R                  U5      5       S3nSU SU U S3$ )Nr  r  r  Frp  	tl.store()r   	enumerater  r   rD   r  r  r  r   r%  r  r=  	get_dtype)	r   r_  r  r  r  r  rl  r!  broadcast_dims	            r|   codegen_block_ptr_store_line)TritonKernel.codegen_block_ptr_store_lineO  s     #5'H,@,@+AC *3$$h&>&>?*
%C%# ww77KK27**3/	*
 66'')=)=u

 '/0A0A$0GHIK9+RwugQ77r   c                    U(       d  U(       d  g [        U[        R                  5      (       d   eU R                  USS9n[        U[        5      (       d   eUR
                  nUR                  5       (       a  UR                  OS nU(       a  [        U R                  U5      5      OS nU R                  Xc(       a  SOS X5      n	U R                  U5      n
U R                  R                  XS[        R                  S9  g )NFr  0)
assignmentr9  )r  r   Exprr  r   r   r   r   texprrename_indexingindirect_assertget_load_bufferr  r  ru   rB  )r   r+  r  lowerr   r  r   r   size_strlinebuffers              r|   check_boundsTritonKernel.check_boundsf  s     $

++++===7(O4444&&	(0(9(9(;(;8$$8=5--d344 ##esx
 %%h/&5Lr   c                &   UR                  5       (       d  UR                  5       (       a  U R                  $ U R                  (       a?  U R                  S   R
                  (       a!  UR                  5       (       d  U R                  $ U R                  $ )Nr.  )	r   r   r  r?  rG  rw  r   r  loads)r   r  s     r|   r  TritonKernel.get_load_buffer  sk      ""h&:&:&<&<<<!!  $,,'')) 99::r   c           
     
  ^^^^^ U R                   R                  T5      nU R                  mTT==   S-  ss'   [        nU R	                  U5      mUnU R                  USS9nUR                  5       mUR                  5       n[        S U R                  U5      R                  5        5       5      nU R                  U5      (       a  Sn	OiU(       d  Sn	O_U R                  (       aL  U R                  S   R                  (       a.  UUUUU4S jn
TT   mSn	[        R                   " ["        S	U
5      nOS
n	U(       d  T(       aB  UR%                  5       (       a-  U R&                  (       a  S[)        U R&                  5       3nOSnOS
n Sn[*        R,                  R.                  (       a"  U R0                  R3                  5       nUT   S:  n U R                  U5      (       + =(       a(    U R                  (       + =(       a    U(       + =(       a    UnS
nU(       a  SnS n[4        R6                  R9                  T5      n[;        T5      (       a8  UnU[<        R>                  [<        R@                  4;   a  [<        RB                  nGOW[E        U[F        5      (       aK  U RI                  TX6U5      u  nnSU U U	 U S3nURK                  UURL                  URN                  S5      nO[[E        U[P        RR                  5      (       a  SU SU S3nURT                  nO&SU SURV                   SURX                   U	 U U S3
nU[<        R>                  [<        R@                  4;   a4  [*        R,                  RZ                  (       a  US-  n[<        RB                  nU[<        R\                  :X  a0  [<        R^                  R`                  c  US-  n[<        R\                  nU Rc                  U5      nU Rd                  Rg                  UU" U5      US9nURh                  S:  a  TT==   S-  ss'   [E        U[j        5      (       d   eURl                  Ul6        U(       a  SU SU S3nU Rd                  Rg                  UUUS9nURl                  (       a  URn                  (       a  SnOU[<        R\                  :X  a  SnOSnU R&                  (       a  [)        U R&                  5      OUnSURX                   SU SU S3nU Rd                  Rg                  UUUS9nU R                  (       a  URq                  5       (       d"  T(       d  U Rr                  Ru                  U5        U$ )NrF   Tr  c              3  *   #    U  H	  oS :H  v   M     g7fr  r   )r   r  s     r|   r   $TritonKernel.load.<locals>.<genexpr>  s      
MqFM   z, eviction_policy='evict_last'r.  c                 6   > TT   T :  a  T(       d  T(       a  gg)N
evict_lastevict_firstr   )expected_countr   indirect_indexingload_countsr_  s   r|   decide_later'TritonKernel.load.<locals>.decide_later  s    t$~5"3'$r   z, eviction_policy='<EP>'z<EP>r   z, other=r  z, cache_modifier='.cg'r  r  r^  r  r  rh  r  r8  r  r  z0.0Truer  r  );ra  r  rV  r   r  r  r   r   r   get_strides_of_loadrs  is_broadcastedr?  rG  rw  r  r  r4   r   _load_otherrW   r   r   skip_l1_cacherA  buffer_read_countsrD   r  r  r`   ru   rc  rd  rv   r  r   r  r%  r  r   r   r[  r   r   r   rb  r   r  r  r  r  r  	use_countrM  r   rG  r   rO  r  )r   r_  r   rJ  	make_lineoriginal_indexr  r   is_coalescedepr  r  has_read_depsr  r  cachemodappend_broadcastr9  r  r  load_buffer
result_varzero	other_valr  r   r  r  s    `                      @@@@r|   r  TritonKernel.load  s   iiood#''DQCK	 55e<==$=7((*
**,  
 44^DKKM
 
 ~..1B1B""t'7'7';'C'C% % ).N+B!))*:FLQIB:8+<+<+>+>"=1A1A#B"CD%E	 ==&&!%!A!A!C.t4q8M	 ##N33 )))!! 	 	 /H!!$'#D))D 77 (O44#'#9#9$u#U 	5!)UGB4zC==(..0D0Dd NEMM::!#d>*:"=#+#6#6 !#d8+=+=*>c(BSBSATUWTXY^X_`h_iijk %--88MM88))

"u}}'8'8'@ &

**84XX&&{IdO5&Q
!#"*&78888'11
%j\4D3EQGD**;E*JJ!!** Dejj(!DD7;7G7GM$"2"23T  #8#4#4"5R
|2i[PQR!XX..{D.N
$$X-?-?-A-A*""&&z2r   c           	        U R                   R                  U5      nUnU R                  USUS L S9nXR                   R                  ;   nU R	                  U5      n	U(       a,  U	(       a%  U R
                  R                  [        US5      5        [        U[        5      (       a(  U R                  XU5      u  pU R                  XXU5      nO_Uc$  SU SUR                   SU SUR                   S3	nO8US	:X  a$  S
U SUR                   SU SUR                   S3	nO[        SU 35      e[        R                   " 5       nU R"                  (       d;  U R$                  (       a*  UR'                  U R)                  XR
                  5      5        U R
                  R                  [        X5      5        U R"                  (       d  U R*                  R-                  U5        UR/                  5         g )NT)r  r  ztl.debug_barrier()r  r^  r  r  r  
atomic_addztl.atomic_add(z, sem='relaxed')zstore mode=)ra  r  r  inplace_buffersr  storesr   rM   r  r   r  r  r   r   r  
contextlib	ExitStackr?  rZ  enter_contextguard_cooperative_storerO  r  close)r   r_  r   r  moderJ  r  r  
is_inplacer  r  r  r  
exit_stacks                 r|   storeTritonKernel.store	  s    iit$==ttt|=T YY666
,,^<.KK!!,t5I"JKh00#55dJI44	%D \se4(:(:';3ugRHYHYGZZ[\D\!#C5X-?-?,@E7"XM^M^L__opD%D6&:;;))+
$$)C)C$$T%A%A$%TUl467$$""&&u-r   c                    U R                   R                  5       nUR                  [        USU S35      5        UR	                  5       $ )z
For cooperative reductions only one thread block should write out the result.
We rotate which thread block does each write for better parallelism
zif rsplit_id == (z % RSPLIT):)rn  r3  r   rM   indent)r   r_  r  rl  s       r|   r/  $TritonKernel.guard_cooperative_store@	  sC    
 88NNPd.?uK,PQR}}r   c                   U R                   R                  [        R                  5        U R                  R                  US   5      nU R                  US   5      n	U R                  US   5      n
U R                  US   5      nU(       a  U R                  R                  US   5      OSnU(       a  U R                  US   5      OSnU[        R                  :X  a  SnO"U[        R                  :X  a  SnO[        S5      eU R                  R                  U R                  S	U S
U S
U	 S
U
 S
U S
U S
U S
U S
U S
U S
U S3US9nU$ )z#
See [Note: Inductor bucketize op]
r   rF   r   r   r   r  ztl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(r  z, )r8  )rW  r  r)   ONE_ELEMENT_PER_THREADra  r  r  ru   rB  r  r  r  r  r  )r   rs  
boundariesboundary_indicesindexing_dtyper  sortersorter_indicesboundaries_ptrboundary_sizeboundaries_underlying_numelboundary_stride
sorter_ptrsorter_stridetriton_dtyperP  s                   r|   	bucketizeTritonKernel.bucketizeI	  s^   $ 	 C CDA7))*Q-8&*&7&7
1&F#++JqM:39TYY__VAY/v
8>))&)4FU[[(%Lu{{*%L%G  ""LL5fXRbr2M1NbQ`Paac nBgRl"]O2  ! # 
 r   c                    U R                  5       nUS:X  a  SU S3$ U R                  nS/X#-
  -  S/U-  -   nU SSR                  U5       S3$ )	NrF   z!triton_helpers.promote_to_tensor(r  r  r   r  r  r  )r  r>  r   )r   r  ndimsnreducesizess        r|   reduction_resizeTritonKernel.reduction_resize|	  sh    '')A:6ugQ??)))VHw,>>$))E*+1--r   c           
         U R                   S:X  a  U$ U R                  5       U R                   -
  nU R                  5       nUSU S/-   n[        U R                  R                  U[        X%U5      US95      $ )z3
Reshape to RBLOCK, collapsing all reduction dims.
rF   NRBLOCKr8  )r>  r  dense_size_listr   r  r  r  )r   r  r  r9  target_ndimr  target_shapes          r|   reduction_collapse_dims$TritonKernel.reduction_collapse_dims	  s    
 ""a'L--/$2I2II,,.$\k2hZ?HHu\JRW  
 	
r   c                x  ^ ^^^3^4^5^6^7^8 S7S jn[         R                  " U5       Vs/ s H  ofR                  PM     nn[         R                  " XT5      n[	        S U 5       5      (       aJ  [
        R                  " U[
        R                  5      n[
        R                  " T[
        R                  5      mT R                  (       d   e[        S T R                   5       5      nT R                  U5        [        U5      nT R                  (       a  UR                  T R                  5        T R                  S   R                  S   n	T R!                  5       m4T R#                  U4U 4S jU5      nT R%                  5       T R&                  -
  m5      S8U5UUU 4S jjm6        S9U64S jjn
U5UU7U 4S	 jnUTU4nUT R(                  R*                  ;   a  T R(                  R*                  U   $ [-        U5      n[/        U5      nT R(                  R1                  US
9n[        S U 5       5      Ul        SR5                  U5      m3U34S jm8T R6                  (       Ga%  [8        R:                  R=                  TU5      nT R#                  [>        U5      nS:U U84S jjnTS:X  a  OG[A        U[B        5      (       a)  [E        UU5       VVs/ s H  u  nnU" UU5      PM     nnnO	U" UU5      nTS;   ay  [F        RH                  RK                  5       n[M        T R(                  RO                  T RP                  SU	 SW S3US
95      nSSS.T   m7U" T RP                  UUU5        UUl        GOTS:X  a=  T RR                  (       a  T RU                  UTUT8UT5      nGOT RW                  TU5      nGOpTS:X  aS  [A        W[X        5      (       d   eUu  nnn[C        UU 4S jT R[                  T RP                  UUUT5T5       5       5      nGOTS:X  a  T R]                  TU5      nGO[A        W[^        5      (       d   eT R(                  RO                  T RP                  T6" T RP                  [M        U5      S 5      UR                  S
9nGOT R(                  Ra                  SU 3US
9n[8        R:                  Rc                  TU5      nT R#                  [>        U5      n[A        U[B        5      (       d5  T Rd                  Rg                  U ST R!                  5        SU SU S35        TS;   a  SU S3nT Rh                  Rk                  5       nT Rd                  Rg                  U ST R!                  5        S[
        Rl                  " U5      Rn                   ST Rq                  U5       S35        SSS.T   m7T RP                  Rs                  SU SU S T7 S!U SU SU SU	 S"U S#T8" U S$3U5       S%U S#T8" U S$3U5       S%35        U" T Rt                  UUU5        GO[w        T5      (       a  T RU                  UTUT8UT5      nGOTS:X  Ga%  SU S&3nSU S'3nT Rd                  Rg                  U ST R!                  5        S(U S35        T Rd                  Rg                  U S)T R!                  5        SU S35        T RP                  Rs                  S*U SU S+U SU SU S[x        Rz                   S,35        T RP                  Rs                  S*U S#T8" U S$3U5       S*U S#T8" U S$3U5       S*3	5        UnT R(                  R1                  TS
9nT R}                  T Rt                  UUUUT5T5      nO[8        R~                  " TU5      n U " UU5      n!T RP                  Rg                  U S#T8" U!U5       35        U[
        R                  :X  a/  U S-3n"[        T5      n#U
" T Rt                  [M        U5      U"U#5        O'U
" T Rt                  [M        U5      [M        U5      S 5        T RR                  (       Ga  [8        R:                  Rc                  TU5      n[        R                  " 5       n$T Rt                  T R                  4 H3  n%U%Rg                  S.5        U$R                  U%R                  5       5        M5     TS;   a  T Rt                  Rg                  U S/T R                  U S035       35        T R                  U S13UU5      n&T Rh                  Rk                  5       nT R                  UU[
        Rl                  " U5      Rn                  5      n'U" T R                  UU&U'5        GOD[w        T5      (       a  TS:X  d   eUu  n(n)n*T R                  U([/        U5      US   5      n+T R                  U)[/        U5      US2   5      n,T R                  U*[/        U5      US3   5      n-T R                  T R                  U(U)U*U+U,U-T5T5	        OTS:X  ae  Uu  nnT R                  U[/        U5      US   5      n.T R                  U[/        U5      US2   5      n/T R}                  T R                  UUU.U/T5T5        O:T R                  U[/        U5      U5      n0U
" T R                  [M        U5      U0S 5        U$R                  5         UT R(                  R*                  U'   [A        U[B        5      (       a  [        S4 U 5       5      (       d   eT R                  R                  U5        TS5;   a  [        U5      S2:X  d   e[        U5      U-  n[        U5      [        U5      :X  d   e[E        X5       HJ  u  n1n2U2c   eU1R                  U2:w  d  M  T Rt                  Rg                  U1 S#U1 S6[        U25       S35        ML     U$ [A        U[        5      (       d   eT R                  R                  U5        UR                  US   :w  a8  US   c   eT Rt                  Rg                  U S#U S6[        US   5       S35        U$ s  snf s  snnf );Nc                    U R                   [        R                  [        R                  4;   a%  [        R
                  " U [        R                  5      $ U $ rs   )r9  ru   rc  rd  r  r  rv   r  s    r|   maybe_upcast,TritonKernel.reduction.<locals>.maybe_upcast	  sF     ;;MMNN UEMM2 r   c              3  f   #    U  H'  o[         R                  [         R                  4;   v   M)     g 7frs   )ru   rc  rd  r  s     r|   r   )TritonKernel.reduction.<locals>.<genexpr>	  s      M_U]]ENN33_s   /1c              3  >   #    U  H  oR                    S 3v   M     g7fr   Nr<  rx  s     r|   r   r[  	       M<LDkk]$/<L   r.  r   c                p   > TR                   R                  TR                  SU  ST S3U R                  S9$ )Nr  r  r  r8  r  r  r  r9  )r  r  r   s    r|   <lambda>(TritonKernel.reduction.<locals>.<lambda>	  s:    dhh''"1#R'7q9gg ( r   c           
        > TS;   nU(       a  SOSnTR                  XT5      nTS;   a  TR                  U ST SU ST S35      nOTR                  U ST S	U ST S35      nUb  U S
U S3nU$ )z3
Helper to generate a reduction call, e.g. tl.sum.
)r   r  minprodtriton_helperstl)r  rf  r.  z2(r  r  r  rp  )rT  rM  )	r  r  result_type
use_helpermoduler!  r9  reduction_typer   s	        r|   final_reduction/TritonKernel.reduction.<locals>.final_reduction	  s     (+HHJ)3%F00FE/--ha/r%3%qA --ha/qr#a@ & 'k]!4Lr   c                D   > T" XU5      nU R                  U SU 35        g)z=
Generate a reduction and assign it to an existing variable.
r  N)r   )r  r$  r  rj  rn  s       r|   final_reduction_define6TritonKernel.reduction.<locals>.final_reduction_define	  s(     $F;?EMMZLE734r   c                   > TR                  XT5      nTR                  XT5      nU R                  SU SU ST SU SU ST SU STR                  U S35       S	35        g )
N                z_val, z_idx = triton_helpers.z_with_index(r  )
                r  _idx
                )rT  r   rM  )r  r$  r  r   r!  r9  root_opr   s       r|   final_argreduce/TritonKernel.reduction.<locals>.final_argreduce	  s    00FE00FEMMF:,.DWI\Z_Y``bchbiiklokp qC 5 5D6I JK Lr   r8  c              3  P   #    U  H  n[        US    5      (       a  M  Uv   M     g7fr  )r:   )r   rJ  s     r|   r   r[  
  s!      *
 C(;CF(CCC5s   &	&r   c                B   > T(       d  U $ [         R                  TX5      $ rs   )r  r  )tvalfvalconds     r|   
where_cond*TritonKernel.reduction.<locals>.where_cond	
  s    (..tT@@r   c                n   > TR                   R                  TR                  T" X5      U R                  S9$ )Nr8  rb  )r  defaultr   r  s     r|   _mask_value+TritonKernel.reduction.<locals>._mask_value
  s3    xx((LL*U"<EKK )  r   online_softmax_reduce)argmaxargminr  zindex, r  r  rf  welford_reducewelford_combinec              3  n   >#    U  H*  nTR                   R                  TR                  UTS 9v   M,     g7f)r8  N)r  r  r  )r   r  r9  r   s     r|   r   r[  =
  s6      #" HH%%dllE%G"s   25r   = tl.full(r  r  _indexrt  _next, z_next = triton_helpers.z%imum_with_index(
                    z(index
                )
                r  _nextrw  _max_sumz, float('-inf'),  = tl.zeros(z
                    zG_next = triton_helpers.online_softmax_combine(
                        z+
                    )
                    z.to(tl.int8)zif HAS_RSPLIT:z_bval = _val_bvalrF   r   c              3  B   #    U  H  n[        U[        5      v   M     g 7frs   )r  rM  r  s     r|   r   r[    s     LAz!%677s   )r  r  rp  )r  rL   r   rL   )r  r   rj  r   r   r   )r$  r   r  r   rj  r   r   r   )r   rL   )Qpytreetree_leavesr9  tree_mapr   ru   r  rv   r?  r   rG  r  r   rA  r  r<  r  _map_tuple_or_scalarr  r>  r  reduction_cacherE  rC  r  r   r   r  r   	Reductiondefault_valuerW   r  r  r  rD   r  r  r   r  r  rZ  r  welford_reduce_fallbackr   _welford prepare_softmax_twopass_fallbackrL   namedvardefault_accumulatorr  r   rA  select_index_dtypeiinfor  ra  r   rM  r8   r   r  %online_softmax_reduce_final_reductionget_reduction_combine_fnr   r:  r,  r-  rN  r.  r7  rM  *codegen_cooperative_reduction_peer_combinewelford_reduce_final_reductionr0  r  rO  rU  r  rM  r  )9r   r9  r  rm  r  rX  rN  original_dtypesmasksreduction_range_prefixrq  ry  r  acc_typetorch_acc_typer$  r  r  r  dmasked_valueaccumulator_dtypeaccumulator_indexmeanm2weightaccumulatorr  accumulator_maxaccumulator_sum
result_max
result_sum
combine_fnupdatedaccumulator_casted_strrj  r3  bufpeer_valpeer_idxresult_mean	result_m2result_weight	peer_meanpeer_m2peer_weightpeer_maxpeer_sumpeersrJ  
orig_dtyper  r  r!  rn  rx  r  s9   `` `                                               @@@@@@r|   	reductionTritonKernel.reduction	  s   	 170B0B50IJ0I990IJ4M_MMM++Iu}}EI''u}}=E$$$$MD<L<LMM% u??LL)!%!1!1"!5!<!<Q!? ,,.))
 
 %%'$*A*AA		 '	 		 	4
	5
	5 
	5 '	
	5
 
	5	 	 6	00088++I66"9-))4((///?
) *
 *
  

 zz% 	A
 $$$ll00KG//wGG  !88 E5))>A%>QR>QdaAq 1>QR*5':!55$%HH$K$K$M!$'HH%%*+A*B',W^_/ & %! &+e<^LLL*l<M $5
 #33--!%!4!4"NE:xQV"J "&!=!=eU!KJ#44!,9999%1"r6" #!%dBU"# 
  #:: "BB5%P
!,<<<<!XX..LL#DLL#l2CTJ&,, / 
 ((++a
|,<N+SKll66~yQG//wGGgu--		##"m;t/B/B/D.ERyPRS[R\\]^ !55&'
|6$:!"mm>>@		##()T5H5H5J4K2{{;/334Bt7H7H7U6VVWY &+e<^L##W%6$77Nwi X M$5#6brBXAY ZS{m5,A;!O P Q"#3z5F4Gu2MO`'a&b c  **JEV &n55!00z8U
  #::$%j\"6$%j\"6 		##&'{43F3F3H2IIZ[cZddef 		##&'|D4G4G4I3J"XJVWX ##$%W_,= >()O+<BugRH\H\G] ^ ##$%S6Gu4M)_(` a$%S6Gu4M)_(` a (
!XX__5_9
!GG**##
  88S
$[%8&&"m3z';'G&HI 

* 1<}L-I*"5e"<K*..J.#	 +..J[AQSW %%%ll66~yQG#--/J..0D0DE./((6 F
 !55&&00!l(4+@+@J<tAT+U*VW  JJ!l%()W #mm>>@JJU[[-E-I-I   4 4j(HU%n55%)99998B5Y KK$Y/AJ	
 II$Y/AJ
 #MM!$Y/AJ
 33((!
  #::)3&
JJJ 0 ;WQZ  JJ 0 ;WQZ ::(( GG 0 ;W '((#j/5$ .8  +j%((LLLLLL""))*5 !LL?+q000"%j/O"Cz?c/&::::#&z#CZ!---99
***44%s3%t,?
,K+LAN $D"  j*;<<<<""&&z2 ?1#55&q)555&&00!l#j\6I/Z[J\6]5^^_` k Kd  Ss   p1p6c                   U R                  XU5      nU R                  XU5      n[        S5       Vs/ s H%  n[        U R                  R	                  US95      PM'     snu  pxUR                  SU SU SU SU SU S[        R                   SU SU R                  U 5       SU SU R                  U 5       S35        Xx4$ s  snf )Nr   r8  
            r  z9 = triton_helpers.online_softmax_reduce(
                )
            r  )	rT  rE  r   r  r  r   r   r  rM  )	r   r  r  r  r!  r9  r  r  r  s	            r|   _online_softmax_reduce#TritonKernel._online_softmax_reduce"  s     66vPUV66vPUVMRSTX!VX#dhhooEo&B"CX!V
L:, ' !O#4Bse2f>R>R=S TLD11ZLBC DLD11ZLBC D		
 %% "Ws   ,Cc           	     D  ^ ^^ UUU 4S jX#U4 5       u  p#nSU SU SU SU S3	n[        S5       Vs/ s H%  n[        T R                  R                  TS95      PM'     n	nTR	                  SR                  U	5       SU 35        [        U 4S jU	 5       5      n
U
$ s  snf )	z+
Helper to codegen triton_helpers.welford.
c              3  J   >#    U  H  nTR                  TUT5      v   M     g 7frs   )rT  )r   r  r  r9  r   s     r|   r   (TritonKernel._welford.<locals>.<genexpr>7  s*      
+ ((>>+s    #ztriton_helpers.welford(r  r  r   r8  r  c              3  F   >#    U  H  nTR                  U5      v   M     g 7frs   )rM  )r   r  r   s     r|   r   r  ?  s     Xud33E::s   !)rE  r   r  r  r   r   r  )r   r  r  r  r  r!  r9  welfordr  welford_resultsresult_valuess   ``    `    r|   r  TritonKernel._welford3  s    
F+
& ,D6B4r&C5JFKAhOh3txxU;<hODIIo67s7)DEXXX	 Ps   ,Bc                   U R                  5       U R                  -
  nU S3nU S3n	U S3n
U R                  R                  U SU R	                  5        SU S35        U R                  R                  U	 SU R	                  5        SU S35        U R                  R                  U
 SU R	                  5        SU S35        US:X  a=  Uu  pnU R
                  R                  SU S	U	 S	U
 S
U SU	 SU
 SU SU SU S35        O9US:X  d   eU R
                  R                  SU S	U	 S	U
 SU SU SU	 SU
 S35        U R
                  R                  SU SU" U S3U5       SU	 SU" U	 S3U	5       SU
 SU" U
 S3U
5       S35        UnU R                  R                  US9nU R                  R                  US9nU R                  U R                  UUUUU	U
UU5	      $ )z%Helper to codegen a welford reduction_mean_m2_weightr  r  r  r  rt  r  z<_next = triton_helpers.welford_combine(
                    z,
                    z#
                )
                r  z;_next = triton_helpers.welford_reduce(
                    z1, roffset == 0
                )
                z            r  r  r  r8  )r  r>  r  r   r  r  r   r  r  r  rM  )r   r$  rm  r  r  r  r9  r!  r  accumulator_m2accumulator_weightr  r  r  r  r  r  s                    r|   r  TritonKernel.welford_reduceB  s    %%'$*A*AA#E*&<s+ *|73		m<(;(;(='>b
!L	
 			l4+>+>+@*AH:QO	
 			!",t/B/B/D.ERzQRS	
 ..$DfLLW^$4G<N;O P MN#326H5I JF"RD6( + "%5555LLW^$4G<N;O PG2k]"^,<B?Q>R S 	MZ;-u(={KL MC
n-=U+C^ TU V J2D1EU/KM_$`#a b	
 !HHOO%O0	e422""

 
	
r   c
                    U R                  XXgX5      n
X#U/n[        X5       H  u  pUR                  U SU 35        M     X#U4$ )z0Helper to codegen call to triton_helpers.welfordr  )r  r  r   )r   r  r  r  r  r  r  r  r!  r9  rs  result_exprsresult_exprr  s                 r|   r  +TritonKernel.welford_reduce_final_reduction}  sV     vRD#>"%l";KMM[MUG45 #< }44r   c                    U R                  XXVU5      nX#/n	[        X5       H  u  pUR                  U
 SU 35        M     X#4$ Nr  )r  r  r   )r   r  r  r  r  r  r!  r9  rs  r  r  r  s               r|   r  2TritonKernel.online_softmax_reduce_final_reduction  sT     ,,VxeT"/"%l";KMM[MUG45 #< %%r   c                N    U R                   (       a  U R                   S   $ [        $ )NRSPLIT)rJ  r,   r   s    r|   
max_rsplitTritonKernel.max_rsplit  s"    $$X..  r   c                   U R                   S   nU R                  5       (       d  SOSnXBR                  -  U R                  5       -  nU R                  R                  U5      u  pxU R                  R                  SU SU SU R                  U5       S[        U5       SU S	U S
U S3SS9  U R                  R                  U SU S[        U5       S35        U S3$ )z
Generate code to save a [XBLOCK, RSPLIT] temporary workspace, where each thread block writes a different
column.  After the barrier, every thread block loads the completed value so that it can compute the final
value independently.
r6  zxindex < xnumelNrw  z_ws = (r  z).to(tl.pointer_type(z))
                tl.store(z%_ws + (xindex * RSPLIT + rsplit_id), r  r  Tstripz_peers = tl.load(z_ws + (xindex * RSPLIT + rsplit_arange), rsplit_mask, eviction_policy='evict_first', other=triton_helpers.if_mask(rsplit_mask, r  _peers)r@  rr  rA  r  rn  r+  rM  r   r  r>   rN  r   rW   )	r   r$  r9  default_valxnumelr   r'  r)  r*  s	            r|   r  7TritonKernel.codegen_cooperative_reduction_peer_combine  s#    S!(,(@(@(B(B ..(4??+<<!GGPPQWX%%GG9C0A0A)0L/MMbcnotcubv w$%J:,VXY]X^ _  	& 	
 	&&l+J< 8eers~e  eA  ACD	
 V$$r   c                   U R                   (       d   eSU l         U R                  USS9nSU l         U R                  R                  U5      n[        R
                  " 5       nU R                  (       a*  UR                  U R                  XR                  5      5        [        U[        5      (       aZ  U R                  R                  [        UU R                  UUUR                  U5      USUR!                  5       < 35      5      5        O][        U["        5      (       d   eU R                  R                  [        USU SUR$                   SU SUR&                   S	3	5      5        UR)                  5         g )
NFTr  r  r  r^  r  r  r  )r?  r  ra  r  r,  r-  rZ  r.  r/  rN  r  r   r   rM   r  rb  rp  r   r   r   r0  )r   r_  r   r  r  rJ  r3  s          r|   store_reductionTritonKernel.store_reduction  sV    $$$$ %==$=7 $iit$))+
%%$$,,T3G3GH h00  **55  ,+H,C,C,E+HI	 h8888  **uD););(<CwbIZIZH[[\] 	r   c           	     :  ^^^
^^^^ [        5       mTR                  S5        [        5       m
[        S5       V^s/ s H#  m[	        U
UU4S j[        U5       5       5      PM%     nnSR                  S [        R                  R                  U5       5       5      nTR                  SU S35        [        5       mSmS	S
K
Jn  U" 5       m " U
UUUU4S jS[        5      nTR                  5          [        R                  " U" 5       5         U" U6 n	SR                  S U	 5       5      n	TR                  SU	 35        S S S 5        S S S 5        U R                   R#                  TR%                  5       TS9$ s  snf ! , (       d  f       NC= f! , (       d  f       NL= f)Nz@triton.jitr   c              3  V   >#    U  H  nTR                  S T SU 3TU   S9v   M      g7f)rW  r  r8  N)r  )r   nr  dtypesr  s     r|   r   ,TritonKernel._lift_helper.<locals>.<genexpr>  s.     X1#,,QCq}F1I,>s   &)r  c              3  8   #    U  H  n[        U5      v   M     g 7frs   r   r  s     r|   r   r    s     R.Qc!ff.Qrz  zdef {name}():r  r   re   c                  >   > \ rS rSr        SU UUUU4S jjrSrg)+TritonKernel._lift_helper.<locals>.CSEProxyi  c                z   > TSU 3-  m[        TU5      " U0 UD6nTR                  T[        T	U5      " U0 UD6US9$ )Nr  r8  )rt  r  )
r   r_  ra  rV  output_dtyper  dtype_handlerhelperhelper_name	overridess
        r|   _default4TritonKernel._lift_helper.<locals>.CSEProxy._default   sk     4&z)&!   # " #
 ||It,d=f=& $  r   r   N)r_  r   ra  ztuple[Any, ...]rV  dict[str, Any]r   r   )rt   r   r   r   r  r   )r  r   r  r  r  s   r|   CSEProxyr    s-    '6@N r   r  c              3  8   #    U  H  n[        U5      v   M     g 7frs   r  )r   r  s     r|   r   r    s     B'F'rz  return r   )rN   r   rK   rE  r  r   rq  rr  from_iterabler  r^  rf   r&   r7  rD   set_ops_handlerrF  r  r   )r   r  num_argsr  r  ra  	signaturerf   r  outputsr  r   r  r  r  s      ``     @@@@@r|   _lift_helperTritonKernel._lift_helper  sK     !'e 1X
 XhXX 	 
 IIRioo.K.KD.QRR	=267#%	 *P24	 	~ 	$ ]]_a//
;$iGiiB'BBGwwi01 <_
 $$(():k(RRU
J <;__s)   *E60F2E;>F;
F		F
Fc                $  ^ ^ T R                   (       d   eT R                  (       a   S5       e[        S T R                   5       5      nT R	                  U5        [        U5      nT R                  (       a   S5       e/ n/ n[        S U 5       5      n[        R                  " T R                  R                  T R                  5      nT R                  U[        U5      U5      nT R                  5       T R                   -
  n	[#        X15       GH3  u  pT R                  R                  T R                  U
 S[%        U5       S3US9nT R                  R                  T R                  SU S	T R'                  5        S3US9n
UR)                  U
5        [+        U5      nT R,                  (       a  M  T R                  R/                  US9nT R1                  5       nS
US'   SS	R3                  U5       S3nUR4                  (       a  SOSnT R6                  R9                  U SU S	U S	U S35        UR)                  U5        GM6     S mUU 4S jnU" ST" U5       SU	 S	U S3UUU5      nT R,                  (       d  U Vs/ s H!  nU" SU S3[;        UR<                  5      S9PM#     nnU" [        U5      [        U5      5      nU" [        U5      U5      n[#        UU5       VVs/ s H  u  nnU" SU S	U S3UR<                  S9PM      nnn[#        UUU5       H+  u  nnnT R                  R9                  U SU S	U S35        M-     OUnU H*  n[?        U[@        5      (       d   e[        U5      Ul!        M,     [        U5      $ s  snf s  snnf )NTODOc              3  >   #    U  H  oR                    S 3v   M     g7fr]  r^  rx  s     r|   r   $TritonKernel.scan.<locals>.<genexpr>"  r_  r`  z(ops.scan not supported inside ops.maskedc              3  8   #    U  H  n[        U5      v   M     g 7frs   r@   r   r9  s     r|   r   r  *       Fve*511vrz  rp  r  r8  r  r  r  r.  r  r  zfloat('nan')z-1r  c                2    SR                  S U  5       5      $ )Nr  c              3  *   #    U  H	  o S 3v   M     g7f,Nr   r   r  s     r|   r   1TritonKernel.scan.<locals>.csv.<locals>.<genexpr>L       <VEgQKVr  r   rs  s    r|   csvTritonKernel.scan.<locals>.csvK      88<V<<<r   c                4  > [        U5      n[        U5       Vs/ s H  oP SU SU 3PM     nn[        U4S jU 5       5      (       a,  U Vs/ s H  nTR                  R	                  U5      PM      sn$ U Vs/ s H  nTR                  R                  US9PM     n	nTR                  R                  T" U	5       SU  35        [        X5       H-  u  pU(       a  X*l	        TR                  R                  Xz5        M/     [        U	5      $ s  snf s  snf s  snf )Nr  c              3  Z   >#    U  H   nTR                   R                  U5      v   M"     g 7frs   r  containsr   r  r   s     r|   r   :TritonKernel.scan.<locals>.cse_multiple.<locals>.<genexpr>Q  #     LI488$$Y//   (+r8  r  )r  rE  r  r  r  r  r  r   r  r   r  r  )r  rs  r  r  r  r  
cache_keysr  _dtyperesult_varsr$  r"  r   s              r|   cse_multiple'TritonKernel.scan.<locals>.cse_multipleN  s    FA;@8D8aF"QCr%18JDLLLLAKLIY/LLGMNvV488???8vKNLL""{#$Cv. *-[)E%
+0(Y3 *F %% ELNs   D%D9#Dztl.associative_scan((r  ztriton_helpers.select_one((z1), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)ztl.where(roffset > 0, z = tl.where(roffset > 0, )"r?  rZ  r   rG  r  r   rA  r  r  r  r  r  r  r  r  r  r>  r  r:  r  r  rE  r  r  rQ  r   rG  r  r   r@   r9  r  rM  r   )r   r  r  rs  r  broadcasted_valuesaccumulatorscse_computecombine_helper_fnr!  r  r9  value_dtyper  r  reduced_sizer  r0  partial_scan_varspartial_scan_varpartial_reduce_vars	accs_nextfull_scan_vars	full_scanpartial_scanr/  acc_nextpartial_reducer$  r"  s   `                            @r|   scanTritonKernel.scan  s    $$$$--5v5-MD<L<LMM% u??N$NN"FvFF''(9(94<<H --j#f+vN%%'$*A*AA/LE((++'1%89; , K
 HH%%";-r$2E2E2G1HJ & E
 %%e,&u-H,,,"hhooEo:#335#&R !"499\#:";1=,1,C,C.		##"m;|nBwir(STU ##K05 08	=	& )#C(:$;#<CuBGXFYYZ[	
 (( ):#
 ):$	 12B1CCtu-.>.D.DE ):   # #5#6>Q8RSI'l(;=NON 03>CT/U
 0V+I|	 ,YKr,qI&,, 0V   :=<)<:5+~ &&"m#<XJbHXXYZ: ,K%Jj*;<<<<#-e#4J  & [!!;#s   ?(N%%Nc                L  ^ ^ T R                   (       d   eT R                  (       a   S5       e[        S T R                   5       5      nT R	                  U5        [        U5      nT R                  (       a   S5       eT R                  (       d   S5       e[        R                  " T R                  R                  T R                  5      nT R                  5       T R                  -
  n[        S U 5       5      n[!        U5      [!        U5      :X  d   e[#        U5       VV	s/ s H#  u  pU" SU	 ST R%                  5        S3X   S	9PM%     n
nn	S
 mUU 4S jnT R                  S   R&                  (       d   eT R)                  T R                  S   5      (       a  SOSn[!        U5      S:X  a/  SU
S    SU
S    SU SU SU SU S3nU" U[!        U5      XQ5      nO[+        S5      e[-        X5       H  u  nnX_l        UR0                  Ul        M     [        U5      $ s  sn	nf )Nr  c              3  >   #    U  H  oR                    S 3v   M     g7fr]  r^  rx  s     r|   r   $TritonKernel.sort.<locals>.<genexpr>  r_  r`  z(ops.sort not supported inside ops.maskedz3ops.sort is only supported in persistent reductionsc              3  8   #    U  H  n[        U5      v   M     g 7frs   r  r  s     r|   r   rE    r  rz  r  r  r  r8  c                2    SR                  S U  5       5      $ )Nr  c              3  *   #    U  H	  o S 3v   M     g7fr  r   r  s     r|   r   1TritonKernel.sort.<locals>.csv.<locals>.<genexpr>  r  r  r   r!  s    r|   r"  TritonKernel.sort.<locals>.csv  r$  r   c                4  > [        U5       Vs/ s H  o@ SU SU 3PM     nn[        U
4S jU 5       5      (       a,  U Vs/ s H  nT
R                  R                  U5      PM      sn$ [        U5       Vs/ s H  nT
R                  R	                  X4   S9PM      nnT
R
                  R                  T	" U5       SU  35        [        Xu5       H-  u  pU(       a  X(l        T
R                  R                  Xh5        M/     [        U5      $ s  snf s  snf s  snf )Nr  c              3  Z   >#    U  H   nTR                   R                  U5      v   M"     g 7frs   r'  r)  s     r|   r   :TritonKernel.sort.<locals>.cse_multiple.<locals>.<genexpr>  r+  r,  r8  r  )rE  r  r  r  r  r  r   r  r   r  r  )r  r  r  r  r  r-  r  r/  r$  r"  r   s            r|   r0  'TritonKernel.sort.<locals>.cse_multiple  s    ;@8D8aF"QCr%18JDLLLLAKLIY/LLEJ1XNX488???;XKNLL""{#$Cv. *-[)E%
+0(Y3 *F %% ELNs   D%D7%Dr.  r   rnumelr   ztriton_helpers.sort_with_index(r   rF   z	, stable=z, descending=zUnhandled sort)r?  rZ  r   rG  r  r   rA  r  r  r  r  r  r  r  r>  r  r  r  r  rB  ri  r  r  r   rQ  )r   r  rs  stable
descendingr  r4  r!  r  r  r2  r0  rO  r  r/  r$  	input_varr"  s   `                @r|   sortTritonKernel.sort  s)    $$$$--5v5-MD<L<LMM% u??N$NN"(( 	
A	
(  ''(9(94<<H%%'$*A*AAFvFF6{c&k)))
 &f-	
 . "5'D,?,?,A+B!DFI .	 	 
	=	& #00002243C3CB3GHHhv;!12DQ2G1HK]^_K`Ja b82cU)F8=AO  'tS[%HK !122%(%=!J	#(  ) 0 0J &> [!!Q
s   "*H c                   U R                   (       dV  U R                  (       dE  U R                  (       d4  U R                  (       d#  U R                  (       d  U R
                  (       d  gU R                   Vs/ s H  oR                  (       d  M  UPM     nnU R                  (       GaW  [        U5      S:  GaG  [        U5       H  u  p1U R                  R                  US9   UR                  nU R                  (       a  SOSnU R                  (       a  SOU S3nU R                  R                  SU S	U S
U S
UR!                  5        S3	5        SSS5        U R                  R                  US-   S9   U R#                  XR                  5        SSS5        M     U R                  R                  [        U5      S9   U R%                  U R                  5        U R                  R'                  U R                   5        U R                  R'                  U R                  5        U R                  R'                  U R                  5        U R                  R'                  U R                  5        SSS5        [)        / [        U5      Q5       GHl  u  p1U R                  R                  US-   S9   U R*                  UR,                     R/                  5        H  u  pxU[        U5      S-
  :  ar  X#S-      n	U R*                  U	R,                     U   n
[0        R3                  U	5      n[5        U	R6                  U5      n[9        X5       VVs/ s H  u  pXU-  -
  PM     nnnU R                  R                  [;        U R<                  U   U SU S
[>        R@                  RC                  U5       S35      5        M     SSS5        U RD                  RG                  U RH                  5        URK                  5         GMo     OU R                  R'                  U R                   5        U R                  R'                  U R                  5        U R                  R'                  U R                  5        U R                  R'                  U R                  5        U R                  R'                  U R                  5        U R                  (       ai  U R                  (       d  U R
                  (       aG  U RL                   S3nU R                  R'                  SU S3SS9  U RN                  RQ                  5         U R                  R'                  U R
                  5        U R                   RS                  5         U R                  RS                  5         U R                  RS                  5         U R                  RS                  5         U R                  RS                  5         U R
                  RS                  5         gs  snf ! , (       d  f       GN= f! , (       d  f       GMX  = f! , (       d  f       GN= fs  snnf ! , (       d  f       GN= f)z
Concat output code from index_code, loads, compute, stores,
suffix into self.body.

For pointwise kernels, this is called just once at the end.

For reduction kernels, this generates a loop over the reduction
axis.
Nr   )r   rsplit_startr  
rsplit_endr  zfor zoffset in range(r  zBLOCK):rF   z = tl.advance(r  z + tl.program_id(1)zR
                if HAS_RSPLIT:
                    triton_helpers.x_grid_barrier(ru  Tr  )*indexing_coder  r+  r  rM  rN  rG  rw  r?  r  r  r  r7  r<  rZ  r   r   r|  r  r   rD  rT  r   r:  r   r   r   r  r  rM   rS  rD   r  r  r  
invalidaterO  cache_clearrm  rn  r/  clear)r   r   
loop_treeslevelr<  
loop_startloop_endr  advancement	prev_treeprev_advancement
prev_blockprev_num_itercurprevsem_ptrs                   r|   codegen_bodyTritonKernel.codegen_body  s    zz{{||%%##'+'7'7H'7t<<d'7
H   S_q%8(4YY%%U%3![[F373M3MSVJ(,(B(B6(RWHX  II''vh&6zl"XJbQWQ]Q]Q_P``gh 4 YY%%UQY%788yyI 87  5 !!Z!9..tyy9		  !3!34		  ,		  .		  - :  ((@)J*?(@AYY%%UQY%7262K2K		3eg3.	 !3z?Q#66(219(=I/3/H/H )0'0), *7)E)Ei)PJ,3IOOZ,PM 25[1S+1SIC !$]&: :1S ( +
 		++( $ 8 8 C#,+^I;bI^I^_jIkHllm n!3 84 ##D$:$:;  "9  B< IIT//0IITZZ(IIT\\*IIT[[)		//0%%""d&:&:--..ABGII33:) <    66BBD		--.  "

$$&""$] I 43 87 :9,+ 87sR   6W4W4!A0W95X?B0X/BX6
X0AX69
X	
X	
X-0X66
Y	c                   / nU R                  5       (       Ga)  / nU R                  SU/ 5        U GH  n[        U[        5      (       a  UR	                  [        U5      5        M5  [        U[        5      (       aM  UR	                  [        [        R                  R                  R                  UR                  5      5      5        M  [        U[        R                  5      (       aC  UR	                  [        [        R                  R                  R                  U5      5      5        M  [        S[        U5       35      e   U$ )Nr   z!Unsupported numel argument type: )r  add_numel_to_call_argsr  r  r  r   rb   rD   r  r  	size_hint
inner_exprr   r  r1  r  )r   ra  
numel_argsrW  s       r|   kernel_benchmark_extra_args(TritonKernel.kernel_benchmark_extra_args+  s    !!+-J''J;!c3''KKC)_55KKAGG$4$4$>$>s~~$N OPUZZ00KKAGG$4$4$>$>s$C DE$'Hc%TUU " r   c                z   [        5       nU R                  R                  5       u  p4pVUR                  / SQ5        UR	                  5          [
        R                  " 5       n/ n[        XE5       GH  u  pS[        U5       3n[        R                  R                  U	5      nU(       a  UR                  U S[        R                  R                  R                  UR                  5       5       S[        R                  R                  R                  UR!                  5       5       SUR#                  5        SUR%                  5        S3
5        GOU	[        R                  R&                  ;   a  [        R                  R&                  U	   nUR                  U S[        R                  R                  R                  UR)                  5       5       S[        R                  R                  R                  UR+                  5       5       SUR,                   SUR.                   S3
5        GO[1        U
[2        5      (       a\  [        R                  R                  R5                  U
R6                  5      nSU
R8                  ;   a  S	nUR                  U S
U 35        O[1        U
[:        5      (       ay  [        R                  R=                  5       n[        R                  R                  R5                  U
R                  5      nUR                  U SU SU SU
R.                   S35        O[?        SU	 35      eURA                  U5        GM     URC                  U RE                  5       5        UR                  SSRG                  U5       S35        S S S 5        UR                  / SQ5        [        R                  R=                  5       nURH                  nUR	                  5          UR                  S[        R                  RJ                  RM                  U5       S35        UR	                  5          UR                  [        R                  RJ                  RO                  U5      5        SU 3nUR                  U SU S35        UR                  [Q        [R        RT                  5       SU S35        S S S 5        S S S 5        UR                  / SQ5        UR	                  5          UR                  S[        R                  RJ                  RM                  U5       S35        UR	                  5          UR                  [        R                  RJ                  RO                  U5      5        UR                  S[Q        [R        RT                  5       S35        S S S 5        S S S 5        UR                  / SQ5        UR	                  5          UR                  S5        UR                  S5        UR                  S5        UR                  S5        UR                  SU 35        UR                  S5        UR                  S5        S S S 5        U$ ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       U$ = f)N)r   r   zdef get_args():arg_z = rand_strided(r  z
, device='z	', dtype=r  r  r   r  z = torch.zeros(z*Don't find the buffer or const tensor for r	  r  )
rs  zdef call(args):zwith r  streamz = get_raw_stream(z.run(*args, stream=)rs  rs  z def benchmark_all_configs(args):z.benchmark_all_configs(*args))rs  rs  zif __name__ == '__main__':z<from torch._inductor.runtime.benchmarking import benchmarkerr   zargs = get_args()z:ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)z	num_gb = zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s"))+rN   ra  python_argdefs
writelinesr7  rq  rP  r  r  rD   r  try_get_bufferr   r  
size_hintsget_size
get_stride
get_devicer  	constantsr  rI  devicer9  r  rS   rl  r+  r_  rU   get_current_device_or_throwKeyErrorr  extendro  r   r   
device_opsdevice_guard
set_devicer   r9   KERNEL_NAME)r   num_gbrP  _argdefs	call_argsr  r  name_cnt	var_namesarg_namearg_sigvar_namer  const_tensorsymval_hintr}  rP  current_devicer   stream_names                       r|   codegen_kernel_benchmark%TritonKernel.codegen_kernel_benchmark;  s   !,0II,D,D,F)Y56]]_ (HI%(%>!!$x.!12gg,,X6$$#*$4QWW5E5E5P5PQTQ]Q]Q_5`4aacdedkdkdtdtdd  AD  AO  AO  AQ  eR  dS  S]  ^a  ^l  ^l  ^n  ]o  ox  y|  yF  yF  yH  xI  IJ  K !2!22#$77#4#4X#>L$$#*$4QWW5E5E5P5PQ]QbQbQd5e4ffhijipipiyiy  jE  jE  FR  FY  FY  F[  j\  i]  ]g  ht  h{  h{  g|  |E  FR  FX  FX  EY  YZ  [  11"#''"2"2"<"<W\\"JK
 %4&'$$z[M%BC66WW@@BFGG,,66w}}EE$$#*OE7*VHIV]VcVcUddef #DXJO    *A &?B T==?@wtyy';&<A>?K N 	9:<<>$$]]_uQWW%7%7%D%DU%K$LANO  GG&&11%8 !'ug.  K=0B5'!KL  ;22344G}TUV !  	JK]]_uQWW%7%7%D%DU%K$LANO  GG&&11%8   c+"9"9:;;XY	 !  	DE]]_N R 01L y12=>N   g _X ! _  ! _ _  sf   MY&AY73BY%6Y7)AZ6A%Z	ZA;Z+
Y"%
Y4	/Y77
Z	
Z	Z
Z(+
Z:c                    [         R                  " SR                  [        R                  R
                  R                  S5      5      5      $ )Nzl
            from torch._dynamo.testing import rand_strided
            {}
            import torch
        get_raw_stream)textwrapdedentrb  rD   r  r  import_get_raw_stream_asr   s    r|   imports_for_benchmark_kernel)TritonKernel.imports_for_benchmark_kernel  s:     F177%%>>?OPQ
 	
r   c                    U R                   (       a  gU R                  (       a  gU R                  (       a  U R                  (       d   egU R                  (       a  gg)NrJ  rZ  r  r  	pointwise)rJ  rZ  r  r?  r   s    r|   _get_heuristicTritonKernel._get_heuristic  sD    !''*&&(((()""r   c                    [         R                  R                  R                  5       [         R                  " 5       [
        R                  [
        R                  [
        R                  R                  [
        R                  [
        R                  [
        R                  [
        R                  [
        R                  [
        R                  R                  [
        R                  R                   [
        R                  R"                  S.n [         R$                  R&                  b  SU S'   [
        R(                  " 5       (       a  SU S'   [
        R*                  (       aL  [
        R*                  U S'   [
        R,                  U S'   [
        R.                  U S'   [
        R0                  U S'   [
        R2                  (       a9  [
        R2                  U S	'   [
        R4                  U S
'   [
        R6                  U S'   U $ )N)backend_hash$are_deterministic_algorithms_enabledassert_indirect_indexingautotune_local_cacheautotune_pointwiseautotune_remote_cacheforce_disable_cachesdynamic_scale_rblockmax_autotunemax_autotune_pointwisemin_split_scan_rblockspill_thresholdstore_cubinTis_hipr  profile_bandwidthprofile_bandwidth_regexprofile_bandwidth_output/profile_bandwidth_with_do_bench_using_profilingcoordinate_descent_tuning coordinate_descent_search_radius'coordinate_descent_check_all_directions)ru   r  _tritontriton_hash_with_backendr  r   r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )inductor_metas    r|   inductor_meta_common!TritonKernel.inductor_meta_common  sx    "KK//HHJ494^4^4`(.(G(G$*$?$?"(--"B"B%+%A%A$*$?$?$*$?$?"//&,&C&C%+]]%H%H%}}<<!==44
 ==(&*M(#)-M+&##171I1IM-.7=7U7UM348>8W8WM45FF KL ++00 56 77 <= >> CD r   c                   ^ ^! [        5       n0 nU R                  R                  5        H  u  pE[        U5      (       a  U R                  (       d  M(  [
        R                  R                  R                  U5      n[        U[        [        R                  45      (       d  SnO[        [        U5      5      nXsU'   M     Uc  UR                  [        5       5        [
        R                  R!                  5       R"                  nUS:X  a  UR                  S5        OUR                  S5        [$        R&                  (       a  UR                  U R)                  5       5        U R*                  R-                  5       u  m n	m!n	[/        T!5       H  u  p[        U[0        5      (       d  M  [3        [        R4                  UR6                  5      nU[
        R                  R                  R8                  ;   d  Mj  [1        UR:                  [
        R                  R                  R8                  U   5      T!U
'   M     [=        5       nU R>                   GH'  nXR*                  R@                  ;   a(  URC                  U R*                  R@                  U   5        XR*                  RD                  ;   am  U[
        R                  RF                  ;  aO  XRF                  ;  a@  URC                  [3        [H        U R*                  RD                  U   5      RJ                  5        XR*                  RL                  ;   d  M  U R*                  RL                  U   n[        U[N        5      (       a   eURC                  U5        GM*     [Q        T T!5       HX  u  nn[        U[R        5      (       d  M  URT                  [V        RX                  :X  d  M=  URC                  UR:                  5        MZ     [[        U5      nU R]                  5        H[  n[1        UR^                   S3UR`                  5      nT!Rc                  U5        T Rc                  [e        UR:                  5      5        M]     U U!4S jnU Rf                   HY  nURh                  (       a  U Rj                  (       a  M'  URl                  c  M6  U" UR^                  Ro                  5        S35        M[     U Rp                  (       a  U" S5        [s        T!U Rt                  T S	9nU[v        Rx                  " [
        R                  R!                  5       5      0 S
.n[
        R                  Rz                  =(       d    [
        R                  R|                  nU R                  5       R                  [        U R                  5      [        [        R                  5      UUU R                  U R                  U R                  S.U R                  5       EnU R                  (       a  U R                  US'   U Rp                  (       a  U Rj                  US'   S n[$        R&                  (       d  [$        R                  (       a  U R                  5       S-  nUUS'   [        T!5      /US'   [        T!5       H  nSUS   T!U   R:                  '   M     UU lO        U R                  5         U R                   H%  nUR                  S5        UR                  U5        M'     U R                  (       a5  SU R                  5        SU R                  R$                  < SU< SU< S3	nOU R                  (       a>  U R                  R                  5       nSU R                  5        SU< SU SU< SU< S3nO^Sn[        U5      S:X  a  [        [        T!5      5      S:X  a  SnOSnSU R                  5        SU< SU S U< SU< S!U R                   S3nUR                  U5        UR                  S"U=(       d    [        [        R                  5       S#SR                  S$ T  5       5       S%35        UR                  5          U R                  U5        U R*                  R                  5        H  u  nnUR                  U S&U 35        M     UR                  U R                  5        S S S 5        [$        R&                  (       a   UR                  U R                  U5      5        UR                  5       $ ! , (       d  f       NS= f)'Ni    cpuz"triton_helpers.set_driver_to_cpu()z"triton_helpers.set_driver_to_gpu()r  c                   > [        5       (       a  TR                  [        U 5      5        TR                  [        U SS95        g )NT)is_constexpr)r?   r  rJ   rH   )r  argdefsr  s    r|   add_constexpr_arg6TritonKernel.codegen_kernel.<locals>.add_constexpr_arg3  s2    -//  h!78NN78$?@r   r   r  )
size_dtyper  )r  r}  r|  )	grid_typerW  kernel_namemutated_arg_namesoptimize_memr;  num_loadnum_reductiontiling_scoresr  g    eAkernel_num_gbconfigsrF   r|  r   z$
                @triton_heuristics.z(
                    config=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            z!(
                    size_hints=z%,
                    reduction_hint=r   r?  ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,r  zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r  c              3  @   #    U  H  oR                  5       v   M     g 7frs   )	full_namer  s     r|   r   .TritonKernel.codegen_kernel.<locals>.<genexpr>  s     Cc[bVWKKMM[br  r  r  )brN   r@  r:  r:   r?  rD   r  r  symbolic_hintr  r  r   r[  r.   r   r   r~  r  r   benchmark_kernelr  ra  ru  r  rS   r	   r   r+  inv_precomputed_replacementsr_  r   	mutationsinput_buffersr  r*  removed_buffersrO   
inner_nameoutput_buffersrR   r  rU   	zero_moderV   ZERO_ON_CALLr   r  r<  r  r  rH   rG  rB  r  
tensor_dimr   rZ  ra   r  r*   rQ  is_inferenceis_backward_get_grid_typert   setrW  r   r9   DESCRIPTIVE_NAMEr;  r  r  r  r  r  estimate_kernel_num_bytesr]   r^   rX  rh  rF  r   rJ  r  rA  get_reduction_hintr  r_   r  r  r   r7  codegen_static_numelsaliasesr  r  r   )"r   r_  coderx  r<  r  
numel_hintrl  device_typer  r  rW  r  mutated_argsmutationmutation_argargnamer   sizeargr  triton_meta_signaturerX  r  r  r  arg_numr  heuristics_linereduction_hint	tile_hintoldnewr  r  s"                                   @@r|   codegen_kernelTritonKernel.codegen_kernel  sZ   
![[..0MF"6**43H3H))77>Jj3*>?? !	+C
O<	!*v) 1, <KK134''==?DDKe#@A@A&&D==?@#'99#;#;#= Iq	*FA#w'' ellCHH5QWW--JJJ#*!''"2"2"O"OPV"W$IaL + )3H99222  !8!8!BCII555AGG$;$;;$8$88  )B)B8)LMXX 99333#yy77A%lJ????  . '6  3LGS3--MM%6%C%CC  . 4 l+++-DU3TZZ@GW%NN77<<01 .	A $$D  T%>%>&!2!2!4 5U;< % %%h' 1$"2"2G!
 /&--agg.Q.Q.ST'
 ww++Bqww/B/B ,,.77!$"5"56{;;<!-(!//
 '')
 -1-?-?M/*%%484M4MM01""f&>&>335;F-3M/*"+I"6!7I +95G@AK$Yw%7%<%<= 6 '++FNN2KK , #$$($7$7$9#: ; --447 8!!, 0##0"3 4O ""!]]==?N#$$($7$7$9#: ;  *~ .$$2#3 4!!, 0##0"3 4	O I:!#/	:;q@ <I =I#$$($7$7$9#: ;  *~R	{ ;!!, 0##0"3 4))-)A)A(B C	O 	O$473{6678$))Cc[bCc:c9ddfg	
 [[]&&t, II--/S#c#/0 0KK		"	  ""KK55f=>}} ]s   A'c??
dc                   [         R                  R                  R                  U 5      n [	        U [
        R                  [        45      (       a  [        U 5      n[        U5      nU$ Sn[         R                  R                  R                  X5      (       dI  US:  a  [        SU  35      eUS-  n[         R                  R                  R                  X5      (       d  MI  U$ )N   i @  z!Failed to find static RBLOCK for r   )rD   r  r  simplifyr  r   r[  r  r.   statically_known_leqr1  )rO  rN  s     r|   _get_persistent_RBLOCK#TritonKernel._get_persistent_RBLOCK  s    !!**62fu}}c233f+C!#&C 
 Cgg&&;;FHH?$'H%QRRq gg&&;;FHH 
r   c                P     [         R                  U 5        g! [         a     gf = f)NTF)rE  r  r1  )rO  s    r|   has_persistent_RBLOCK"TritonKernel.has_persistent_RBLOCK  s*    	//7 		s    
%%c                   S	S jnU R                    GHt  nUR                  (       a  U R                  (       ai  [        R                  R
                  R                  UR                  5      nU" U5      (       a)  UR                  UR                   S[        U5       35        UR                  (       a  U R                  (       a  U R                  (       a1  U R                  U R                  UR                  5      5      nSU S3nOU R                  UR                  5      nUR                  UR                  R!                  5        SU 35        UR                  S:X  d  GMO  U R"                  (       d  GMc  UR                  S5        GMw     g)
ay  
We get a small speedup from hard coding numels if they are static.

This code stomps on the passed-in values by writing an constant to the top of the kernel.

In a kernel like:
def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):

We would add
xnumel = 4096
r0_numel = 768

After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
knows that its a static numel, as that you just plop a constant into the kernel.
c                B    [        U [        R                  [        45      $ rs   )r  r   r[  r  )r+  s    r|   is_static_integer=TritonKernel.codegen_static_numels.<locals>.is_static_integer  s    dU]]C$899r   znumel = z*triton_helpers.constexpr_next_power_of_2((z + RSPLIT - 1) // RSPLIT)zBLOCK: tl.constexpr = r6  zXBLOCK: tl.constexpr = 1N)r+  r   r   r   )rG  rB  r?  rD   r  r  r  r  r   r<  r  r  rZ  rG  r  r  r   r;  )r   r  r  r   simplified_tree_numelr  rN  s          r|   r  "TritonKernel.codegen_static_numels  s   $	: $$D$$(=(=()(8(8(A(A$**(M%$%:;;NNdkk](3?T;U:V#WX  T%>%>-- JJt';';DJJ'GHEFugMfgC55djjAC$++"3"3"5!66LSERS{{c!dmmm9: %r   c                   [        U R                   Vs/ s H  n[        UR                  (       + 5      PM     sn5      nU R                  (       a  US:X  d   e[
        R                  $ US:X  a  [
        R                  $ US:X  aN  [        [        U R                  U R                  5      5      (       a  [
        R                  $ [
        R                  $ US:X  a  [
        R                  $ [        SU 35      es  snf )NrF   r   r   z"Unsupported number of dimensions: )r  rG  r  rB  rZ  r'   CooperativeReductionGridGrid1Dr   r   re  Grid2DWithYZOverflowGrid2DGrid3Dr1  )r   r   r  s      r|   r  TritonKernel._get_grid_type  s    8H8HI8H***+8HIJ%%6M6$===!V$+++!V3t22D4D4DEFF(===$+++!V$+++=aSABB Js   $C6c                   U R                    H  n[        UR                  [        R                  [        R
                  45      (       a  UR                  nO)[        R                  R                  R                  X5      nUR                  (       a  U R                  (       d  M  UR                  U5        UR                  [        U5      5        M     g rs   )rG  r  r  r   r[  r   rD   r  wrapper_codegenerate_numel_exprrB  r?  r  r  )r   r_  r  	arg_typesr   r+  s         r|   rk  #TritonKernel.add_numel_to_call_args  s    $$D$**u}}ell&CDDzzww++??K$$(=(=(=  &  d, %r   c                   [         R                  R                  nUR                  5         U R                  R                  5       u  pEpFU R                  XU5        U R                  R                   H  nUR                  U5        M     UR                  UUSUU R                  S9  [        U R                  R                  5       H  nUR                  U5        M     g )NT)r   r  rX  )rD   r  r	  write_triton_header_oncera  ru  rk  workspace_argsgenerate_workspace_allocationgenerate_kernel_callrX  rD  generate_workspace_deallocation)r   r_  r  wrapperr  r  r  wss           r|   call_kernelTritonKernel.call_kernel  s    ''&&((*%)YY%=%=%?"a##DY?))**B11"5 + 	$$(( 	% 	
 499334B33B7 5r   c                   [         R                  R                  nU R                  R	                  5       u  p#pB[        X45       H  u  pV[        U[        5      (       d  M  [         R                  R                  (       a  UR                  SU SU S35        MU  SU S3nUR                  U5        SU S3nUR                  U5        M     g )Nz:AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_check_inf_and_nan("z", z));zassert not z.isnan().any().item()z.isinf().any().item())
rD   r  r	  ra  ru  r  r  rT   cpp_wrapperr   )r   r  r  r  arg_signaturesrW  arg_signaturer  s           r|   codegen_nan_checkTritonKernel.codegen_nan_check  s    ''&&*.))*B*B*D'n"%i"@C-3377&&%%TUXTYY\]`\aade )-BCD%%d+(-BCD%%d+ #Ar   c                    [        U0 UD6$ rs   )rM  )r   ra  rV  s      r|   create_cse_varTritonKernel.create_cse_var.  s     $1&11r   c                   UR                    SU R                  U R                  UR                  5      5       3nUR                  R
                  (       a  U R                  R                  U5        g U R                  R                  U5        g r  )	r_  rG  r  r+  rootrw  rX  r   r  )r   entryr  s      r|   codegen_iteration_ranges_entry+TritonKernel.codegen_iteration_ranges_entry1  sd    **SD,@,@,L!M NO::((. II%r   c                >   UR                   c   eU R                  UR                   5      nU R                  nUS:w  a  SU S3OSnU R                  (       a'  U R                  (       a  UR
                  (       a  U S3nSUR                  R                  5        SU U 3$ )Nr  rp  r  r   z + rsplit_startztl.arange(0, zBLOCK))r  indexing_size_strr  rZ  r  rB  r<  r   )r   r"  r  r  rL  s        r|   r}  )TritonKernel.iteration_ranges_ranges_code9  s    +++%%e&6&67&&*5*C4}A&&&))""x/Fu||1134F4&IIr   c                ^    U R                   nU R                  5       nS/U-  nSU SU SU S3$ )NrF   r  r  r  )r  r  )r   r"  r  r  r  r  s         r|   iteration_ranges_scalar_code)TritonKernel.iteration_ranges_scalar_codeF  sC     &&&&(sTz$r%;-q99r   c                0   UR                   c   eSUR                    S3nU R                  U5      (       a#  SU SUR                   S-    SUR                    S3nUR                  R                  X"5      nU R                  S:w  a  U S	U R                   S3$ U$ )
Nztl.program_id(r  r  z + tl.program_id(rF   z) * tl.num_programs(r  r  rp  )rk  re  	pid_cacher  r  )r   r"  rM  pids       r|   iteration_ranges_get_pid%TritonKernel.iteration_ranges_get_pidN  s    ~~)))u~~.a0 &&u-- cU+ENNQ,>+??STYTbTbSccefCoo!!#+z)U$t//022
r   c                   UR                   S:H  =(       aq    UR                  (       + =(       aY    U R                  (       + =(       aA    [        R                  R
                  R                  UR                  [        5       5      (       + $ r.  )	rk  has_zdimrZ  rD   r  r  r  r  r-   )r   r"  s     r|   re  #TritonKernel.needs_yz_grid_overflow]  sa    NNa YNN"Y...Y GG$$99%++~GWXX		
r   c                    U R                   (       a   U R                   UR                  5        S3   $ [        UR                  5          $ )Nr   )rJ  r   r+   )r   r<  s     r|   r  TritonKernel.max_blocke  s;    $$'7u%=>>//r   c                   U R                   (       d  gU R                  (       a[  UR                  R                  5        S3U R                  ;   a0  U R                  UR                  R                  5        S3   S:X  a  gO:[        R
                  R                  R                  UR                  S5      (       a  gUR                  (       a-  U R                  (       a  U R                  UR                  5      nO?UR                  S:X  a  U R                  (       a  SnOU R                  UR                  5      nUR                  (       a#  U R                  (       a  X R                  5       -  n[        R
                  R                  R!                  UR                  U5      (       ae  UR"                  S:g  =(       dO    UR$                  =(       d<    [        R
                  R                  R'                  UR                  [)        5       5      $ g)NFr   rF   Tr6  )rI  rJ  r<  r   rD   r  r  r  r  rB  r  r  r;  r  rZ  r  rh  rk  r1  r  r-   )r   r   r  s      r|   ri  TritonKernel._has_constant_maskj  sm   !!DKK$5$5$7#8!>$BSBS!S  DKK$5$5$7#8!>?1D E ww77

AFF !:!:33DJJ?I[[CDMMIt{{3I!;!;!OO$55I 7788YOO" W==W77##88^EUV r   c                f    U R                   S   nUR                  S:X  d   eU R                  U5      $ )Nr   r6  )rG  r<  ri  )r   xtrees     r|   rr   TritonKernel._has_constant_xmask  s5      #||s"""&&u--r   c                    U R                    H9  nU R                  U5      (       d  M  UR                  UR                   S35        M;     UR                  S5        g )Nr   r   )rG  ri  r  r<  )r   r   r   s      r|   r  TritonKernel.filter_masks  sL    $$D&&t,,!!T[[M"67 %
 	&!r   c                    [        [        R                  5      S U R                    Vs/ s H  n[        U   PM     sn$ s  snf rs   )rC  r   r   r>  r   )r   r   s     r|   get_reduction_prefixes#TritonKernel.get_reduction_prefixes  sG     ]::;<Ud>U>UV
V tV
 	
 
s   ?c                   U R                    Vs/ s H  o"R                  (       d  M  UPM     nnSR                  [        S U 5       5      5      nUR	                  SU R                  U5       35        U R                    Vs/ s H3  nUR                  (       d  M  [        R                  UR                     PM5     nn[        U5      nUR	                  SU R                  U5       35        gs  snf s  snf )zN
Generates code that flattens ND reduction numels, block sizes, etc. into 1D.
r0  c              3  >   #    U  H  oR                    S 3v   M     g7f)r  Nr^  rx  s     r|   r   8TritonKernel.codegen_reduction_numels.<locals>.<genexpr>  s     "U_Tkk]%#8_r`  z	rnumel = zRBLOCK: tl.constexpr = N)
rG  rB  r   r   r   rG  r   r   r   r<   )r   r  r   reduction_treesrO  	rn_blocksrs  s          r|   rY  %TritonKernel.codegen_reduction_numels  s    
 -1,<,<R,<D@Q@Q4,<RF"U_"UUV	$**V"4!567
 ((
(   1M%%dii0( 	 

 y)/

60B/CDE S

s   C,C,C1!C1c                    U R                  5       nU Vs/ s H  n[        R                  " U U 340 UD6PM      sn$ s  snf )z;
Helper to initialize symbols like rn_numel, rn_base, etc.
)r=  r   r   )r   rL  rV  rn_prefixesr<  s        r|   r~  #TritonKernel._get_reduction_symbols  sA     113JUV+xx0;F;+VVVs   %=c                    U R                  5       nU R                  SSSS9n[        [        U5      S-
  5       Vs/ s H  n[	        X#S-   S 5      PM     sn[
        R                  " S5      /-   $ s  snf )z
Compute coefficients to convert ND reduction indices to linear indices.
For example:
  rindex = r0_index * r1_numel * ... * rn_numel + ... + rn_index.
r  Tr   rF   N)r=  r~  rE  r  r<   r   r[  )r   rF  	rn_numelsrl  s       r|   _get_reduction_index_coeffs(TritonKernel._get_reduction_index_coeffs  s~     113//PT/U	;@[AQTUAU;V
;VCM)!GI./;V
]]1 	 
s   A0c                8    U R                  5       n[        X!5      $ )z;
Compute linear reduction indices from N dimensional ones.
)rJ  r;   )r   
multi_indscoeffss      r|   r  'TritonKernel._flatten_reduction_indices  s     113,,r   c                   U R                  SSSS9nU R                  SSSS9nU R                  U5      nUR                  SU R                  U5       35        U R                  U5      nUR                  SU R                  U5       35        g)zH
Generates code that converts ND reduction indices into linear indices.
r   Tr   r   z
roffset = z	rindex = N)r~  r  r   r  )r   r  
rn_offsetsrn_indsrU  rindexs         r|   r  &TritonKernel.codegen_reduction_indices  s    
 00d 1 

 --gtQU-V 11*=
4#4#4W#=">?@009	$"3"3F";!<=>r   c                   UR                   nUR                  (       a%  UR                  UR                   SU SU S35        OUR                  cD  UR                  UR                   SU R                  U5       35        UR                  U S35        OUR                  b  U SU R                  U5       3nOU R                  X S35      nUR                  U SU R                  U5       SUR                  5        S3UR                   SU 3/5        U R                  U5      (       a(  U R                  5       nUR                  U S	U S
35        g UR                  U SUR                   SU S35        g )Nr  z	offset + r{  z
offset = 0r   z	offset = r0  r   zmask = tl.full(z, True, tl.int1)zmask = z < r  )r<  rw  r   r_  rk  r}  r  r)  rv  r.  r   ri  r  )r   r"  r  r6  r  rL  s         r|   r|  ,TritonKernel.iteration_ranges_codegen_header  sc    LL==NNejj\QCy4@A^^#NNejj\T-N-Nu-U,VWXNNaS
+,+Id&G&G&N%OP88#VMOOc4#@#@#G"HAGGI;V[\zzl#dV, ""5))'')ENNaSw6FGHNNaS

|3qc?@r   )rV  rW  rQ  rS  rn  r  rJ  rF  r?  r  rI  rO  rT  rM  rN  rm  rX  )r   TN)r^  zdict[str, sympy.Expr]rJ  zOptional[FixedTritonConfig]r   r   r9  rZ  r   r   r   r   )r   r   )r   )r_  r   rJ  r   r  r   r   ztuple[str, str])r+  r   r  r   r   r   r   r   )r_  r   r   r   rs   )
r_  r   r   r   r  rL   r1  rC   r   r   NN)rs  rL   r;  z.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]r<  rL   r=  rZ  r  r   r>  z Optional[tuple[str, sympy.Expr]]r?  zOptional[CSEVariable]r   rL   )r  r   r9  rZ  r   r   )
r9  rZ  r  rZ  rm  rB   r  +Union[CSEVariable, tuple[CSEVariable, ...]]r   rY  )r9  rZ  )r_  r   r   r   r  rY  )r  tuple[torch.dtype, ...]r   r   )r  rZ  r  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]rs  tuple[CSEVariable, ...]r   r[  )
r  rZ  rs  r[  rP  r   rQ  r   r   r[  )r   r  )r   z type[triton_heuristics.GridExpr])r_  r   r  zOptional[IRNode]r  )r   rM  )r"  rY   )r"  rZ   r   r   )r"  rZ   r  r   r   r   )r"  rZ   r   r   )r<  r   r   r  )r   rZ   r   r   )r   r   r   r   )r  rN   r   r   )rL  r   r   zlist[sympy.Symbol]r  )rM  r7  r   r   )r"  rZ   r  rN   r   r   )Ort   r   r   r   r  r  r   r  rG  r  rP  ra  re  r[  r]  r\  r  r  r  r   r  r  r  r  r  r  r  r4  r/  rG  rM  rT  r  r  r  r  r  r  r  r  r  r  rA  rS  rh  ro  r  r  r  r  r  r  r  r  r  r  rk  r  r  r  r#  r}  r)  r.  re  r  ri  rr  r  r3   r=  rY  r~  rJ  r  r  r|  r   r[  r\  s   @r|   rE  rE  T  s   %I%%).E&.O
 48%3%%3
 2%3 
%3 %3N"

#J*:0

 " " fTfTR	 EG- - !- -<- 	- ^8.MM M 	M
 M4EP SW(( *(3>(FO(	(T  480411 C1 &	1
 $1 1 11 .1 
1f.
"JJ J &	J
 ;J 
5JX&DO&"9
v5(&!
%6(( ( ;	(T1Sfm"'m"
m" (m" 
!m"^>"'>" (>" 	>"
 >" 
!>"@b%H Xt

 % %N`D    $;LC
-8(,2&J:(:14:	:
0
)V.
" 
 
F$W 
 
-? A(A0>A	A Ar   rE  c            
      z  ^  \ rS rSr% \rS\S'   \" \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                   /5      rSU 4S jjr\SS j5       rS rS rSSS jjr S   SS	 jjr        SS
 jr        SS jrS rSrU =r$ )TritonSchedulingi  z	type[Any]kernel_typec                   > [         TU ]  U5        Ub  [        US5      (       d  g UR                   H+  n[	        U[
        [        45      (       d  M   [        Ul        M-     g )Nr  )	rO  rP  r   r  r  r2   r0   debug_triton_codedebug_device_str)r   	schedulerr  rR  s      r|   rP  TritonScheduling.__init__  sN    #GIw$?$?OOD$0B CDD(9% $r   c                    [         R                  R                  (       d  [         R                  R                  (       a'  [	        / U R
                  Q[        R                  P5      $ U R
                  $ rs   )r   r   cooperative_reductionsforce_cooperative_reductionsr   backend_featuresrI   REDUCE_TO_SINGLE_ELEMENT)rx   r}  s     r|   get_backend_features%TritonScheduling.get_backend_features  sR     MM00}}99P#&&P(O(OP  ###r   c                  ^ [         R                  R                  n[        X5      u  p4U(       a  UR	                  U5        [
        R                  (       a  SSKJnJ	m  [        U4S jU 5       5      (       db  U Vs/ s H%  n[        Xe5      (       d  M  UR                  5       PM'     nnUR	                  UR                   SSR                  U5       35        g g g s  snf )Nr   )r/   ForeachKernelSchedulerNodec              3  <   >#    U  H  n[        UT5      v   M     g 7frs   )r  )r   r  rl  s     r|   r   3TritonScheduling.codegen_comment.<locals>.<genexpr>1  s      CPa
1899=s   z Fused node name list: r  )rD   r  r	  r7   make_commentr   debug_fusiontorch._inductor.schedulerr/   rl  r   r  get_namecommentr   )	r   node_scheduler  origins_detailed_originsr/   r  
node_namesrl  s	           @r|   codegen_comment TritonScheduling.codegen_comment%  s    ''&&%8%P"  )
  CP   +*!!7 !AJJL*  
 $$''>tyy?T>UV s   <CCc                   [         R                  R                  nXR                  ;   a  UR                  U   nU$ [        R
                  R                  (       a$  [        U[        R
                  R                  5      OSn[        U5      S S nSR                  SXvUR                  5       /5      nXTR                  U'   [        R
                  R                  (       a  UOSnUR                  [        [        R                  5      U5      nUR                  [        [        R                   5      U5      nUR                  SS5      n[#        [%        UR'                  5       5      S5      u  pn[)        5       n[*        R-                  5       (       a  [*        R                  X5        UR/                  S	U< S
35        UR1                  USS9  [         R                  R3                  5       nUR/                  SUR4                   S35        SU 3n[7        X$5      u  nnUSU-   S-   U-   -  nUR9                  X\R;                  5       U5        [<        R>                  " S5      (       a  [<        R@                  " X[U5        U$ )Nr   r   r  r   triton_z#pragma CMT#pyzasync_compile.triton(z, '''Tr  z''', device_str='z')z# kernel path: rs  kernel_metadata)!rD   r  r	  src_to_kernelr   r   descriptive_namesr6   rE   r   next_kernel_suffixunique_kernel_namesreplacer   r9   r  r  r#   r"   r  rN   async_compileuse_process_poolr   r   r~  r  r7   define_kernelr   r    is_metric_table_enabledlog_kernel_metadata)r   src_codert  r  r  r  
fused_namekernel_category	subs_name	_basenamer  kernel_pathcompile_wrapperr  metadata_commentru  detailed_originss                    r|   r  TritonScheduling.define_kernel?  s*   ''&&,,,!//9Kj c ==22 &mV]]5T5TU 
 AJ2ANO((?8R8R8TUK /:!!(+'-}}'H'HiI
  ''K,H,H(I;WH''K,C,C(DiPH  ''s;H(08>>;K1Ld(S%I+,.O--// $$Y9%%(=i]%&PQ""84"8WW@@BN%%(9.:M:M9Nb&QR!0>(;M(S%G%w 58H HH!!5579I ../@AA++KhOr   c                    U R                  USS9n[        R                  " U5      nU R                  XB[	        S U 5       5      S9$ )zk
Benchmark fused list of nodes and return the execution time
in milliseconds on randomly generated inputs.
T)r  c              3  @   #    U  H  oR                  5       v   M     g 7frs   rr  r   r  s     r|   r   9TritonScheduling.benchmark_fused_nodes.<locals>.<genexpr>  s     :WQVA::<<QVr  )rw  )generate_kernel_code_from_nodesr$   r  benchmark_codegened_moduler   )r   r  n_spills_thresholdr  r  s        r|   benchmark_fused_nodes&TritonScheduling.benchmark_fused_nodesy  sS    
 77PT7Ux(..
:WQV:W0W / 
 	
r   c                  ^^	^
^^^ [        [        R                  R                  5      n[	        5          UR                  [        R                  R                  5       5         SmU4S jm
U
U4S jnU
4S jnUb  UO[        S/5      n[        R                  SUTR                  5        U" 5       mTb   TTR                  4sSSS5        sSSS5        $ TR                  5       m	TR                  mTR                  m T" TR                  " T	6 S   5        TR(                  n[+        U5      S
:X  d   eUS   R,                  U:  a  ['        S	5      mOS[.        R0                  " U	UU4S j5      m[+        TR2                  5      S:  a  T[.        R0                  " U	U4S j5      -
  m[        R                  SUT5        U" 5         TTR                  4sSSS5        sSSS5        $ ! [         as  n[         R"                  R$                  (       a  e [        R                  SUU5        ['        S	5      mU" 5         TTR                  4s SnAsSSS5        sSSS5        $ SnAff = f! , (       d  f       O= f SSS5        g! , (       d  f       g= f)z$Benchmark an already compiled moduleNc                    > T R                   c   e[        R                  R                  T R                   5      S   S-   $ Nr   z.kernel_perf__file__ospathsplitextr  s   r|   cache_file_pathDTritonScheduling.benchmark_codegened_module.<locals>.cache_file_path  s6    ||///ww''5a8>IIr   c                 >   > T" 5       n [        U [        T5      5        g rs   r%   r   )r  r  mss    r|   store_cache@TritonScheduling.benchmark_codegened_module.<locals>.store_cache  s    &(T3r7+r   c                    > T" 5       n [         R                  R                  U 5      (       a.  [        U 5       n[	        UR                  5       5      sS S S 5        $ g ! , (       d  f       g = frs   )r  r  existsopenr  readr  fdr  s     r|   
load_cache?TritonScheduling.benchmark_codegened_module.<locals>.load_cache  sJ    &(77>>$''dr$RWWY/ $ $s   A
A*unknown%kernel src code for %s written to: %sr   z*Exception (%s) in compiling fused nodes %sinfrF   c                 4   > T" TR                   " T 6 S   5      $ rZ  
clone_argsra  callwrapped_jit_functions   r|   rc  =TritonScheduling.benchmark_codegened_module.<locals>.<lambda>      D!5!@!@$!G!JKr   c                 "   > TR                   " T 6 $ rs   r  ra  r  s   r|   rc  r    s     4 ? ? Fr   z+The fused kernel for %s took %.3f ms to run)r   rD   r  r  r   r}  r~  r   r|  debugr  get_argsr  r{  r  	Exceptionr   r   .disallow_failing_autotune_kernels_TESTING_ONLYr  	launchersr  n_spillsr(   benchmark_gpur  )r   r  r  rw  device_interfacer  r  r  r  ra  r  r  r  r  s    `       @@@@@r|   r  +TritonScheduling.benchmark_codegened_module  s/    4AGG4G4GH ##AGG$G$G$IJBJ, )4
*i[:Q  II7
 B~3<<'; KJ ! @ <<>D88D#&;; ()44d;A>? -66Iy>Q&&& |$$'995\ !..K +==>Bk77F B II=
 Ms||#[ KJ ! L  
(==OO		@
 5\3<<''_ KJ ! L
(K KJJ !  sh   .I="AI"	I=(I"=G"B:I"	I="
I,AIII"	I=II""
I0	,I==
Jc                   UR                  S5      nU=(       a     [        S UR                  5        5       5      nU R                  nU(       a  SSKJn  UnU(       a  SUS'   UR                  S5      (       a
  SUS	'   SUS'   [        R                  UR                  5      (       d  UR                  S	5      (       a   eSUS	'   [        R                  R                  XaX#5      nU" U0 UD6nU R                  XU5      $ )
NrA  c              3  @   #    U  H  oR                  5       v   M     g 7frs   )is_split_scan)r   r  s     r|   r   9TritonScheduling.create_kernel_choices.<locals>.<genexpr>  s      (
-NT  -Nr  rF   )TritonSplitScanKernelFoverride_cooperative_reductionrS  Toverride_persistent_reduction)contains_opr   scheduler_nodesr^  triton_split_scanr  rE  r  reduction_numelr  rD   rd  triton_kernel_kwargsadd_multi_kernel_choices)	r   kernel_featureskernel_argskernel_kwargsis_scanr  r^  r  r  s	            r|   create_kernel_choices&TritonScheduling.create_kernel_choices  s    "--f5 
C (
-<-L-L-N(
 %
 +/*:*:@/K>CM:; &&v..=AM9:>CM:;11/2Q2QRR$(()HIIII=BM9:		66+
 k;];,,V-PPr   c           	     $   U/n[         R                  R                  (       d  U$ UR                  =(       a    UR	                  S5      (       + nUR
                  =(       a    UR	                  S5      (       + nU(       a%  UR                  U R                  " U0 UDSS0D65        U(       a  UR                  R                  n[        R                  R                  R                  US5      (       ae  UR                  U R                  " U0 UDSS0D6=n5        U(       a7  UR                  (       a&  UR                  U R                  " U0 UDSSS.D65        [        U5      S:  a-  USS   H  n	UR                  U	l        M     UR!                  S S9  U$ )	Nr  r  Fi   )r  r  rF   c                    U R                   $ rs   )r  )ks    r|   rc  ;TritonScheduling.add_multi_kernel_choices.<locals>.<lambda>1  s
    q'='=r   r  )r   r   multi_kernelr  r  rZ  r  r^  rA  r  rD   r  r  r  r  must_keep_buffersrS  )
r   r  r  r  kernelsoptional_persistentoptional_cooperativerO  r  kernel2s
             r|   r  )TritonScheduling.add_multi_kernel_choices  s    (.h}}))N$99 
-BSBS+C
 ?
  &;;  
MDUDU,E
 A
 NN   # 38  __44Fww44VUCC!--$' 8= E '5+E+ENN(((+ <A:?	 w<!"12;,2,D,D) ' LL=L>r   c                  ^^^^^^^ U4S jmU4S jnUUU4S jnS/ pTSn[         R                  R                  n[        U5      [         R                  l        [         R                  R                  n[        U5      [         R                  l        [
        R                  S:  n	[
        R                  S:  n
U R                  USU	U
SS9nU GH  u  pnU Vs/ s H  oR                  5       PM     nnU VVs/ s H  nU  H  nUR                  5       PM     M     nnnUR                  [        [        R                  5      S5      n[        R                   " U5      m["        R%                  S	UTR&                  5        U" 5       u  mmTb'  UT-  nUT-  nUR)                  TR&                  5        M  TR+                  5       mTR,                  mTR.                  mT" TR0                  " T6 S   5        TR2                  n[5        U5      S
:X  d   eUS   R6                  S:  a  [9        S5      =mmO7[:        R<                  " UUU4S j5      m[:        R<                  " UU4S j5      m["        R%                  S[        S U 5       5      TT5        U" 5         UT-  nUT-  nUR)                  TR&                  5        GM     U[         R                  l        U[         R                  l        XFU4$ s  snf s  snnf )Nc                    > T R                   c   e[        R                  R                  T R                   5      S   S-   $ r  r  r  s   r|   r  @TritonScheduling.benchmark_combo_kernel.<locals>.cache_file_path9  s6    <<+++77##CLL1!4~EEr   c                   > T" 5       n [         R                  R                  U 5      (       aC  [        U 5       n[	        S UR                  5       R                  5        5       5      sS S S 5        $ g! , (       d  f       g= f)Nc              3  8   #    U  H  n[        U5      v   M     g 7frs   )r  )r   r  s     r|   r   NTritonScheduling.benchmark_combo_kernel.<locals>.load_cache.<locals>.<genexpr>A  s      E3Daq3Drz  rX  )r  r  r  r  r  r  splitr  s     r|   r  ;TritonScheduling.benchmark_combo_kernel.<locals>.load_cache=  sW    "$Dww~~d##$Z2  E2779??3D EE  Z  Zs   .A11
A?c                 \   > T" 5       n [        U [        T5      S-   [        T5      -   5        g )Nr  r  )r  r  r  ms_clones    r|   r  <TritonScheduling.benchmark_combo_kernel.<locals>.store_cacheD  s&    "$Ds2w}s8}<=r   r   g        T)subkernel_nodescustom_part_algorithmenable_autotunemixed_sizesonly_gen_src_coder{  r  rF   r  c                 4   > T" TR                   " T 6 S   5      $ rZ  r  r  s   r|   rc  9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>{  r  r   c                 (   > TR                   " T 6 S   $ rZ  r  r  s   r|   rc  r  ~  s    0;;TB1Er   zDThe fused kernel for %s took %.3f ms to run, %.3f ms to clone inputsc              3  @   #    U  H  oR                  5       v   M     g 7frs   r  r  s     r|   r   :TritonScheduling.benchmark_combo_kernel.<locals>.<genexpr>  s     <A::<<r  )rD   r  r  r   inplaced_to_remover   combo_kernels_autotunecombo_kernel_allow_mixed_sizesgenerate_combo_kernel_code	get_nodesrr  r  r   r9   r  r$   r  r|  r  r  r  r  r  r{  r  r  r  r  r  r(   r  )r   	node_listr  r  total_ms	file_listtotal_clone_msremoved_buffers_originplaced_to_remove_origr  r  kernel_code_listr  r  
node_groupr  fused_node_listsr  r  namesr  ra  r  r  r  r  r  r  s                        @@@@@@@r|   benchmark_combo_kernel'TritonScheduling.benchmark_combo_kernel4  s   
	F	 	>  ) # ww66",-A"B"#''"<"<%/0G%H" 77!;;;a?::%"&+#" ; 
 (8#H=GHZT 0ZH/?O/?eAQZZ\\/?EO''K,C,C(DiPH""8,CII7
 &<LB~B(*  .<<>D88D#&;;  %00$7:;,66Iy>Q&&&|$$q( %e,X !..K '44E IIV<<<	 MNHh&NS\\*e (8f #7%<"22i  IOs   K.7$K3r   )rb  zOptional[Scheduler]r   r   )r}  ztorch.device)   )r   tuple[float, str])r  N)rw  zOptional[OrderedSet[str]]r   r  )r  rh   r  	list[Any]r  r  r   list[TritonKernel])r  rE  r  r  r  r  r   r  )rt   r   r   r   rE  r^  r   r   rI   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTRITON_TEMPLATESTUPLE_REDUCTIONrg  rP  r   ri  rx  r  r  r  r  r  r  r   r[  r\  s   @r|   r]  r]    s   )K)!""$$**44++**		
: $ $48t	
 RVT$5NT$	T$l#Q+#Q #Q &	#Q
 
#QJ33 3 &	3
 
3jY3 Y3r   r]  c                4   / nU R                  5       nUb!  [        U[        R                  5      (       d   eU(       a1  UR                  c$  UR                  U R                  5        S35        U$ SSKJn  U R                  5       nUc   eU R                  R                  U5      n[        U[        U45      (       d   S[        U5       35       e[        R                  R!                  U5         ["        R$                  nUR'                  U R)                  5       5      R+                  5       nU["        l        S S S 5        UR                  U R                  5        S35        UR                  [,        R.                  " WS5      5        U$ ! , (       d  f       NX= f)Nz" Unfinalized multi template bufferr   )CUDACombinedSchedulingz]Scheduling backend should be SIMD or CUDACombined when generating debug Triton strings, got: z Triton code:z    )get_template_noder  r   MultiTemplateBuffermake_kernel_renderr  rr  0torch._inductor.codegen.cuda_combined_schedulingr  r{  rb  get_backendr\   r  rD   r  set_current_devicer    generated_kernel_countr  r  r  r  r7  )r  linesmulti_templater  r}  backendold_generated_kernel_counttriton_codes           r|   r`  r`    s_   E++-N!Z@V@V%W%WWW.;;C((JKL2 L/	
 "!!!..,,V4'N4J#KLL 	
klpqxlykz{	
L WW''/ *1)G)G&!AA eg  .HG* 0 	(67X__[&9:L 0/s   .A	F		
Fr   )r  r   r  r   r  r   r   r   rW  )r9  rZ  r   rZ  )r9  rZ  r   r   )rJ  zUnion[CSEVariable, Any]r   r   )r   rf   r  )rz   r   r   zCallable[[_T], _T])r  r/   r   r  )
__future__r   r  r,  r8  r  rq  loggingr  r  r  r  collections.abcr   r   r   typingr   r   r	   r
   r   r   r   sympy.printing.precedencer   ru   torch._loggingtorch.utils._pytreer  _pytreer  torch._dynamo.device_interfacer   torch._dynamo.utilsr   r   torch._prims_commonr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._tritonr   utils._sympy.symbolr   r   r   r   utils._sympy.value_rangesr   r   r   r   r    r  r!   	codecacher"   r#   r$   r%   ops_handlerr&   runtimer'   runtime.benchmarkingr(   runtime.hintsr)   r*   r+   r,   runtime.runtime_utilsr-   r.   rb  r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   virtualizedrA   r  rB   rC   rD   wrapper_benchmarkrE   block_analysisrG   commonrH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   simdrW   rX   rY   rZ   r[   r\   triton_utilsr]   r^   r_   r`   ra   r  rb   typesrc   rd   r^  rf   rg   simd_kernel_featuresrh   ri   	getLoggerrt   r|  _logginggetArtifactLoggerperf_hint_logschedule_log
fusion_logrn   r   r   r   	dataclassr   r   r  r  r  r  r:  r=  rC  rE  rH  rK  rM  r_  r  r  _initialize_pointwise_overridesr  r  r   r  r6  r   r  r?  rE  r]  r`  r   r   r|   <module>rO     s4   "         	  .  F F  0   $ $ C < 0 / K K 2 X X 4 " " ( F F ( ' .  D W W     C B B /    "   %  L8	B!00<H~~//*E^^--hA
6 6  4 $ 4 *, ,: 
 
 
@ c+ c+ c+L	++/+<P++>jQM jQZ 	3
&8
;P *(.bO&k O&d  / / 9q$O q$h$+ $+N : : :&! !H # # #
%uS%S/-A'BBC 
l&A:/0 l&A^MJ3~ J3Zr   