
    8h                   R   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKJrJr  S SKJrJrJrJrJr  S SKrS SKJr  S SKrS SKrS SKJs  Jr  S SKJr  S SK J!r!J"r"  S SK#J$r$  S S	K%J&r&  S S
K'J(r(  S SK)J*r*J+r+J,r,J-r-J.r.  S SK/J0r0  S SK1J2r2  S SK3J4r4  S SK5J6r6J7r7  SSK8J9r9J:r:J;r;  SSK<J=r=  SSK;J>r>J?r?  SSK@JArA  SSKBJCrC  SSKJDrDJErEJFrFJGrGJHrHJIrIJJrJJKrKJLrLJMrMJNrN  SSKOJPrP  SSKQJRrRJSrSJTrTJUrUJVrVJWrW  SSKXJYrY  SSKZJ[r[J\r\J]r]  \(       a  S SK^J_r_J`r`  S SKaraSSKbJcrc  SSKdJere  \R                  " \g5      rh\U" 5       R                  rj\k\R                  \R<                  \m\n4   ro\\;R                  \V4   rq\S/S4   rrSXS  jrsSYS! jrt\u\m\v4   rw\\k\\v\R.                  4   S"4   \\w/\k\vS"4   4   4   rx  SZ           S[S# jjryS\S$ jrz\R                   " S% S&5      5       r| " S' S(5      r} " S) S5      r~\R                   " S* S+\~5      5       r\R                   " S, S-\~5      5       r\R                   " S. S/\~5      5       r\R                   " S0 S1\~5      5       r " S2 S3\~5      r\R                   " S4 S5\~5      5       r\R                   " S6 S7\~5      5       r\R                   " S8 S9\~5      5       r\R                   " S: S;\~5      5       r\R                   " S< S=\~5      5       r\R                   " S> S?\~5      5       r\R                   " S@ SA\5      5       r\R                   " SB SC\5      5       r\R                   " SD SE\5      5       r\R                   " SF SG\5      5       r " SH SI\5      r\R                   " SJ SK\~5      5       r\R                   " SL SM\5      5       r\R                   " SN SO\5      5       r\R                   " SP SQ\~5      5       r\R                   " SR SS\~5      5       r\mr\\\I4   r " ST SU\S5      r " SV SW\5      rg)]    )annotationsN)chaincount)AnyCallableOptionalTYPE_CHECKINGUnion)Expr)dtype)countersdynamo_timed)DebugPrinterManager)MultiKernelState)	cache_dir)CallMethodKeyConvertIntKeyDivideByKeyresolve_unbacked_bindingsSymTypes)_get_qualified_name)
OrderedSet)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)output_code_log)IRNodeReinterpretView)triton_heuristics)DeviceProperties)cache_on_selfDelayReplaceLineget_benchmark_nameIndentedBuffer#is_codegen_graph_partition_subgraphLineContext'set_kernel_post_grad_provenance_tracingsympy_product	sympy_str
sympy_substriton_version_uses_attrs_dict)V   )ArgNameCodeGenDeferredLinePythonPrinterWorkspaceArgWorkspaceZeroMode)cexpr)	config_ofshould_unwrap_unspec_argsignature_to_meta)IteratorSequence)GraphLowering)FxConverterWrapperLinec                :   [         R                  R                  U 5      nU R                  5       [         R                  R                  ;  nU R                  5       U R                  5       [        [         R                  R                  R                  U5      5      U4$ N)
r0   graphget_allocation_storage_sizeget_nameunaligned_buffersget_device_or_error	get_dtyper-   sizevarssimplify)nodestorage_size	alignments      Y/var/www/fran/franai/venv/lib/python3.13/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyrO   [   sr    7766t<Lqww'@'@@I  " 	!''""++L9:     c                   U R                  5       UR                  5       :w  a  gU R                  5       UR                  5       :w  a  g[        R                  R                  R                  [        R                  R                  U 5      5      n[        R                  R                  R                  [        R                  R                  U5      5      n[        U5      [        U5      :X  d`  [        R                  R                  R                  USU-  5      (       a/  [        R                  R                  R                  X25      (       a  gg)NFgffffff?T)
rG   rH   r0   rC   rI   rJ   rD   r-   statically_known_geqstatically_known_leq)	input_buf
output_buf
input_sizeoutput_sizes       rN   can_match_buffer_sizerX   i   s     $$&**H*H*JJ
 4 4 66!!**	++I6J ''""++	++J7K 	*;!77 	
--k4*;LMMGG11+JJrP   .c                6  ^ ^^^ [        5       mSS jm S   SUU4S jjjnSSU UU4S jjjnST  3nU" SU S35        T(       a9  [        R                  R                  (       a  TR                  R                  5       O[        R                  " 5       nTR                  5          U   [        R                  R                  (       ab  U(       a[  [        R                  R                  (       a<  U[        R                  R                  ;   a  [        R                  R                  U   n	OS /[        U5      -  n	[        U5      S:X  a!  U" US   U	S   5      u  pU" S	U
 3S	U 35        O[        U5      S:  d   e[        U5      [        U5      :X  d   e[        5       n[        [        X!U	5      S
 SS9 H  u  pnUR                  (       aF  UR                  R!                  5        V Vs/ s H  u  pSU  SU 3PM     nn nSR#                  U5      nOSnU" X5      u  pSU SU
 3nUU;   a  My  UR%                  U5        U" USU SU 35        M     S S S 5        S S S 5        UTR'                  5       4$ s  snn f ! , (       d  f       N.= f! , (       d  f       N7= f)Nc                p    [        U [        R                  5      (       a  U $ [        R                  " U 5      $ rB   )
isinstancesympyr   Integer)items    rN   _convert_to_sympy_expr@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_expr   s&    !$

33tLt9LLrP   c                  > Tb  [        U 5      (       a  X 4$ [        U4S jU  5       5      nU(       d  UnTR                  U5      [        R                  R
                  (       a%  TR                  [        U4S jU 5       5      5      4$ S4$ )z
This function return a tuple of two values: the first one is for the real grid
which is used in the generated code; the second one is an example grid with
concreate values which is used in the autotune block to run the generated
kernels at compile time.
Nc              3  4   >#    U  H  nT" U5      v   M     g 7frB    ).0gr_   s     rN   	<genexpr>Kuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s     Cd1!44ds   c              3  Z   >#    U  H   nTR                  U[        U5      5      v   M"     g 7frB   generate_example_arg_valuetype)rd   re   wrappers     rN   rf   rg      s,      !-A  ::1d1gFF!-   (+)callabletuplecodegen_python_shape_tupler   tritonautotune_at_compile_time)gridexample_grid
sympy_gridr_   rl   s      rN   determine_grid8user_defined_kernel_grid_fn_code.<locals>.determine_grid   s     ?htnn:CdCC
%L..z: ==99 22 !- 
 	
 
 	
rP   c                   > TR                  U 5        T(       aV  [        R                  R                  (       a6  TTR                  ;  a%  TR
                  R                  U=(       d    U 5        g g g g rB   )	writeliner   rq   rr   kernel_autotune_nameskernel_autotune_calls)linert   nameoutputrl   s     rN   ry   3user_defined_kernel_grid_fn_code.<locals>.writeline   sW    66G999))33L4HDI : 7 rP   grid_wrapper_for_def z(meta):r1   r   zreturn c                2    [        U S   R                  5      $ Nr1   lenkwargsxs    rN   <lambda>2user_defined_kernel_grid_fn_code.<locals>.<lambda>   s    c!A$++.rP   Tkeyreversezmeta['z'] == z and Trueif z	: return )r^   Union[int, sympy.Expr]return
sympy.ExprrB   )rs   
TritonGridrt   zOptional[TritonGrid])r|   strrt   Optional[str])r(   r   rq   rr   r{   indent
contextlibnullcontextr0   rC   autotuning_gridsr   r   sortedzipr   itemsjoinaddgetvalue)r}   configsgridsrl   original_fxnode_namerv   ry   fn_namekernel_autotune_calls_indentexample_gridsrs   rt   seencvalguards	statementr_   r~   s   `  `             @@rN    user_defined_kernel_grid_fn_coder      sT    FM
 .2

*
 
>J J "$(GWIW%& v}}== 	%%,,.##% !
 
6MM22$(($(@(@@GG445IJM!FSZ/Mu:?!/a-:J!KDv&',(@Au:>!>u:W---$.LD *0EM2.*%
 88DEHHNNDTDTyt&fSE2DT   %\\&1F#F%3D%G"!&4&9	$#)s6()L>%JK#*- 7R FOO%%%9 76s8   %J
(E I9(I3<AI9J
3I99
J	J


Jc                   ^^^^^ [        5       mTR                  U R                  SS9  SSKJm  SSKJm  [        U R                  /5      mUUUUU4S jmT" U 5        TR                  5       $ )z[
Given a triton kernel function pointer collect the transitive closure of
its dependencies
Tstripr   )JITFunction)	constexprc           	     .  > [        S [        R                  " U R                  5       5       5      nU R                  R                  R                  S0 5      nU R                  R                  R                   GH  nUT;   a  M  X0R                  R                  ;   d  M'  U R                  R                  U   n[        UT5      (       aV  T	R                  5         T	R                  S5        T	R                  UR                  SS9  TR                  U5        T" U5        M  [        U[        [        [         T
45      (       a  T	R                  5         [        UT
5      (       a  SUR"                  < S3nOU< nUR                  U5      =n(       aQ  [        U[$        5      (       a  SUR&                   S	UR(                   3nOSU< 3nT	R                  U U S
U 35        OT	R                  U S
U 35        TR                  U5        GM  X1;   d  GM  US:w  d  GM  [+        US5      (       d  GM  UR&                  R-                  S5      (       d  GM  T	R                  SUR&                   SUR(                   SU 35        TR                  U5        GM     g )Nc              3  ^   #    U  H#  nUR                   S :X  d  M  UR                  v   M%     g7f)LOAD_GLOBALN)opnameargval)rd   insts     rN   rf   ^user_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse.<locals>.<genexpr>
  s)      '
3{{m+ DKK3s   --__annotations__z@triton.jitTr   ztl.constexpr(): . = tl
__module__rq   zfrom z import z as )r   disBytecodefn__globals__get__code__co_namesr[   newlinery   splicesrcr   intr   boolvaluerk   r   __name__hasattr
startswith)
cur_kernelunqualified_loadsglobal_annotationssymbol_namesymbol
symbol_str
annotationannotation_coder   compile_wrapperr   symbols_includedtraverses           rN   r   Kuser_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse  sG   
 ' '
Z]]3'
 

 (]]66::;LbQ%==11::K..mm777#22;?fk22#++-#--m<#**6::T*B$((5V$c4(CDD#++-!&)44'4V\\4DA%F
(.z
%7%;%;K%HHzH%j$77"$Z%:%:$;1Z=P=P<Q R , 13:..AO'11*mO+<C
|L (11[MZL2QR$((54#t+55 ))44X>>
 $-- 1 12(6??:K4P[}] %((5[ ;rP   )
r(   r   r   rq   r   triton.languager   r   r   r   )kernelr   r   r   r   r   s    @@@@@rN   9user_defined_triton_kernel_transitive_closure_source_coder      sd    
 %&O6::T2 #) "6??"3486 86t V##%%rP   c                  0    \ rS rSr% S\S'   S\S'   S rSrg)	SymbolicCallArgiC  r   innerr   
inner_exprc                ,    [        U R                  5      $ rB   )r   r   selfs    rN   __str__SymbolicCallArg.__str__I  s    4::rP   rc   N)r   r   __qualname____firstlineno__r   r   __static_attributes__rc   rP   rN   r   r   C  s    JrP   r   c                  F   ^  \ rS rSrU 4S jrSS jrSS jrS	S jrSrU =r	$ )
MemoryPlanningStateiM  c                n   > [         TU ]  5         [        R                  " [        5      U l        SU l        g Nr   )super__init__collectionsdefaultdictlist
reuse_pooltotal_allocated_buffer_size)r   	__class__s    rN   r   MemoryPlanningState.__init__N  s-    ##D) 	 12(rP   c                L    [        U R                  R                  US 5      5      $ rB   )r   r   r   )r   r   s     rN   __contains__ MemoryPlanningState.__contains__U  s    DOO''T233rP   c                f    U R                   U   R                  5       nUR                  (       a   eU$ rB   )r   pop	is_reusedr   r   r^   s      rN   r   MemoryPlanningState.popX  s+    s#'')>>!!rP   c                f    UR                   (       a   eU R                  U   R                  U5        g rB   )r   r   appendr   s      rN   pushMemoryPlanningState.push]  s&    >>!!##D)rP   )r   r   )r   ReuseKeyr   r   )r   r  r   FreeIfNotReusedLine)r   r  r^   r  r   None)
r   r   r   r   r   r   r   r  r   __classcell__r   s   @rN   r   r   M  s    24
* *rP   r   c                      \ rS rSrSS jrSrg)r@   ib  c                    [        S5      e)Nz2FX codegen not yet supported for type {type(self)})NotImplementedErrorr   	converters     rN   
codegen_fxWrapperLine.codegen_fxc  s    !"VWWrP   rc   Nr  r?   r   FxConversionFuncr   r   r   r   r  r   rc   rP   rN   r@   r@   b  s    XrP   c                  H    \ rS rSr% S\S'   S\S'   SS jrSS jrSS jrS	rg
)EnterSubgraphLineig  PythonWrapperCodegenrl   r>   rC   c                b    U R                   R                  U R                   R                  5        g rB   )rl   push_computed_sizescomputed_sizesr   s    rN   __post_init__EnterSubgraphLine.__post_init__l  s    (()D)DErP   c                n    U R                   R                  U R                  5        UR                  5         g rB   )rl   push_codegened_graphrC   	do_indentr   codes     rN   codegenEnterSubgraphLine.codegeno  s"    ))$**5rP   c                    UR                   $ rB   )_generate_enter_subgraphr  s     rN   r  EnterSubgraphLine.codegen_fxs  s    111rP   rc   Nr   r  r  r(   r   r  r  	r   r   r   r   r   r  r  r  r   rc   rP   rN   r  r  g  s    !!F2rP   r  c                  >    \ rS rSr% S\S'   SS jr\S	S j5       rSrg)
CommentLineiw  r*   r|   c                :    UR                  U R                  5        g rB   )ry   r|   r  s     rN   r  CommentLine.codegen{  s    tyy!rP   c                    U R                   $ rB   )_generate_comment)r  s    rN   r  CommentLine.codegen_fx~  s    ***rP   rc   Nr%  r  )	r   r   r   r   r   r  staticmethodr  r   rc   rP   rN   r(  r(  w  s!    
" + +rP   r(  c                  >    \ rS rSr% S\S'   S	S jrS
S jrSS jrSrg)ExitSubgraphLinei  r  rl   c                V    U R                   R                  5       U R                   l        g rB   )rl   pop_computed_sizesr  r   s    rN   r  ExitSubgraphLine.__post_init__  s    &*ll&E&E&G#rP   c                X    U R                   R                  5         UR                  5         g rB   )rl   pop_codegened_graphdo_unindentr  s     rN   r  ExitSubgraphLine.codegen  s    ((*rP   c                    UR                   $ rB   )_generate_exit_subgraphr  s     rN   r  ExitSubgraphLine.codegen_fx  s    000rP   rc   Nr$  r%  r  r&  rc   rP   rN   r0  r0    s    !!H1rP   r0  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)EnterDeviceContextManagerLinei  r   
device_idxzOptional[int]last_seen_device_guard_indexc                   [         R                  R                  (       Ga  UR                  S5        [         R                  R                  (       aj  U R
                  c;  UR                  [         R                  R                  R                  5        S35        g U R
                  U R                  :X  d   S5       eg U R
                  cH  UR                  [         R                  R                  R                  5        SU R                   S35        g UR                  SU R                   S35        g UR                  S[         R                  R                  R                  U R                  5       S35        UR                  5         UR                  [         R                  R                  R                  U R                  5      5        g )	N
z) stream_guard(stream, this->device_idx_);z4AOTInductor only supports running on one CUDA devicez device_guard(z);zdevice_guard.set_index(with :)r0   rC   cpp_wrapperry   aot_moder>  
device_opscpp_aoti_stream_guardr=  cpp_aoti_device_guarddevice_guardr  
set_devicer  s     rN   r  %EnterDeviceContextManagerLine.codegen  sM   77NN4 ww 44<NN77--CCEFFop  <<O NO 44<NN77--CCEFnUYUdUdTeegh NN%<T__<MR#PQ NNU177#5#5#B#B4??#S"TTUVWNNNN177--88IJrP   c                    UR                   $ rB   )&_generate_enter_device_context_managerr  s     rN   r  (EnterDeviceContextManagerLine.codegen_fx  s    ???rP   rc   Nr%  r  r   r   r   r   r   r  r  r   rc   rP   rN   r<  r<    s    O"//K:@rP   r<  c                  (    \ rS rSrSS jrSS jrSrg)ExitDeviceContextManagerLinei  c                d    [         R                  R                  (       d  UR                  5         g g rB   )r0   rC   rC  r6  r  s     rN   r  $ExitDeviceContextManagerLine.codegen  s     ww"" #rP   c                    UR                   $ rB   )%_generate_exit_device_context_managerr  s     rN   r  'ExitDeviceContextManagerLine.codegen_fx  s    >>>rP   rc   Nr%  r  r   r   r   r   r  r  r   rc   rP   rN   rP  rP    s    ?rP   rP  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)ExternKernelAllocLinei  r  rl   ir.ExternKernelAllocrK   c                    U R                   n/ UR                  5       QUR                  5       QnU R                  R	                  U R                   U5        g rB   )rK   codegen_argscodegen_kwargsrl   $_generate_extern_kernel_alloc_helper)r   r  rK   argss       rN   r  ExternKernelAllocLine.codegen  sD    yy=""$=t':':'<=99$))TJrP   c                    UR                   $ rB   )_generate_extern_kernel_allocr  s     rN   r   ExternKernelAllocLine.codegen_fx  s    666rP   rc   Nr%  r  rN  rc   rP   rN   rX  rX    s    !!
K
7rP   rX  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)ExternKernelOutLinei  r  rl   ir.ExternKernelOutrK   c                ~   U R                   n/ UR                  5       QUR                  SS9QnUR                  5       n[        R
                  R                  (       a  UR                  S:X  a  SnOUR                  5       nUR                  5       =n(       a  UR                  O[        R
                  R                  n[        R                  R                  (       a
  [        X$SS9  U R                  R!                  UUR#                  5       UR$                  (       a  UR$                  R#                  5       OS UU5        g )NT)skip_outztorch::inductor::_mm_plus_mmaoti_torch__mm_plus_mm_out)	is_extern)rK   r[  r\  get_kernel_namer0   rC   rC  cpp_kernel_name
get_devicerk   device_typer   traceenabledr+   rl   "_generate_extern_kernel_out_helpercodegen_referenceoutput_view)r   r  rK   r^  kernel_nameddevices          rN   r  ExternKernelOutLine.codegen  s    yyJ""$Jt':':D':'IJ**,GG$$(FF 7K..0K!%!22A29L9L<<3DQUV77""$484D4DD..0$	
rP   c                    UR                   $ rB   )_generate_extern_kernel_outr  s     rN   r  ExternKernelOutLine.codegen_fx      444rP   rc   Nr%  r  rN  rc   rP   rN   rd  rd    s    !!

05rP   rd  c                  >    \ rS rSr% S\S'   S\S'   S
S jrSS jrSrg	)FreeLinei  r  rl   %Union[BufferLike, ir.TorchBindObject]rK   c                    U R                   R                  5       [        R                  R                  ;  d   eUR                  U R                  R                  U R                   5      5        g rB   )rK   rE   r0   rC   removed_buffersry   rl   make_buffer_freer  s     rN   r  FreeLine.codegen  sF    yy!!#177+B+BBBBt||44TYY?@rP   c                    UR                   $ rB   )_generate_freer  s     rN   r  FreeLine.codegen_fx      '''rP   rc   Nr%  r  rN  rc   rP   rN   r|  r|    s    !!
//A(rP   r|  c                      \ rS rSr% S\S'   S\S'   S\S'   S\S'   S\S	'   S
\S'   S\S'   S\S'   S\S'   S\S'   S\S'   SS jrSS jrSrg)KernelCallLinei  r  rl   r   rs  ztuple[Any, ...]	call_argsraw_keysraw_args	list[str]	arg_typesr   rq   zdict[str, Any]triton_metaztorch.deviceru  
graph_namer   c                   U R                   R                  U R                  U R                  U R                  U R
                  U R                  U R                  U R                  U R                  U R                  U R                  S9
  g )N)rq   r  r  r  r  ru  r  r   )rl   _generate_kernel_call_helperrs  r  rq   r  r  r  r  ru  r  r   r  s     rN   r  KernelCallLine.codegen  se    11NN;;nn]]]]((;;!%!:!: 	2 	
rP   c                    UR                   $ rB   )_generate_kernel_callr  s     rN   r  KernelCallLine.codegen_fx      ...rP   rc   Nr%  r  rN  rc   rP   rN   r  r    sL    !!LO
/rP   r  c                  r    \ rS rSr% S\S'   S\S'   S\S'   SrS\S	'   S
rS\S'   SrS\S'   SS jrSS jr	Sr
g)KernelDefinitionLinei  r  rl   r   rs  kernel_bodyNr   metadataTr   gpucpp_definitionc                    U R                   R                  U R                  U R                  U R                  U R
                  U R                  S9  g N)r  r  r  )rl   _define_kernel_helperrs  r  r  r  r  r  s     rN   r  KernelDefinitionLine.codegen&  sB    **]].. 	+ 	
rP   c                    UR                   $ rB   )_generate_kernel_definitionr  s     rN   r  KernelDefinitionLine.codegen_fx/  rz  rP   rc   r%  r  )r   r   r   r   r   r  r  r  r  r  r   rc   rP   rN   r  r    s<    !!"Hm"C$(NM(
5rP   r  c                  >    \ rS rSr% S\S'   S	S jrS
S jrSS jrSrg)MemoryPlanningLinei3  r  rl   c                    U $ )zFirst pass to find reuserc   r   states     rN   planMemoryPlanningLine.plan7  s    rP   c                    g)zSecond pass to output codeNrc   r  s     rN   r  MemoryPlanningLine.codegen;  s    rP   c                |   / n[         R                  " U 5       Hw  nUR                  S:X  a  M  [        XR                  5      nUR	                  UR                   SUR
                  [        R                  L a  UR                  5       OU 35        My     [        U 5      R                   SSR                  U5       S3$ )z6
Emits a string representation that fits on one line.
rl   =(, r   )dataclassesfieldsr}   getattrr   rk   r   BufferrE   r   r   )r   r^  fieldr   s       rN   r   MemoryPlanningLine.__str__>  s      ''-EzzY&$

+CKK::,a%**		2IsST	 . t*%%&a		$'8::rP   rc   Nr  r   r   r  r%  r   r   )	r   r   r   r   r   r  r  r   r   rc   rP   rN   r  r  3  s    !!);rP   r  c                  >    \ rS rSr% S\S'   S	S jrS
S jrSS jrSrg)AllocateLineiM  
BufferLikerK   c           	        U R                   R                  5       [        R                  R                  ;   a  [        U R                  5      $ [        U R                   5      n[        R                  (       aH  X!;   aC  UR                  U5      nSUl        [        U R                  UR                   U R                   5      $ U R                   R                  5       R                  S:X  aj  U R                  R                  U R                   5      nUbB  U=R                   [#        [$        R&                  " [(        R*                  US5      5      -  sl        U $ )NTcpur1   )rK   rE   r0   rC   r  NullLinerl   rO   r   allow_buffer_reuser   r   	ReuseLinerG   rk   static_shape_for_buffer_or_noner   r   	functoolsreduceoperatormul)r   r  r   	free_linestatic_shapes        rN   r  AllocateLine.planQ  s    99177#:#::DLL)) tyy)$$		#I"&IT\\9>>499EE99((*//58<<GG		RL'11S$$X\\<C6 1 rP   c                    U R                   R                  5       [        R                  R                  ;  d   eU R
                  R                  U R                   5      nUR                  U5        g rB   )rK   rE   r0   rC   r  rl   make_buffer_allocationry   r   r  r|   s      rN   r  AllocateLine.codegene  sK    yy!!#177+B+BBBB||22499=trP   c                    UR                   $ rB   )_generate_allocater  s     rN   r  AllocateLine.codegen_fxj  s    +++rP   rc   Nr  r%  r  	r   r   r   r   r   r  r  r  r   rc   rP   rN   r  r  M  s    
(
,rP   r  c                  L    \ rS rSr% S\S'   SrS\S'   SS jrSS jrSS	 jrS
r	g)r  in  r  rK   Fr   r   c                   [        U R                  R                  5       5      S:  a  U $ [        U R                  R                  [
        R                  5      (       a  U $ U R                  (       a   eU R                  R                  5       [        R                  R                  ;   a  [        U R                  5      $ [        R                  (       a%  UR!                  [#        U R                  5      U 5        U $ r   )r   rK   get_inputs_that_alias_outputr[   layoutr   MultiOutputLayoutr   rE   r0   rC   r  r  rl   r   r  r  rO   r  s     rN   r  FreeIfNotReusedLine.plans  s    tyy55781<Kdii&&(<(<==K>>!!99177#:#::DLL))$$JJ'		2D9rP   c                    U R                   R                  5       [        R                  R                  ;  d   eU R
                  (       d5  UR                  U R                  R                  U R                   5      5        g g rB   )	rK   rE   r0   rC   r  r   ry   rl   r  r  s     rN   r  FreeIfNotReusedLine.codegen  sR    yy!!#177+B+BBBB~~NN4<<88CD rP   c                    UR                   $ rB   )_generate_free_if_not_reusedr  s     rN   r  FreeIfNotReusedLine.codegen_fx  s    555rP   rc   Nr  r%  r  )
r   r   r   r   r   r   r  r  r  r   rc   rP   rN   r  r  n  s"    
It
E
6rP   r  c                  R    \ rS rSr% S\S'   S\S'   S\S'   SS jrSS jrSS	 jrS
rg)ReinterpretLinei  r  rK   	reused_asz	ir.Layoutr  c                    U $ rB   rc   r  s     rN   r  ReinterpretLine.plan  s    rP   c                T   [        U R                  [        R                  5      (       d   e[        U R                  R                  [        R
                  5      (       d   eU R                  R                  U R                  R                  5       U R                  R                  5        g rB   )
r[   r  r   NonOwningLayoutviewr"   rl   codegen_deferred_allocationr  rE   r  s     rN   r  ReinterpretLine.codegen  sp    $++r'9'9::::$++**B,>,>????00NN##%t{{'7'7	
rP   c                    UR                   $ rB   )_generate_reinterpretr  s     rN   r  ReinterpretLine.codegen_fx  r  rP   rc   Nr  r%  r  r  rc   rP   rN   r  r    s#    

/rP   r  c                  V    \ rS rSr% S\S'   S\S'   SrS\S'   SS jrSS	 jrSS
 jrSr	g)r  i  r  rK   r  Tr   
delete_oldc                |   U R                   R                  5       [        R                  R                  ;   aM  U R
                  R                  5       [        R                  R                  ;   d   e[        U R                  5      $ U R
                  R                  5       [        R                  R                  ;  d   eU $ rB   )rK   rE   r0   rC   r  r  r  rl   r  s     rN   r  ReuseLine.plan  s    99177#:#::>>**,0G0GGGGDLL))~~&&(0G0GGGGrP   c                x   U R                   R                  5       [        R                  R                  ;  d   eU R
                  R                  5       [        R                  R                  ;  d   eUR                  U R                  R                  U R                   U R
                  U R                  5      5        g rB   )
rK   rE   r0   rC   r  r  ry   rl   make_buffer_reuser  r  s     rN   r  ReuseLine.codegen  sz    yy!!#177+B+BBBB~~&&(0G0GGGGLL**499dnndooV	
rP   c                    UR                   $ rB   )_generate_reuser  s     rN   r  ReuseLine.codegen_fx  s    (((rP   rc   Nr  r%  r  )
r   r   r   r   r   r  r  r  r  r   rc   rP   rN   r  r    s'    
J
)rP   r  c                      \ rS rSrSS jrSrg)r  i  c                    UR                   $ rB   )_generate_nullr  s     rN   r  NullLine.codegen_fx  r  rP   rc   Nr  r  rc   rP   rN   r  r    s    (rP   r  c                  f    \ rS rSr% S\S'   S\S'   \SS j5       r\SS j5       r\SS j5       rS	r	g
)CommBufferLinei  r  rl   	ir.BufferrK   c                    SSK Jn  U R                  R                  5       nU R                  R	                  5       nU" U5      (       a  [        SU R                   35      e[        U5      UR                  -  $ )Nr   )is_symbolicz-The size of a comm buffer can't be symbolic: )torch._inductor.utilsr  rK   	get_numelrH   AssertionErrorr   itemsize)r   r  numelr   s       rN   sizeCommBufferLine.size  sd    5		##%		##%u ?		{K  5zENN**rP   c                    U R                   R                  5       n[        U[        R                  5      (       d   eUR
                  $ rB   )rK   get_output_specr[   r   CommBufferLayoutcomm_buffer_typer   r  s     rN   r  CommBufferLine.comm_buffer_type  s9    **,&""5"56666&&&rP   c                    U R                   R                  5       n[        U[        R                  5      (       d   eUR
                  $ rB   )rK   r  r[   r   r  
group_namer  s     rN   r	  CommBufferLine.group_name  s9    **,&""5"56666   rP   rc   Nr   r   )r   zir.CommBufferTyper  )
r   r   r   r   r   propertyr   r  r	  r   rc   rP   rN   r  r    sG    !!
O	+ 	+ ' '
 ! !rP   r  c                  8    \ rS rSrSS jr\S 5       rSS jrSrg)	CommBufferAllocateLinei  c                &   U R                   R                  5       [        R                  R                  ;  d   eU R                   R                  5       nU R                   R                  5       nU R                   R                  5       n[        U R                   R                  5       5      n[        U R                   R                  5       5      nUR                  U R                  U R                  U R                  U R                  UUUUU5      5        g rB   )rK   rE   r0   rC   r  rl  rH   ro   get_size
get_stridery   make_allocation_liner  r	  rl   )r   r  r}   ru  r   shapestrides          rN   r  CommBufferAllocateLine.codegen  s    yy!!#177+B+BBBByy!!#%%'		##%dii((*+tyy++-.%%%%		
rP   c                   U [         R                  R                  :X  aT  U SUR                  U5       SUR                  U5       SU SUR                   SU S[
        R                  " SS5       S3$ [        S	U  35      e)
Nz = empty_strided_p2p(r  z, torch.device("cuda:z"), group_name="z", alloc_id=r   l    r   zUnsupported comm buffer type: )r   CommBufferTypeSYMM_MEMcodegen_shape_tupleindexrandomrandintr
  )r  r	  rl   r}   ru  r   r  r  s           rN   r  +CommBufferAllocateLine.make_allocation_line  s     r00999&-..u56b..v67r' &&,ll^ 4)l +"NN1i89< &01A0BC rP   c                    UR                   $ rB   )_generate_comm_buffer_allocater  s     rN   r  !CommBufferAllocateLine.codegen_fx   s    777rP   rc   Nr%  r  )	r   r   r   r   r  r.  r  r  r   rc   rP   rN   r  r    s     
(  $8rP   r  c                  (    \ rS rSrSS jrSS jrSrg)CommBufferFreeLinei  c                    U R                   R                  U R                  5      nUR                  U SU R                  R
                   S35        g )Nz # z buffer free)rl   r  rK   ry   r  r   r  s      rN   r  CommBufferFreeLine.codegen  s@    ||,,TYY7$s4#8#8#>#>"?|LMrP   c                    UR                   $ rB   )_generate_comm_buffer_freer  s     rN   r  CommBufferFreeLine.codegen_fx
  s    333rP   rc   Nr%  r  rV  rc   rP   rN   r"  r"    s    N4rP   r"  c                  V    \ rS rSr% SrS\S'   S\S'   S\S'   S\S	'   SS
 jrSS jrSrg)MultiOutputLinei  zM
Given a MultiOutputLayout buffer, indexes actual buffer(s) from the result.
r  rl   r   result_namearg_nameSequence[Any]indicesc                   ^ ^ UU 4S jmT" T R                   T R                  5      nUR                  T R                  R                   T R
                   SU T R                  R                   35        g )Nc                  > [        U5      S:  a  US   u  p#[        U[        5      (       a  T" U  SU S3USS  5      $ [        U[        5      (       a;  TR                  R                  U TR                  [        U5      5      nT" XASS  5      $ [        U[        5      (       a  T" U  SU S3USS  5      $ [        SU5      eU $ )Nr   []r1   z['z']znon supported index type: )
r   
issubclassr   ro   rl   codegen_tuple_accessr*  r   dictr  )basenamer-  itypeituple_accesscodegen_list_tuple_accessr   s        rN   r9  :MultiOutputLine.codegen.<locals>.codegen_list_tuple_access  s    7|a"1:eT**4z1#Q5GQRQSUUu--#'<<#D#D $"2"2CF$L 5\12;OOt,,4zA3b5I7STSU;WW()EuMMrP   r   )r+  r-  ry   rl   declarer*  ending)r   r  r   r9  s   `  @rN   r  MultiOutputLine.codegen  s]    	 $ *$--F||##$T%5%5$6c%ATAT@UV	
rP   c                    UR                   $ rB   )_generate_multi_outputr  s     rN   r  MultiOutputLine.codegen_fx1  s    ///rP   rc   Nr%  r  )	r   r   r   r   __doc__r   r  r  r   rc   rP   rN   r)  r)    s*     "!M
00rP   r)  c                  H    \ rS rSr% S\S'   S\S'   S\S'   SS jrSS	 jrS
rg)SymbolicCallArgLinei5  r  rl   r   argr>   rC   c                d    U R                   R                  U R                  U R                  5        g rB   )rl   "_generate_symbolic_call_arg_helperrD  rC   r  s     rN   r  SymbolicCallArgLine.codegen;  s    77$**MrP   c                    UR                   $ rB   )_generate_symbolic_call_argr  s     rN   r  SymbolicCallArgLine.codegen_fx>  rz  rP   rc   Nr%  r  rN  rc   rP   rN   rC  rC  5  s    !!	N5rP   rC  c            	      "  ^  \ rS rSrSrSrU 4S jr\ S       SS jj5       rSS jr	SS jr
SS	 jrSS
 jrSS jr\SS j5       rSS jr\SS j5       rSS jr\SS j5       rSS jr  SS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrS r S r!S r"S  r#S! r$SS" jr%SS# jr&SS$ jr'SS% jr(SS& jr)SS' jr*SS( jr+SS) jr,SS* jr-S+ r.    SS, jr/            SS- jr0SS. jr1SS/ jr2SS0 jr3S1 r4S2 r5S3 r6              SS4 jr7S5 r8SS6 jr9\:Rv                  SS7 j5       r<SS8 jr=S9 r>S: r?S; r@S< rASS= jrB      SS> jrCS? rDSS@ jrESA rFSSB.SSC jjrGSSB.SSD jjrHSSE jrISSF jrJSSG jrKSSH jrLSSI jrM S   SSJ jjrNSSK jrOSSL jrPSM rQSN rRSO rS   S         SSP jjrT\ S     SSQ jj5       rU   S         SSR jjrVSSS jrW  SST jrXSSSU jjrY      SSV jrZSSW jr[SSX jr\SY r]SZ r^S[ r_S\ r`S] raS^ rbS_ rcS` rdSSa jreSb rfSSSSSSSSc. SSd jjrgSSSSSSSeSSf. SSg jjrhSh riSi rjSj rkSSk jrlSSl jrm SSm jrnSn roSSo jrpSSp jrqSSq jrrSSr jrsSSs jrtSSt jruSSu jrvSv rwSSw jrxSx rySSy jrzSz r{        SS{ jr|S| r}    SS} jr~SS~ jrS rS rS rS rS rS rS rS r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       rSrU =r$ )r  iF  z:
Generate outer wrapper in Python that calls the kernels.
Tc                  >^  [         TT ]  5         [        5       T l        0 T l        [        5       T l        [        5       T l        [        5       T l        [        5       T l	        [        5       T l
        [        5       T l        [        5       T l        [        5       T l        [        5       T l        [        5       T l        0 T l        ST l        0 T l        [        5       T l        / T l        ST l        ST l        ST l        ST l        ST l        [6        R8                  R:                  (       a  SOST l        [6        R8                  R:                  (       a  SOST l        S T l         ST l!        0 T l"        [        5       T l#        [        5       T l$        S T l%        T RM                  5         / T l'        / T l(        T RS                  5         [U        T 5      (       d  T RW                  5         T RY                  5         [6        R8                  RZ                  (       dB  [6        R8                  R\                  R_                  5        H  u  pT Ra                  X5        M     [        [b           " 5       T l2        [        [b           " 5       T l3        0 T l4        [j        Rl                  " S 5      " T Rn                  5      T l7        [j        Rp                  S
U 4S jj5       nUT l9        0 T l:        [        5       T l;        [y        5       T l=        [        5       T l>        0 T l?        [        [        R                  R                  [        R                  R                  S	9T lE        / T lF        g )Nr    #r  z
std::move(r   Tc                   > TR                   R                  U 5        [        R                  R                  (       a  TR
                  R                  U 5        g g rB   )importsry   r   rq   rr   r{   )r|   r   s    rN   add_import_once6PythonWrapperCodegen.__init__.<locals>.add_import_once  s;    LL""4(}}55**44T: 6rP   )debug_printer_leveluse_array_ref)r|   r   r   r  )Gr   r   r   _names_iterargs_to_buffersr(   rP  headerprefixsuffixkernel_declarationswrapper_callkernel_autotune_defsr{   subgraph_definitionsr   rz   kernel_autotune_example_argskernel_autotune_tmp_arg_idxsrc_to_kernelkernel_numel_exprlinesr;  declare_maybe_referencer<  commentnone_strr0   rC   rC  
move_beginmove_endr>  supports_intermediate_hooksuser_defined_kernel_cacheunbacked_symbol_declsr  launcher_fn_nameset_launcher_fn_namecodegened_graph_stackcomputed_sizes_stackwrite_headerr)   write_prefix!write_kernel_autotune_defs_headerrD  constant_reprsr   write_constant
BufferName	allocatedfreedreusesr  	lru_cachewrite_get_raw_streamcacherQ  _metas
_meta_varsr   multi_kernel_statealready_codegened_subgraphsallocated_workspacesr   r   aot_inductor debug_intermediate_value_printerallow_stack_allocationdebug_printeradditional_files)r   r}   hashedrQ  r   s   `   rN   r   PythonWrapperCodegen.__init__M  s   */'  	 &'$&$&$&#1#3 *,$2$4!%3%5"$2$4!6@l" IK)01( .0HR!#
')$*+''*=*=,2 ww22;?)+/(QS&L 	" 9C $!!# &("$&!2488..0ww ! 6 6 < < >##D1 !? $J/1
+-
 57$-$7$7$=%%%
! 
	; 
	;
  /&(+5<"2"4<FL(46! 1 & 3 3 T T --DD
 !#rP   Nc                P    U (       a  Uc   eUc   e[        XU5      $ [        5       $ rB   )SubgraphPythonWrapperCodegenr  )is_subgraphsubgraph_nameparent_wrapperpartition_signaturess       rN   createPythonWrapperCodegen.create  s=      ,,,!---//C  $%%rP   c                    SU l         g )Ncall)rk  r   s    rN   rl  )PythonWrapperCodegen.set_launcher_fn_name  s
     &rP   c                D    U R                   R                  U SU 35        g )Nz = None  # )rW  ry   )r   r}   r  s      rN   rs  #PythonWrapperCodegen.write_constant  s    k&:;rP   c           	     T   [         R                  R                  R                  5       nSnUb  UR                  b  SUR                   3nSn[        [        R                  R                  5      S:  a  SnU R                  R                  SU S[        R                   SU S3S	S
9  U R                  R                  SS	S
9   SSKJn  U R                  R                  SS	S
9  [        R$                  (       a  U R                  R'                  S5        g g ! [         ["        4 a     NDf = f)NrM  z
# AOT ID: r   zRfrom torch._inductor.codegen.debug_utils import _print_debugging_tensor_value_infoz
                aH  
                from ctypes import c_void_p, c_long, c_int
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from cmath import nanj
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align
                from torch import device, empty_strided
                from zq import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                z
            Tr   a  
                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                assert_alignment = torch._C._dynamo.guards.assert_alignment
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()
            )_SymmetricMemoryzs
                empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
                zfrom torch.cuda import nvtx)torch_guardsTracingContexttry_getaot_graph_namer   r   r  r  rP  r   r   r   rW  torch._C._distributed_c10dr  AttributeErrorImportErrorannotate_trainingry   )r   contextaot_config_commentaot_inductor_debug_utilsr  s        rN   ro  !PythonWrapperCodegen.write_header  sD   --..6687#9#9#E#-g.D.D-E!F#% v""CCDqH'{$#$ % $,,- .)* +!$ ' 	 	
* 	  	 	
 	 DKK 	   ##KK!!"?@ $ , 		s    D D'&D'c                    g rB   rc   )r   rW  s     rN   include_extra_header)PythonWrapperCodegen.include_extra_header      rP   c                ^    U R                   R                  S[        R                   S35        g )Na	  
                import torch
                from torch._dynamo.testing import rand_strided
                from torch._dynamo.utils import preserve_rng_state
                from torch._inductor.select_algorithm import AlgorithmSelectorCache
                from aH   import AsyncCompile

                async_compile = AsyncCompile()
                generate_example_value = AlgorithmSelectorCache.generate_example_value
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
            )r\  r   r   r   r   s    rN   rq  6PythonWrapperCodegen.write_kernel_autotune_defs_header  s3    !!((
 $,,- .	
rP   c                   S[         R                   S3n[        R                  R                  (       a]  U R
                  R                  U5        U R
                  R                  [        R                  R                  R                  S5      5        [        R                  R                  (       d]  U R                  R                  USS9  U R                  R                  [        R                  R                  R                  S5      5        g g )NzU
            import triton
            import triton.language as tl
            from z+ import start_graph, end_graph
            get_raw_streamTr   )r#   r   r   rq   rr   r{   r   ry   r0   rC   rE  import_get_raw_stream_asrC  rP  )r   
import_strs     rN   write_triton_header_once-PythonWrapperCodegen.write_triton_header_once  s     $,,- .

 ==11&&--j9&&00"";;<LM ww""LL
$7LL"""";;<LM #rP   c                   [         R                  R                  (       aB  U R                  R	                  [
        R                  R                  R                  S5      5        [
        R                  R                  (       dC  U R                  R	                  [
        R                  R                  R                  S5      5        g g )Nr  )r   rq   rr   r{   ry   r0   rC   rE  r  rC  rP  r   s    rN   write_get_raw_stream_header0PythonWrapperCodegen.write_get_raw_stream_header#  s{    ==11&&00"";;<LM ww""LL"""";;<LM #rP   c                $    U R                  5         g rB   )r  r   s    rN    write_get_raw_stream_header_once5PythonWrapperCodegen.write_get_raw_stream_header_once-  s    ((*rP   c                   [        U5      nXR                  ;  a  S[        U R                  5       3nX R                  U'   U R                  R	                  U SU 35        [
        R                  R                  (       a;  U R                  R	                  U SU 35        U R                  R                  U5        U R                  U   $ )Nmetar   )reprr{  r   rW  ry   r   rq   rr   r{   r|  r   )r   r  vars      rN   add_meta_once"PythonWrapperCodegen.add_meta_once1  s    Dz{{"T[[)*+C #KKKK!!SETF"34}}55**44uCv5FG##C({{4  rP   c                ~    U R                  5        Vs/ s H  oR                  U R                  5      PM     sn$ s  snf rB   )get_graph_outputsrq  r[  r   r   s     rN   get_output_refs$PythonWrapperCodegen.get_output_refs<  s?     =A<R<R<T
<Tq 1 12<T
 	
 
s   $:c                    g rB   rc   r   s    rN   mark_output_type%PythonWrapperCodegen.mark_output_typeB      rP   c                6    [         R                  R                  $ rB   )r0   rC   graph_inputsr   s    rN   get_graph_inputs%PythonWrapperCodegen.get_graph_inputsE  s     ww###rP   c                6    [         R                  R                  $ rB   )r0   rC   graph_outputsr   s    rN   r  &PythonWrapperCodegen.get_graph_outputsJ  s    ww$$$rP   c           
     6   U R                  5       R                  5        H  u  p[        U[        R                  [
        R                  45      (       a  M6  U[        R                  R                  ;  d  [        U[
        R                  5      (       a  Mu  [        UR                  5       5      S:X  a  M  U R                  UR                  5       5      nU R                  UR                  5       5      nU R                  R!                  SU SU SU S35        M     g )Nr   zassert_size_stride(r  r   )r  r   r[   r\   r   r   TorchBindObjectr0   rC   graph_input_namesGeneratorStater,   r  rp   r  rX  ry   )r   r}   bufr   r  s        rN   codegen_input_size_asserts/PythonWrapperCodegen.codegen_input_size_assertsM  s    ..0668ID#

B,>,>?@@ 177444
R&&9 9  S\\^,1223<<>BD44S^^5EFFKK!!$7vRvRxq"QR 9rP   c                n   U R                   R                  S5        U R                  5       R                  5        Hx  u  p[	        U[
        R                  [        R                  45      (       a  M6  SU S3nU R                   R                  U5        SU S3nU R                   R                  U5        Mz     g )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	rX  ry   r  r   r[   r\   r   r   r  )r   r}   r  r|   s       rN   codegen_input_nan_asserts.PythonWrapperCodegen.codegen_input_nan_asserts_  s    HI..0668ID#

B,>,>?@@ &;<DKK!!$' &;<DKK!!$' 9rP   c                :    U R                   R                  S5        g )NzV

            async_compile.wait(globals())
            del async_compile
            )rX  r   r   s    rN   write_async_compile_wait-PythonWrapperCodegen.write_async_compile_waitj  s    	
rP   c                    SR                  U5      n[        U5      S:X  a  US-  nU R                  R                  U S35        U R                  R                  S5        g )Nr  r1   ,z = argszargs.clear())r   r   rX  ry   )r   input_nameslhss      rN   
write_argsPythonWrapperCodegen.write_argss  sP    ii${q 3JCWo.n-rP   c                    [         R                  (       a  U R                  R                  S5        SnU$ U R                  R                  SU R                   S35        SnU$ )Na  
                class Runner:
                    def __init__(self, partitions):
                        self.partitions = partitions

                    def recursively_apply_fns(self, fns):
                        new_callables = []
                        for fn, c in zip(fns, self.partitions):
                            new_callables.append(fn(c))
                        self.partitions = new_callables

                    def call(self, args):
                r   z
                def z(args):
                r1   )r   graph_partitionrX  r   rk  r   prefix_indents     rN   !write_launcher_fn_call_get_indent6PythonWrapperCodegen.write_launcher_fn_call_get_indentz  sm    !!KK M  KK**+ ,
 MrP   c                6    [         R                  R                  $ rB   )r0   rC   r  r   s    rN   get_graph_input_names*PythonWrapperCodegen.get_graph_input_names  s    ww(((rP   c                   U R                   c   eU R                  5         U R                  5       nU R                  R	                  U5         [
        R                  R                  (       aA  U R                  R                  [        R                  R                  R                  5       5        [        R                  R                  5       n[
        R                  (       a  U R                  R                  SU S35        U R                  5       =n(       a  U R!                  U5        U R#                  5         U R%                  5         S S S 5        g ! , (       d  f       g = f)Nz0training_annotation = nvtx._device_range_start(''))rk  r  r  rX  r   r   rq   debug_sync_graphry   r0   rC   rE  synchronizeget_training_phaser  r  r  codegen_inputs"codegen_input_size_and_nan_asserts)r   r  phaser  s       rN   rp  !PythonWrapperCodegen.write_prefix  s    $$000%%'>>@[[.}}--%%agg&8&8&D&D&FGGG..0E''%%FugRP %)$>$>$@@ @ 12!335 /..s   C;E
Ec                    [         R                  (       a  U R                  5         [         R                  (       a  U R	                  5         g g rB   )r   size_assertsr  nan_assertsr  r   s    rN   r  7PythonWrapperCodegen.codegen_input_size_and_nan_asserts  s1    ++-**, rP   c                    U R                  5         SU 3n[        R                  R                  (       aB  U R                  R                  U SU S35        [        R                  R                  (       a  U$ U R                  U SU S35        U$ )Nstream = get_raw_stream(r   )	r  r   rq   rr   r{   ry   r0   rC   rC  )r   r=  r  r}   s       rN   ry  )PythonWrapperCodegen.write_get_raw_stream  s    --/
|$==11&&00&*:,a8 ww""$1*Q?@rP   c                     U R                   S   $ )N)rm  r   s    rN   get_codegened_graph(PythonWrapperCodegen.get_codegened_graph  s    ))"--rP   c                :    U R                   R                  U5        g rB   )rm  r   )r   rC   s     rN   r  )PythonWrapperCodegen.push_codegened_graph  s    ""))%0rP   c                6    U R                   R                  5       $ rB   )rm  r   r   s    rN   r5  (PythonWrapperCodegen.pop_codegened_graph  s    ))--//rP   c                P    SSK Jn  U R                  R                  U" U5      5      $ )Nr   )deepcopy)copyr  rn  r   )r   r  r  s      rN   r  (PythonWrapperCodegen.push_computed_sizes  s!    !((//0HIIrP   c                6    U R                   R                  5       $ rB   )rn  r   r   s    rN   r2  'PythonWrapperCodegen.pop_computed_sizes  s    ((,,..rP   c                .    [        U R                  5       $ rB   )nextrU  r   s    rN   next_kernel_suffix'PythonWrapperCodegen.next_kernel_suffix  s    t''()*rP   c                ~   U R                  [        XR                  5      5        [        R                  R
                  (       a  U R                  5         U R                  R                  S[        R                  R                  R                  U5       S35        U R                  R                  5         U R                  R                  [        R                  R                  R                  U5      5        [        U 5      (       a  U R                  5         U R                  R                  SU SU S35        Xl        g )NrA  rB  r  r  r   )ry   r<  r>  r   rq   rr   r  r{   r0   rC   rE  rH  r  rI  r)   r  )r   r=  s     rN   codegen_device_guard_enter/PythonWrapperCodegen.codegen_device_guard_enter  s    )*6W6WX	
 ==11))+&&00**77
CDAF &&002&&00""--j9 3488002&&00$6zl!D -7)rP   c                    U R                  [        5       5        [        R                  R                  (       a  U R
                  R                  5         g g rB   )ry   rP  r   rq   rr   r{   r6  r   s    rN   codegen_device_guard_exit.PythonWrapperCodegen.codegen_device_guard_exit  s6    356==11&&224 2rP   c                   U(       Ga1  [         R                  (       a  U R                  R                  SSR	                  U5      -   S-   5        U R                  R                  S5        U R                  R                  5         U R                  R                  S5        U R                  R                  5         U R                  R                  S5        U R                  R                  S5        U R                  R                  S5        U R                  R                  S	SR	                  U5      -   S-   5        g U R                  R                  S
5        g )Nzreturn_vars = (r  , )zfor var in return_vars:z!if isinstance(var, torch.Tensor):z#assert not var.isnan().any().item()z#assert not var.isinf().any().item()r   zreturn (z	return ())r   r  r[  ry   r   r  r6  )r   output_refss     rN   generate_return$PythonWrapperCodegen.generate_return  s   !!!!++%		+(>>F !!++,EF!!++-!!++,OP!!++-!!++,QR!!++,QR!!--a0''
TYY{5K(Ke(ST''4rP   c                    g rB   rc   r   results     rN   generate_before_suffix+PythonWrapperCodegen.generate_before_suffix  r  rP   c                    [         R                  (       aO  SR                  U R                  5      [	        U R                  5      S:X  a  SOS-   nUR                  SU S35        g g )Nr  r1   r  rM  z-
                runner = Runner(partitions=[z{])
                call = runner.call
                recursively_apply_fns = runner.recursively_apply_fns
                )r   r  r   all_partition_namesr   r   )r   r  all_partition_name_lists      rN   generate_after_suffix*PythonWrapperCodegen.generate_after_suffix  se    !!&*ii0H0H&I43349r'# MM--D,E F "rP   c                    g rB   rc   r  s     rN   generate_end!PythonWrapperCodegen.generate_end  r  rP   c                8    U R                  [        X5      5        g rB   )ry   rX  r   rK   s     rN   generate_fallback_kernel-PythonWrapperCodegen.generate_fallback_kernel  s    ,T89rP   c                    UR                  U 5        U R                  [        X5      5        [        UR                  [
        R                  5      (       a  UR                  U 5        g g rB   )codegen_commentry   rX  r[   r  r   Layoutcodegen_size_assertsr#  s     rN   generate_extern_kernel_alloc1PythonWrapperCodegen.generate_extern_kernel_alloc  sJ    T",T89dkk299--%%d+ .rP   c           
        [        UR                  [        R                  5      nUR	                  5       nUR                  5       nUR                  5       nU R                  n[        R                  (       a  SU;   a  SU 3nU(       a5  U R                  U R                   U SSR                  U5       SU 35        g U R                  U R                   U SU SSR                  U5       SU 35        U R                  (       aR  [        R                  (       a<  Ub8  [        S   S==   S	-  ss'   U R                  S
UR                   < SU S35        g g g g )Nview_as_complexz.clone()r  r  r   r   inductorintermediate_hooksr1   zrun_intermediate_hooks()r[   r  r   
NoneLayoutrE   get_origin_noderj  r<  r   memory_planningry   r;  r   rh  generate_intermediate_hooksr   r}   )r   extern_kernelr^  	no_returnoutput_nameorigin_noders  r<  s           rN   r]  9PythonWrapperCodegen._generate_extern_kernel_alloc_helper!  s;    }33R]]C	#,,.#335#335!!&7;&F  x(FNNdll^K=$))D/9J!F8TUNN<<.SQtyy>OqQWPXY 0066+$%9:a?:-k.>.>-AK=PQR , 7 1rP   c                Z    UR                  U 5        U R                  [        X5      5        g rB   )r'  ry   rd  r#  s     rN   generate_extern_kernel_out/PythonWrapperCodegen.generate_extern_kernel_out>  s$     	T"*467rP   c                2   [         R                  R                  R                  nUR	                  XAS S S5        UR                  SU(       a  UOU 35        U   U R                  U SSR                  U5       S35        S S S 5        g ! , (       d  f       g = f)Nexternzout=r  r  r   )r0   rC   wrapper_coder  set_printer_argsr   ry   r   )r   r   outout_viewr^  ru  debug_printer_managers          rN   rp  7PythonWrapperCodegen._generate_extern_kernel_out_helperE  sw     !" 4 4 B B..tT4Rdx8S9:;"NNfXQtyy&7q9: #""s   'B
Bc                  ^  UR                   nUR                  nU(       a$  [        S U 5       5      n[        S U 5       5      nUR                  R	                  5        S3nSR                  U 4S jU 5       5      nSR                  U 4S jU 5       5      n[        R                  T UR                  5      nSnU SUR                   S	3nU SU SU SU 3n	U S
U	 S3n
U
$ )Nc              3  t   #    U  H.  n[         R                  R                  R                  U5      v   M0     g 7frB   r0   rC   rI   atomically_apply_size_hintrd   rt  s     rN   rf   RPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>X  s*     VQUA))DDQGGQU   68c              3  t   #    U  H.  n[         R                  R                  R                  U5      v   M0     g 7frB   rF  rH  s     rN   rf   rI  Y  s+      HR1  ;;A>>
rJ  z.data_ptr()r  c              3  P   >#    U  H  n[         R                  TU5      v   M     g 7frB   r  val_to_arg_strrd   dimr   s     rN   rf   rI  _  s$     XSWC-<<T3GGSW   #&c              3  P   >#    U  H  n[         R                  TU5      v   M     g 7frB   rM  rO  s     rN   rf   rI  `  s%      
FPs //c::jrQ  z$triton.tools.experimental_descriptorz.create_d_tma_descriptorr  r   )
dims
block_dimsro   tensorrq  r   r  rN  element_sizerank)r   descapply_size_hintsrT  rU  ptrrW  rX  r   r^  r  s   `          rN   *_generate_tma_descriptor_call_experimental?PythonWrapperCodegen._generate_tma_descriptor_call_experimentalT  s    yy__
VQUVVD HR J ..01=yyXSWXXYY 
FP
 

 ,::4ARARS7xx		{*:;bbB|n=QtfArP   c                    UR                   nU(       a  [        S U 5       5      nSnU S3nUR                  R                  5        SU 3nU SU S3nU$ )Nc              3  t   #    U  H.  n[         R                  R                  R                  U5      v   M0     g 7frB   rF  rH  s     rN   rf   LPythonWrapperCodegen._generate_tma_descriptor_call_stable.<locals>.<genexpr>m  s+       HS1  ;;A>>rJ  z/triton.tools.tensor_descriptor.TensorDescriptorz.from_tensorr  r  r   )block_shapero   rV  rq  )r   rY  rZ  ra  rX  r   r^  r  s           rN   $_generate_tma_descriptor_call_stable9PythonWrapperCodegen._generate_tma_descriptor_call_stablej  so    &&  HS  K Cx|$++//12"[MBQtfArP   c                    [        U[        R                  5      (       a  U R                  X5      $ [        U[        R                  5      (       d   eU R                  X5      $ rB   )r[   r   TMADescriptorExperimentalr\  TMADescriptorStablerb  )r   rY  rZ  s      rN   _generate_tma_descriptor_call2PythonWrapperCodegen._generate_tma_descriptor_callw  sW    dB8899BB  dB$:$:;;;;<<TTTrP   c                    U R                  U5      nUR                   SU U R                   3nU R                  U5        g Nr   )rg  r}   r<  ry   )r   rY  r  r|   s       rN   generate_tma_descriptor,PythonWrapperCodegen.generate_tma_descriptor  s:    11$7))Cvdkk]3trP   c                   U SSR                  [        [        U5      5       3nUR                  S5      (       a  USR                  S/U-   5      -  nOU(       a  US[	        U5       3-  nUS-  nU R                  U5        g )Nr  r  zaten.scatter_reducer  rM  z	, reduce=r   )r   mapr   r   r  ry   )	r   r~   inputsrk  python_kernel_namesrc_is_tensorr  r   r|   s	            rN   generate_scatter_fallback.PythonWrapperCodegen.generate_scatter_fallback  s~     %%QsxxC0@'A&BC(()>??DIIrdVm,,D)DL>22trP   c                v    SSR                  U5       S3nX&XE/nU R                  U R                  X5      5        g )Nr0  r  r1  )r   ry   wrap_kernel_call)r   r   r   r-  values
accumulateindices_strr^  s           rN   generate_index_put_fallback0PythonWrapperCodegen.generate_index_put_fallback  s;    $))G,-Q/3t,,V:;rP   c           
     `    U R                  U SU SSR                  U" 5       5       S35        g )Nr   r  r  r   )ry   r   )r   buf_namerp  get_argsop_overloadr  outputss          rN   ,generate_fallback_kernel_with_runtime_lookupAPythonWrapperCodegen.generate_fallback_kernel_with_runtime_lookup  s2     	(3'9&:!DIIhj<Q;RRSTUrP   c                p    [        S5         U R                  U5      sS S S 5        $ ! , (       d  f       g = f)NzPythonWrapperCodegen.generate)r   	_generater   is_inferences     rN   generatePythonWrapperCodegen.generate  s#    9:>>,/ ;::s   '
5c                0    [         R                  (       a  gg)Nr   r1   )r   r  r   s    rN   get_wrapper_call_indent,PythonWrapperCodegen.get_wrapper_call_indent  s    !!rP   c              #  \   #    U R                   n Xl         Uv   X l         g ! X l         f = f7frB   ry   )r   newolds      rN   set_writeline"PythonWrapperCodegen.set_writeline  s'     nn	! NI NSNs   ,
! ,),c                    U R                   R                  n[        R                  R                  (       a  U R
                  R                  U5        g U R                  R                  U5        g rB   )r}  kernel_defsr   rq   rr   r\  r   rW  )r   r  s     rN   _write_multi_kernel_defs-PythonWrapperCodegen._write_multi_kernel_defs  sF    --99==11%%,,[9KK{+rP   c                \
   [         R                  (       a  U R                  5         [        R                  " 5        nUR                  U R                  R                  5       5        [         R                  (       a  U R                  U5        [         R                  (       a  U R                  5         U R                  U5        [         R                  R                  (       a/  [         R                  R                  (       d  U R                  5         U R!                  U R                  R"                  5         U R$                   HP  n['        U[(        5      (       a  UR+                  U R                  5        M5  U R                  R#                  U5        MR     S S S 5        U R-                  5         U R/                  5       nU R1                  5         [         R                  R2                  (       aA  U R                  R#                  [4        R6                  R8                  R;                  5       5        [         R                  (       a  U R=                  5         [         R                  R                  (       a/  [         R                  R                  (       d  U R?                  5         [         R                  R                  (       a  U RA                  5         [         RB                  (       a0  [         RD                  (       d  U R                  R#                  S5        U RG                  U5        S S S 5        [I        5       nURK                  U RL                  5        UR#                  S5        URK                  U RN                  5        [4        R6                  RP                  (       aH  [4        R6                  RD                  (       a)  [4        R6                  RR                  (       a
  [I        5       nURK                  U RT                  5        U RW                  5         URK                  U RX                  5        U R[                  5       nUR                  U5         URK                  U R                  5        S S S 5        U R]                  U5        URK                  U R^                  5        U Ra                  U5        U Rc                  U5        U Re                  U5        URg                  5       U Rh                  Rg                  5       4$ ! , (       d  f       GNk= f! , (       d  f       GN= f! , (       d  f       N= f)Nz+nvtx._device_range_end(training_annotation)rM  )5r   profile_bandwidthr  r   	ExitStackenter_contextr[  r   profiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphrun_wrapper_ir_passesrq   store_cubinrr   !generate_reset_kernel_saved_flagsr  ry   rb  r[   r@   r  r  r  r  r  r0   rC   rE  r  generate_end_graph generate_save_uncompiled_kernelsgenerate_and_run_autotune_blockr  rC  r  r(   r   rP  rW  rD  is_const_graphr]  finalize_prefixrX  r  r  rY  r  r   add_benchmark_harnessgetvaluewithlinemaprZ  )r   r  stackr|   r  r  wrapper_call_indents          rN   r  PythonWrapperCodegen._generate  sP   ##))+!!#u 1 1 8 8 :;0088?''))+&&|4}}((1W1W668 ##D$5$5$?$?@ JJD!$44T%6%67))33D9	 ' A ))+..0K!!#}}--!!++AGG,>,>,J,J,LM'''')}}((1W1W557}}55446 ''0B0B!!++A   -U $Z  !dll#dkk" 77 3 38N8N#%F 	d//0dkk""::<]]./MM$++, 0 	##F+dkk"""6*&!""6* &&($$88:
 	
u A@ $#z 0/s2   C9T4A!S9FTT9
T	T
T
T+c                X   U R                   R                  S5        0 n[        R                  R                  (       af  [
        R                  R                  (       aG  [        [
        R                  R                  5       VVs0 s H  u  p#U R                  U5      U_M     nnnU R                   R                  5       S-   U R                  R                  5       -   n[        R                  [        R                  :X  aj  [         R"                  " [%        5       SSS9 nUR'                  UR)                  S5      5        UR*                  nSSS5        [        R,                  " SW5         [/        XA5        gs  snnf ! , (       d  f       N8= f! [0         a  n[3        S	U 35      UeSnAff = f)
z
Compose self.kernel_autotune_defs and self.kernel_autotune_calls into a single block of
code and execute it to trigger Triton kernel compilation and auto-tuning
zQ
            async_compile.wait(globals())
            del async_compile
        r@  z.pyF)dirrY  deletezutf-8NzAuto-tuning code written to %sz%Failed to run autotuning code block: )r\  r   r   rq   rr   r0   rC   autotuning_inputs	enumerateget_autotuning_input_namer   r{   r    levelloggingDEBUGtempfileNamedTemporaryFiler   writeencoder}   debugexec	ExceptionRuntimeError)r   scopeidxvtuning_codef	file_pathes           rN   r  4PythonWrapperCodegen.generate_and_run_autotune_block  sj   
 	!!((	
 ==11agg6O6O ((A(ABBFC ..s3Q6B  
 %%..0((1134 	
   GMM1 ,,Ke**734FF		
 !!0
	S$/   	S!FqcJKQRR	Ss*   E4-E:(F :
F
F)F$$F)c                \    SSK Jn  U" U 5      R                  U R                  5      U l        g )Nr1   )MemoryPlanner)r2  r  r  rb  )r   r  s     rN   memory_plan PythonWrapperCodegen.memory_plan;  s     2"4(--djj9
rP   c                "   [         R                  R                  5       nU R                  (       a  [	        U R                  S   [
        5      (       a  U R                  S   R                  R                  U;  av  U R                  R                  5         U R                  (       aK  [	        U R                  S   [
        5      (       a)  U R                  S   R                  R                  U;  a  Mv  [        5       /n/ n[        [        U R                  5      5       H  nU R                  U   n[	        U[
        5      (       a#  UR                  US   5      U R                  U'   MJ  [	        U[        5      (       a  UR                  [        5       5        Mz  [	        U[        5      (       d  M  UR                  UR                  5       5        M     UR                  UR                  5       5        [        U5      S:X  d   e[!        S U 5       5      ng )Nr  r   c              3  8   #    U  H  oR                   v   M     g 7frB   )r   )rd   ss     rN   rf   9PythonWrapperCodegen.memory_plan_reuse.<locals>.<genexpr>]  s      +
3Ga))3Gs   )r0   rC   get_output_namesrb  r[   r  rK   r}   r   r   ranger   r  r  r   r0  sum)r   	out_namesplanning_statespast_planning_statesr7  r|   _total_allocated_buffer_sizes          rN   memory_plan_reuse&PythonWrapperCodegen.memory_plan_reuse@  s   GG,,.	 JJ4::b>+=>>

2##((	9 JJNN JJ4::b>+=>>

2##((	9 /01!s4::'A::a=D$ 233 $		/"*= >

1D"344&&':'<=D"233$++O,?,?,AB ( 	##O$7$7$9:?#q(((
 (+ +
3G+
 (
$rP   c                ~    U(       a&  [         R                  (       a  U R                  5         g U R                  5         g rB   )r   r2  r  r  r  s     rN   r  *PythonWrapperCodegen.run_wrapper_ir_passesa  s%    F22""$rP   c           	       ^	 U R                   m	[        R                  U	4S j5       n[        R                  U	4S j5       n[        U[        R
                  5      (       aM  [        U[        R                  5      (       a  X#;   a  g T	R                  U SU 35        UR                  U5        g [        U[        R                  5      (       a  [        UR                  5       5       H^  u  pg[        U[        R                  5      (       d  M&  Xs;  d  M-  T	R                  U SU" U5       SU S35        UR                  U5        M`     [        UR                  5       5       H^  u  ph[        U[        R                  5      (       d  M&  X;  d  M-  T	R                  U SU" U5       SU S35        UR                  U5        M`     g [        U[        R                  5      (       a  g [        U[        R                  5      (       a  g [         R"                  R$                  R&                  (       a  g [)        S[+        U5       35      e)Nc                <   > TR                  U  SU  S35        U  S3$ )Nz_size = z.size()_sizer  r}   r  s    rN   sizeofDPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.sizeofp  s(    NNdV8D69:V5>!rP   c                <   > TR                  U  SU  S35        U  S3$ )Nz
_stride = z	.stride()_strider  r  s    rN   strideofFPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.strideofu  s)    NNdV:dV9=>V7##rP   r   r0  r1  zUnknown value type: )rX  r  rz  r[   r\   r   Symbolry   r   r   	TensorBoxr  r  r  r  r  r  	_inductorr   r  r  rk   )
r   r}   r   
bound_varsr  r  rP  r   r  r  s
            @rN   codegen_input_symbol_assignment4PythonWrapperCodegen.codegen_input_symbol_assignmenth  s    {{		" 
	" 
	$ 
	$ eUZZ((eU\\22e6INNeWCv./NN5!r||,,&u~~'78	dELL11d6LNNdV3vd|nAcU!#DENN4( 9  ))9)9);<fell338PNNfXS$0@#a#HINN6*  = r1122r0011%%55$';DK=%IJJrP   c           	     F   [         [        R                     " 5       nU R                  5       nUR	                  5        VVs/ s H)  u  p4[        U[        R                  5      (       d  M&  X44PM+     snnUR	                  5        VVs/ s H)  u  p4[        U[        R                  5      (       a  M&  X44PM+     snn-   nU H  u  pgU R                  XgU5        M         SS jnU H.  u  p[        U[        R                  5      (       d  M&  U" Xq5        M0     gs  snnf s  snnf )z$Assign all symbolic shapes to localsc                x   [         R                  " U R                  5       U R                  5       /5       H}  n[	        U[
        5      (       a  [	        U[        R                  5      (       a  M9  UR                   Vs/ s H  o3U;  d  M
  UPM     nn[        U5      S:  d  Mm  [        SU SU S35      e   g s  snf )Nr   zFor z, expected z to have been codegen-ed.)r   from_iterabler  r  r[   r   r\   r  free_symbolsr   r  )r   r  exprsymundefined_symbolss        rN   _verify_input_symbol_assignmentLPythonWrapperCodegen.codegen_inputs.<locals>._verify_input_symbol_assignment  s     ++U^^-=u?O?O?Q,RS!$--D%,,1O1O $(#4#4%#4C:8MC#4 " % ()A-(tfK0A/BB[\  T%s   <	B7	B7N)r   ir.TensorBoxr  OrderedSet[sympy.Symbol])	r   r\   r  r  r   r[   r  r   r  )
r   r  r  kr  ro  r}   r   r  _s
             rN   r  #PythonWrapperCodegen.codegen_inputs  s    -/
 ,,.+113
3tqz!U\\7RFQF3
 , 2 2 4X 4Jq%,,<WVaV 4XY "KD00jI "		0	& HAeR\\22+E> 3
Xs    %D)D%D/Dc                f   [        U[        R                  5      (       a  [        U[        R
                  5      (       ar  XR                  ;   a  g U R                  R                  U5        [        R                  R                  R                  U   nU R                  U S[        U5       35        g g g rj  )r[   r\   r  r   r   PRECOMPUTED_SIZEr  r   r0   rC   rI   inv_precomputed_replacementsry   pexpr)r   r  r  s      rN   ensure_size_computed)PythonWrapperCodegen.ensure_size_computed  s    c5<<((^CAVAV-W-W)))##C(77##@@EDNNcU#eDk]34 .X(rP   c                    g rB   rc   r   s    rN   r  $PythonWrapperCodegen.finalize_prefix  r  rP   rJ   c                   [        S5      e)Nz8codegen_cpp_sizevar is only implemented for cpp_wrapper!)r  r   r   rJ   s      rN   codegen_cpp_sizevar(PythonWrapperCodegen.codegen_cpp_sizevar  s    UVVrP   c                   [        XS9$ )Nr  )r  r  s      rN   codegen_python_sizevar+PythonWrapperCodegen.codegen_python_sizevar  s    Q**rP   c                $    U R                  U5      $ rB   )r  r  s     rN   codegen_sizevar$PythonWrapperCodegen.codegen_sizevar  s    **1--rP   c                    U SU S3$ )Nr0  r1  rc   )r   r5  r}   r  s       rN   r3  )PythonWrapperCodegen.codegen_tuple_access  s    1UG1%%rP   c                    / [        U R                  U5      Qn[        U5      S:X  a  g[        U5      S:X  a	  SUS    S3$ SSR                  U5       S3$ )Nr   ()r1   r  r  r  r   )rn  r  r   r   )r   r  partss      rN   rp   /PythonWrapperCodegen.codegen_python_shape_tuple  s^    :#d1159:u:?u:?uQxj$$499U#$A&&rP   c                $    U R                  U5      $ rB   )rp   )r   r  s     rN   r  (PythonWrapperCodegen.codegen_shape_tuple  s    ..u55rP   c                    SR                  SR                  U[        U5      [        U5      U R	                  U5      U R	                  U5      /5      5      $ )Nzalloc_from_pool({})r  )formatr   r  r   rp   )r   r}   offsetr   r  r  s         rN   codegen_alloc_from_pool,PythonWrapperCodegen.codegen_alloc_from_pool  sS    $++II&MJ33E:33F;

 
	
rP   c                   X!R                   R                  :X  al  X1R                   R                  :X  aS  XAR                   R                  :X  a:  Ub&  XaR                  :w  a  SUR                  5        SU S3$ UR                  5        $ U R                  U5      nU R                  U5      nU R                  U5      nUb/  XaR                  :w  a   SUR                  5        SU SU SU SU S3$ SUR                  5        SU SU SU S3	$ )Nzaten.view.dtype(r  r   z#aten.view.dtype(reinterpret_tensor(z), zreinterpret_tensor()r  r   r  r  r   rE   rp   r  )r   datar   r  r  ry   r   s          rN   codegen_reinterpret_view-PythonWrapperCodegen.codegen_reinterpret_view  s    KK$$$++,,,++,,, Ujj%8)$--/):"UG1EE--/*+2248D44V<F))&1F Ujj%8<T]]_<MRPTvUWX^W__abhaiilmrlsstuu *$--/):"TF"VHBvhVWXrP   c                8    U R                  U SU SU S35        g )Nz.copy_(r  r   r  )r   r   dstnon_blockings       rN   codegen_device_copy(PythonWrapperCodegen.codegen_device_copy  s!    #gcU"\N!<=rP   c                    UR                  5       nUR                  S   R                  5       nU R                  [        XX1R                  5      5        g r   )rE   ro  ry   r)  r-  )r   rK   r*  r+  s       rN   codegen_multi_output)PythonWrapperCodegen.codegen_multi_output
  s:    mmo;;q>**,t(LLQRrP   c                   S UR                    5       u  n[        UR                  5      S:X  a#  U R                  UR                   SU S35        GOw[        UR                  5      S:X  aE  [        UR                  S   [        5      (       a#  U R                  UR                   SU S35        GO[        UR                  5      S:X  a  [        UR                  S   [        5      (       a  U R                  UR                   SU S35        U R                  S	UR                   S
UR                  S   R                   SUR                   SUR                  S   R                   S3	5        U R                  UR                   SUR                   SUR                  S   R                   35        O[        SUR                   35      eU R                  UR                  5        S35        g )Nc              3  @   #    U  H  oR                  5       v   M     g 7frB   )rq  )rd   ts     rN   rf   >PythonWrapperCodegen.codegen_dynamic_scalar.<locals>.<genexpr>  s     >+Q&&((+s   r   r   .item()r1   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // unrecognized keypath z = None)ro  r   keypathry   r  r[   r   r   divisorr  rE   )r   rK   r  s      rN   codegen_dynamic_scalar+PythonWrapperCodegen.codegen_dynamic_scalar  s   >$++>t||!NNdhhZs4&89!#
4<<?M(R(RNNdhhZxv^DE!#
4<<?K(P(PNNdhhZ}TF'BCNN$((=a1H1H0I Jxxj >t||A?V?V>WWXZ NN88*CzQ8O8O7PQ !#8!GHH 	$--/*'23rP   c           
     r  ^ ^ UU 4S jnU4S jnU4S jnTR                  / SQ5        TR                  5          TR                  SSS9  [        R                  R
                  R                  5        HT  u  pVTR                  SU 35        U" XVR                  5       UR                  5       UR                  UR                  5        MV     [        [        R                  R                  5      S	:  a^  TR                  S
5        [        R                  R                  R                  5        H!  u  pWTR                  SU 35        U" XW5        M#     [        R                  R                  R                  5        GH9  u  pV[        U[         R"                  5      (       aI  [        [        R                  R$                  R&                  R)                  US 5      [*        5      (       a  Mn  [        U[,        R.                  5      (       ad  [        [        R                  R                  5      S	:X  a  TR                  S
5        TR                  SU 35        U" XVR1                  5       5        M  [        U[         R2                  5      (       a2  U" U[        R                  R$                  R5                  USS95        GMB  [        U[,        R6                  5      (       a$  U" USUR                  R8                   S35        GM  UR;                  5        Vs/ s H+  n[        R                  R$                  R5                  USS9PM-     n	nUR=                  5        Vs/ s H+  n[        R                  R$                  R5                  USS9PM-     n
nU" UU	U
UR?                  5       URA                  5       5        GM<     SSRC                  [        R                  R                  RE                  5       5       S3nTR                  SU 35        TR                  S5        S S S 5        g s  snf s  snf ! , (       d  f       g = f)Nc                   > TR                  U  STR                  U5       STR                  U5       SU SU S3
5        g )Nz = rand_strided(r  
, device='	', dtype=r   )ry   rp   )r}   r  r  ru  r   r~   r   s        rN   add_fake_inputFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_fake_input%  sT    &(2259:"226:; <!()E7!5rP   c                2   > TR                  U  SU 35        g rj  r  )r}   r   r~   s     rN   add_expr_inputFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_expr_input-  s    vS./rP   c                   > SS K n[        U[        R                  5      (       d   eTR	                  U  SUR                  U5      < S35        g )Nr   z = pickle.loads(r   )pickler[   r  ScriptObjectry   dumps)r}   r   r4  r~   s      rN   add_torchbind_inputKPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_torchbind_input0  sE    eU%7%78888v%5fll56I5LANOrP   )rM  rM  z3def benchmark_compiled_module(times=10, repeat=10):z
                from torch._dynamo.testing import rand_strided
                from torch._inductor.utils import print_performance
                Tr   zglobal r   zimport pickle*   fallbackztorch.cuda.default_generators[z].graphsafe_get_state()zcall([r  z])zfn = lambda: z8return print_performance(fn, times=times, repeat=repeat))#
writelinesr   r   r0   rC   	constantsr   ry   r   r  ru  r   r   torchbind_constantsr  r[   r\   r  rI   
var_to_valr   r   r   r  get_real_objr   	size_hintr  r  r  r  rl  rH   r   keys)r   r~   r.  r1  r7  r}   r   torchbind_objr   r  r  call_strs   ``          rN   benchmark_compiled_module.PythonWrapperCodegen.benchmark_compiled_module$  sa   		0	P 	K	
 ]]_MM     !ww00668   74&!12**,ekk	  9 177../!3  1+,77+F+F+L+L+N'D $$wtf%56'<	 ,O  !ww3399;eU\\22zGG$$//33E4@,8 8 eR%7%7881776671<((9$$wtf%56'.@.@.BCuzz22
 #4)9)9)C)CETV)C)WXr'8'899"89K9K8LLcd "'!1!1A ((221r2B!1   "'!1!1!3!3A ((221r2B!3   #((*)E  <T  		!''*>*>*C*C*E FGrJH}XJ78WXK _fo _s+   KP(2P P(2P#BP(
P((
P6c                
   [         R                  (       d  gU R                  U5        UR                  / SQ5        UR	                  5          UR                  SS[        5        S3/5        SSS5        g! , (       d  f       g= f)z<
Append a benchmark harness to generated code for debugging
N)rM  rM  zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzcompiled_module_main('z', benchmark_compiled_module))r   benchmark_harnessrE  r<  r   r'   r   r~   s     rN   r  *PythonWrapperCodegen.add_benchmark_harness  sh     ''&&v.@A]]_X,-?-A,BB_` __s    A44
Bc                >    U R                  [        U UUUUUS95        g r  )ry   r  )r   rs  r  r  r  r  s         rN   define_kernel"PythonWrapperCodegen.define_kernel  s*     	 !-		
rP   c                6    U(       a  U S3OSnSU U  SU 3nU$ )Nr@  rM  z

r   rc   )rs  r  r  metadata_commentbodys        rN   _format_kernel_definition.PythonWrapperCodegen._format_kernel_definition  s1     /7hZr?B&'}C}ErP   c                .   [         R                  R                  (       aK  U R                  XS S9nU R                  R                  U5        [        R                  R                  (       a  g U R                  XUS9nU R                  R                  U5        g )N)r  )
r   rq   rr   rQ  r\  r   r0   rC   rC  rW  )r   rs  r  r  r  r  rP  s          rN   r  *PythonWrapperCodegen._define_kernel_helper  s     ==11114 2 D %%,,T2ww""--x . 
 	4 rP   c                :    U R                   R                  U5        g rB   )r]  r   )r   fn_codes     rN   define_subgraph_launcher_fn0PythonWrapperCodegen.define_subgraph_launcher_fn  s    !!((1rP   c                  ^^,^-^.^/^0^1 SSK JnJnJn	  SSKJm,Jn
JnJnJ	n  SSK
JnJn  UR                  n/ m10 m// m./ nU.U14S jm-S-U,U-U/U4S jjn[        UR                  5       GH  u  nnUUR                   ;   a  U" UT," US9S	S
9  M%  UT;  a  M-  TU   nTU   c  U" UT," US9S	S9  MF  [#        U[$        R&                  5      (       a^  [#        U[$        R(                  5      (       a'  SUR*                  UR,                  R/                  5       4OSu  nnnU" UU" UUUUS95        M  [#        U[$        R0                  5      (       a.  U" UU" UUR3                  5       UR/                  5       S95        GM  [#        U[$        R4                  5      (       aM  U" UU" UUR6                  R3                  5       UR/                  5       UR8                  R:                  S95        GM|  [#        U[<        [>        R@                  45      =(       a*    [B        RD                  RF                  RI                  US5      nU" UU" UU5      US9  GM     [K        T1S T.UR                   Vs/ s H  n[M        U5      PM     snS9nU[N        RP                  " [B        RD                  RS                  5       5      0 T/E[T        RW                  US5      E[Y        T1T.S9/S.nU(       a  [[        U5      US'   U(       a  [[        U5      US'   []        U5      S:X  a0  UR_                  5       n/ [a        [>        Rb                  US   5      QnOS.U04S jjn0 m0U V s/ s H  n / [a        UU 5      QPM     nn U(       a  []        U5      []        U5      :X  d   e/ n![e        [g        Xb5      S S	S9 HA  u  n n"U!Ri                  U" U"5      / [a        [j        U 5      Q/ [a        [l        U 5      QS.5        MC     U	R                  U!/ [a        [n        T0Rq                  5       5      QS.n/ T0Rs                  5       Qn[u        URv                  5      /n#[]        U5      S:  aY  TRq                  5        HE  n[#        U[$        R0                  [$        R4                  45      (       a  M4  U#Ri                  U5        MG     U#Ri                  [o        U5      5        U#Ry                  [o        U5      5        [[        U#5      n#U#U Rz                  ;   a  / U Rz                  U#   QUP7$ U S[]        U Rz                  5       3n$[}        5       n%[~        R                  R                  (       a  U%R                  SU$< S35        OU%R                  SU< S35        U$US '   UR                  UR                  5       5        U%R                  U" 5       5        U%R                  S!/ [a        Xr5      Q< S"U< S#U< S$35        [        U5      n&[~        R                  R                  (       a  U&R                  S%U S&3S%U$ S&35      n&U&R                  S'S(5      n&U%R                  U&5        [B        RD                  RS                  5       n'U%R                  S)U'R                   S*35        [        R                  " URv                  5      u  n(n)[        R                  " URv                  5      n*S+U* S,U) 3n+U R                  U$U%R                  5       U+5        U$U4U Rz                  U#'   U$UU4$ s  snf s  sn f )/Nr   )config_to_dict	FixedGridPrecomputedGridr1   )ConstexprArgKernelArgTypeSizeArg	TensorArgTMADescriptorArg)gen_common_triton_importsTritonKernelc                J   > TR                  U5        TR                  U 5        g rB   )r   )r  rD  arg_indices	signatures     rN   add_to_signaturePPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_to_signature  s    S!s#rP   c                  > U(       aE  [        5       (       a  T" X5        UR                  T;   a  TUR                     TUR                  '   g g UR                  T;   d   eU(       a?  [        5       (       a  T" U T" UR                  S95        OT" X5        STUR                  '   g U(       a6  [        5       (       a  T" U T" UR                  S95        S TUR                  '   g T" X5        g )Nr}   r1   )r/   r}   )	r  rD  is_constexprequals_1equals_noner]  rg  r=  r   s	        rN   add_argGPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_arg  s    133 %S.88v% +1*:Ichh' & xx6)))577
 )l.IJ(2*+Ichh' 577 )l.IJ*.Ichh'$S.rP   rj  T)rk  )rm  stable)experimentalNN)r}   api_typera  r   )r}   bufferr   )r}   rs  r   r  )rl  )
size_dtyper-  argdefs)r-  )rf  ru  r=  r   restore_valuereset_to_zeror   c                t  > [        U [        R                  5      (       al  / U R                  QnU(       d  U $ UR	                  [
        S9  U H0  nUT;   a  M  [        R                  " S[        T5       35      TU'   M2     [        U T5      $ [        U [        5      (       d   e[        R                  " U 5      $ )N)r   _launcher_s)r[   r\   r   r  sortr   r  r   r.   r   r]   )r  symbolsr  extra_launcher_argss      rN   rename_sizes_for_launcherYPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.rename_sizes_for_launcherp  s    dEJJ//2 1 12G"#LLSL)&"55$38<<)#.A*B)CD4+C0  ' &d,?@@!$,,,,}}T**rP   c                2    [        U S   R                  5      $ r   r   r   s    rN   r   HPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<lambda>  s    3qt{{3CrP   r   )r   pythoncpp)	grid_typeprecomputed_gridsr|  r  zasync_compile.triton(z, '''rs  zG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            r   r  z'''z\'\'\'z''', device_str='r  z# Original path: rB  )FFF)r  r   r   r   )Mruntime.triton_heuristicsrZ  r[  r\  commonr]  r^  r_  r`  ra  rq   rb  rc  r   r  	arg_names
constexprsr[   r   TMADescriptorrf  ra  rV  rH   r  rE   r"   r  r  r  r   r\   r]   r0   rC   rI   statically_known_equalsr;   r2   r$   r  get_current_device_or_throwr4  fromkeysr9   ro   r   setup_grid_as_argsrn  sympifyr   r   r   r  r8   r   rv  rB  idr   extendri  r(   r   unique_user_kernel_namesry   updateinductor_meta_commonr   r   replacerk   inspectgetsourcelinesgetsourcefilerL  r   )2r   r   r   r   restore_value_argsreset_to_zero_argsr   rZ  r[  r\  r^  r_  r`  ra  rb  rc  original_nameequal_to_1_argsrn  r  r   rD  rr  ra  r   rl  r   triton_signaturer  inductor_metaextra_launcher_call_argsr}  rs   r  cfg	cache_keyr}   r   
kernel_srccurrent_devicer  linenosrcfiler  r]  rg  re  r=  r|  rf  s2      `                                        @@@@@@rN   !define_user_defined_triton_kernel6PythonWrapperCodegen.define_user_defined_triton_kernel  s   	
 	

	
 	
 	D)+	$&	!#%'	$"	/ "	/H "&"2"23HCf'''\s3$G& +Cc{"\s3Fc2#3#344 &c2+A+ABB "3??CJJ4H4H4JK9 1Hk5
 (!$%-(3"'	  RYY//!!$#&<<>"%--/  R%7%788 !!$#&88#4#4#6"%--/#&::#4#4	  *c5==1   ''**BB  Cc!2XFs 4v -)/)9)9:)9AWQZ)9:	
 *&--agg.Q.Q.ST--3
 ''
, +01C+DK(+01C+DK(u:?,5,H,H,JM'FU]]E!H)E'F$+  EGINO<s4d;<EOSZ3w<777 "#E#)CT	c "(("0"5"5Ct$4"52UD!12 -55%6'PS2E2L2L2N)O'PM
 (E)<)A)A)C'D$ VYY-	w<!}}!#		23E3E'FGG$$S) ' 	[)*]+,)$	666//	:( 
  #d&D&D"E!FG(*==11%%(=dXU&KL%%(=m=Ne&TU'+m$\>>@A8:;83~78; <,/ 0(O ,			
 OvV
==11#++d=/,CtD6QR^TJ''{;
z*<<>!!$5n6I6I5J""MN**6995	6''		2&wiq9$$&	
 6:;4G&&y1[":::K ;j Ps   [4[9c                    U SUR                    S3nUb  USU 3-  n[        XBR                  5      nU R                  [	        X[
        R                  5      5        U$ )Nr  r  )rX  r   r  ry   rC  r0   rC   )r   rs  treerY  r  rD  s         rN   generate_numel_expr(PythonWrapperCodegen.generate_numel_expr  s[    a}E2axL D dJJ/*4agg>?
rP   c                j    U R                  UR                   S[        UR                  5       35        g rj  )ry   r   r  r   )r   rD  rC   s      rN   rF  7PythonWrapperCodegen._generate_symbolic_call_arg_helper  s)     	#))Ccnn(='>?@rP   c                   UR                  5       n[        X5      nUR                  [        R                  :X  a  U R                  U5        GOBUR                  [        R                  :X  a2  U R                  U5        U R                  U R                  U5      5        OUR                  [        R                  :X  a  U R                  R                  U5      nU(       a]  [        U[        5      (       a  [        UR                  [        5      (       d   e[        R                  " UR                  U5      Ul        OUU R                  U5        U R                  U R                  U5      5        X0R                  U'   O[        UR                  5      e[         R"                  R$                  (       a  U R&                  R                  [(        R+                  U UUR,                  UR.                  [0        R2                  R4                  R7                  UR8                  5      4SS95        UR                  [        R                  :w  a/  U R&                  R                  [(        R                  X5      5        g g g )N)r1   )r  r  )rE   r  	zero_moder7   UNINITIALIZEDry   ZERO_ON_CALLmake_zero_bufferZERO_PER_GRAPHr  r   r[   rK   r6   maximumr  r   rq   rr   r{   r  make_allocationru  r   r0   rC   rI   rA  r   )r   wsr}   r|   priors        rN   generate_workspace_allocation2PythonWrapperCodegen.generate_workspace_allocation  s   {{}D%<<,:::NN4 \\.;;;NN4 NN40067\\.===--11$7E!%66:JJ< <   *11%**bA
t$t44T:;26))$/ ..==11&&00$44IIHH77++55bhh?A 5 	 ||0>>>**44(99$E ? 2rP   c                v    UR                   [        R                  :w  a  U R                  [	        X5      5        g g rB   )r  r7   r  ry   r  )r   r  s     rN   generate_workspace_deallocation4PythonWrapperCodegen.generate_workspace_deallocation	  s,    <<,;;;NN.t89 <rP   c                $    U SU R                    3$ )Nz.zero_())r<  )r   r}   s     rN   r  %PythonWrapperCodegen.make_zero_buffer	  s    x}--rP   c                H    U SSR                  U5       SU R                   3$ )Nr  r  r   )r   r<  )r   r}   r  s      rN   ru  %PythonWrapperCodegen.wrap_kernel_call	  s'    q9-.a}==rP   c                    U R                   R                  S5        U R                   R                  S[        R                  R                   S35        UR                  U R                   R                  5       5        g )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)r[  ry   r0   rC   graph_idr  r   )r   r  s     rN   r  8PythonWrapperCodegen.generate_profiler_mark_wrapper_call	  sb    ##$PQ##*177+;+;*<<UV	
 	D--4467rP   c                :    U R                   R                  S5        g )Nzstart_graph())r[  ry   r   s    rN   r  )PythonWrapperCodegen.generate_start_graph!	  s    ##O4rP   c                `    U R                   R                  S[        R                  < S35        g )Nz
end_graph(r   )r[  ry   r   profile_bandwidth_outputr   s    rN   r  'PythonWrapperCodegen.generate_end_graph$	  s'    ##j1P1P0SST$UVrP   c                ^    U R                   R                  S[        R                   S35        g )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            r[  r   r#   r   r   s    rN   r  6PythonWrapperCodegen.generate_reset_kernel_saved_flags'	  s2      ''8'A'A&B C	
rP   c                ^    U R                   R                  S[        R                   S35        g)a  
Precompile and save the CUBINs of the Triton kernels that haven't
been precompiled and saved as a side effect of running the generated
JIT model (Python wrapper). This can happen when the model contains
control flow: only one pass through the control flow operators covers
the kernels that are saved, the remaining kernels are not launched,
hence not saved. The main purpose of this codegen is to compile and
save the Triton kernels outside the active control flow path for
subsequent AOTInductor code generation and compilation.
r  a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_gpu_kernel(
                            grid=(0, 0, 0),   # use dummy grid
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            Nr  r   s    rN   r  5PythonWrapperCodegen.generate_save_uncompiled_kernels0	  s4     	  ''8'A'A&B 	C	
rP   c                B    S nU Vs/ s H
  o2" U5      PM     sn$ s  snf )Nc                   [        U [        5      (       a  [        U 5      (       a  U S-   $ U $ [        U [        [        [
        [        45      (       a  [        U 5      $ [        [        R                  R                  R                  U 5      5      $ )Nr#  )r[   r   r:   r   floatr   r   r  r0   rC   rI   rJ   )rD  s    rN   wrap_argAPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.wrap_argK	  sg    #s##*B3*G*GsYPSPC#udO!DEE3xQWW--66s;<<rP   rc   )r   r  r  rD  s       rN   prepare_triton_kernel_call/PythonWrapperCodegen.prepare_triton_kernel_callJ	  s%    	= *33#333s   c                  ^  [        U[        5      (       Ga!  [        U[        R                  5      (       a.  UR	                  5       R                  5       nT R                  U   nObT R                  R                  U5      (       a  UnT R                  U   nO0Uc   S5       eST R                   3nUnT =R                  S-  sl        Uc
   SU 35       e[        S UR                  5        5       5      n[        S [        R                  R                  U5       5       5      n[        S UR                  5        5       5      nUR                  5       n	UR!                  5       n
[        R                  R"                  R%                  UR'                  5       R(                  [*        R,                  S9nS	U S
U SU	 SU
 S
U S
U S3nT R.                  R1                  U SU 35        [        U[        R                  5      (       a2  T R3                  USS9nUnT R.                  R1                  U SU 35        U$ [5        U[6        R8                  5      (       d  [        U[:        5      (       a  [        U[<        5      (       a  UT R>                  ;   a  U$ Uc  gUn[        U[:        5      (       a  UR@                  nU[        R                  R"                  RB                  ;   a'  [        R                  R"                  RB                  U   n[=        [        R                  R"                  RE                  U[*        R,                  S95      $ [        U[<        [F        [H        [J        45      (       a  [=        U5      $ [        U[L        5      (       a  SS
RO                  U 4S jU 5       5       S3$ [Q        S[S        U5       35      e)NzBV.graph.get_buffer(arg) and raw_arg can't be None at the same timetmp_arg_r1   z Failed to find a buffer for arg c              3     #    U  H;  n[         R                  R                  R                  U[        R
                  S 9v   M=     g7fr:  Nr0   rC   rI   rG  r   unbacked_symint_fallbackrd   r  s     rN   rf   BPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>h	  sA      
 (A	   ;;#<< <  (   AAc              3     #    U  H;  n[         R                  R                  R                  U[        R
                  S 9v   M=     g7fr  r  r  s     rN   rf   r  o	  sA      $
 :A	   ;;#<< <  :r  c              3     #    U  H;  n[         R                  R                  R                  U[        R
                  S 9v   M=     g7fr  r  r  s     rN   rf   r  v	  sA      
 *A	   ;;#<< <  *r  r:  zgenerate_example_value(r  z, 'z', r   r   T)rY  rZ  r  r0  c              3  Z   >#    U  H   nTR                  U[        U5      5      v   M"     g 7frB   ri   )rd   ar   s     rN   rf   r  	  s(      ZVYQR!@!@DG!L!LVYrm   r1  zUnsupported type )*r[   torch_dtyper   r  
get_tensorrE   rV  r   r_  ro   r  r0   rC   get_allocation_sizer  rl  rH   rI   rA  
get_layoutr  r   r  r{   ry   rg  r2  r\   Basicr   r   r|  r   r  rG  r   r  r   r   r   r
  rk   )r   rD  arg_typeraw_argr|  r  r   allocation_sizer  ru  r   r  r   s   `            rN   rj   /PythonWrapperCodegen.generate_example_arg_valueV	  sd   h,,'2#3#344"--/88:**3/%%))#..**3/* X* &d&F&F%GH00A50?L&Fse$LL? 
  D $ $
 44S9$ O  
 ) F ^^%FMMOEWW%%// ''88 0 F .dV2fXSE7RTU[T\\^_n^oopqE&&00H:S1HI'2#3#344 :: %) ;  **44zUG5LMO%++..*S/2R2R#s##$//)J?!#//nnagg&&CCCgg&&CCCH  ;;&"A"A <   c3t455s8OT""tyy ZVY ZZ[[\]]%(9$s)&EFFrP   c                   ^  [        U[        5      (       a!  SSR                  U 4S jU 5       5      -   S-   $ [        U5      $ )Nr0  r  c              3  F   >#    U  H  nTR                  U5      v   M     g 7frB   )_grid_dim_str)rd   r^   r   s     rN   rf   5PythonWrapperCodegen._grid_dim_str.<locals>.<genexpr>	  s     R\T 2 24 8 8\s   !r1  )r[   r   r   r  )r   grid_per_dims   ` rN   r  "PythonWrapperCodegen._grid_dim_str	  s?    lD))diiR\RRRUXX &&rP   )ru  rq   r  r  r  r  r   c                  U R                   R                  U V
s0 s H:  n
[        U
[        5      (       d  M  U
[        R
                  R                  U
5      _M<     sn
5        U=(       d    [        R
                  R                  5       nU R                  [        U UUUUUUUU[        R
                  R                  U	S95        gs  sn
f )z
Generates kernel call code.

triton: Defines whether the backend uses Triton for codegen. Otherwise it uses the CUDA language when gpu=True,
        and C++ when gpu=False.
)
rs  r  r  r  r  rq   r  ru  r  r   N)rV  r  r[   r   r0   rC   try_get_bufferr  ry   r  r}   )r   rs  r  ru  rq   r  r  r  r  r   rD  s              rN   generate_kernel_call)PythonWrapperCodegen.generate_kernel_call	  s    , 	## %$Cc3' 1QWW++C00$	
 @177>>@'#!!#'77<<%9	
s
   C$CrM  )ru  rq   r  r  r  r  r  r   c          
     2	  ^ ^^ U=(       d    [         R                  R                  5       nU(       d2  UR                  S:w  d"  T R	                  T R                  TU5      5        g T R                  U5      nSR                  U5      n[        R                  T UR                  U	5      nU(       d$  SU S3nT R	                  T ST SU SU S35        g T R                  5         [        R                  R                  (       Ga  TT R                  ;  Ga  Ub  [!        U5      [!        U5      :X  d   S5       eS mU
(       aI  [         R                  R"                  (       a*  [         R                  R"                  R%                  U
S 5      mSUU 4S jjnUU 4S	 jn/ nUc)  Ub   S
5       eS /[!        U5      -  nS /[!        U5      -  nO[!        U5      [!        U5      :X  d   S5       e0 n['        [)        X%Xg5      5       GH  u  nu  nnnnS n[+        U[,        5      (       a#  S[-        U5      ;   a  UR/                  S5      u  nnS nT(       a  UT;   a  T R1                  TU   5      nU(       aQ  Un[+        U[2        5      (       d9  [5        U[6        R8                  5      (       d  [+        U[:        5      (       a  UUU'   OUS:X  a  U" XgUU5      (       a  UU   nO[+        U[2        5      (       ag  [<        R>                  " SU5      (       a  UnO6UT R@                  ;  a  T RC                  UUU5      nOT R@                  U   S   nUT4T R@                  U'   OT RC                  UUU5      nURE                  Uc  UOU SU 35        GM     T RF                  R	                  T SSR                  U5       SU S35        T RF                  R	                  [I        SUS5      5        T R                  RK                  T5        [         R                  RL                  (       a  g [         R                  RN                  RP                  nURS                  UTUS 5        U   T R	                  T SU SU S35        S S S 5        T R                  5         g ! , (       d  f       N= f)Nr  r  z	c_void_p(r   r   r  z$call_args and arg_types do not matchc                    > TR                   R                  5        V Vs/ s H  u  pUT:X  d  M  U PM     nn nU(       a  SSR                  U5       S3$ gs  snn f )zAfter all the autotune kernel calls have been written (i.e.
self.kernel_autotune_example_args is complete), returns a deletion call
for all autotune example tensors that are unnecessary after kernel_name
is called.del r  r@  rM  )r^  rv  r   )rV  kntensors_to_deleters  r   s      rN   get_autotune_deletion_callUPythonWrapperCodegen._generate_kernel_call_helper.<locals>.get_autotune_deletion_call
  se     '+&G&G&N&N&P%&P
[( &P " %
 %!$)),=">!?rBB%s
   AAc                  > X   nXC;   a  g[        [        X5      5       H  u  nu  pgXR:X  d  [        U[        5      (       d  M$  SnT(       a  UT;   a  TR	                  TU   5      nUS:X  a  MO   UR                  5       n	[        U	R                  5       H  u  pX:X  d  M  U SU
 S3X4'       g   M     g! [         a     M  f = f)zWe try to infer raw_arg (i.e. raw_args[idx]) from remaining raw_args.
This is particularly useful for jagged cases, where the dimension is often
being passed in as an input.TrM  z.shape[r1  F)r  r   r[   r!   r  r  r   r
  )r  r  r  reused_args
target_argr7  raw_keyr  triton_inputr  rP  r  autotune_argsr   s               rN   infer_arg_by_inputsNPythonWrapperCodegen._generate_kernel_call_helper.<locals>.infer_arg_by_inputs 
  s    
 &]
,-6s87N-O)A)xz'6'B'B #%L$M)A'+'E'E)'2( $r) 	!!(!3!3!5&/&<FC =IN'RUQVVW:X 7'+ '= .P, 	 / ! !!s   0-B5!B5.B55
CCzkeys are not None but args arez#call_args and raw_args do not matchr  rM  z^(workspace|semaphore)r   z.run(z	, stream=z
<del_call>r  )*r0   rC   r  rk   ry   ru  r  r   r  ry  r  r  r   rq   rr   rz   r   autotuning_mappingr   r  r   r[   r   splitr  r  r2  r\   r  r   rematchr^  rj   r   r{   r&   r   rC  r>  r  r?  )r   rs  r  ru  rq   r  r  r  r  r  r   call_args_strstream_name
stream_ptrr  r  all_argsr  r7  rD  r  r  r  r   r   arg_strrB  r  s   ``                         @rN   r  1PythonWrapperCodegen._generate_kernel_call_helper	  s4    @177>>@&++.NN400iHI77	B		-0*??&,,

 $[M3JNN-qQ}oR
|1M %%' MM2224#=#== (S^s9~-M 6M !M#(B(B ! : : > >($! B H'I)II' 6C	N2 6C	N28}I6 96 K8AI(=944C7G c3''C3s8O"yy~HC.2 W%=#'#A#A%g.$L  *G%h<<"8U[[99%c?;;/6G,]':;( (
 *'2G+66 xx 93??"%D$E$EE"&"A"A7# #'"C"CC"H"K>E{=SD55c:"==c8WUG3;se1WI<NOS9V &&00-uTYYx%8$9;-qQ &&00 /I<X &&**;7ww"" !" 4 4 B B..y+yRVW"NNk]%i}TUVW #%%' #"s   R
Rc                :    U R                   R                  U5        g rB   )rb  r   r   r|   s     rN   ry   PythonWrapperCodegen.writeline
  s    

$rP   c                8    U H  nU R                  U5        M     g rB   r  )r   rb  r|   s      rN   r<  PythonWrapperCodegen.writelines
  s    DNN4  rP   c                L    U R                   R                  [        U5      5        g rB   )rb  r   r*   )r   ctxs     rN   r  "PythonWrapperCodegen.enter_context
  s    

+c*+rP   c                  ^ ^ SSK Jn  U" 5       (       a  SS Kn[        U[        5      (       a  [        UR                  R                  5      $ [        U[        R                  5      (       a  [        U5      $ [        U[        [        45      (       aB  [        R                   " S S5      5       m[        [        U5      " UU 4S jU 5       5      5      $ [        U[         R"                  R$                  5      (       a  ['        U5      $ [        U[(        R*                  [(        R,                  [.        45      (       a  UR1                  5       $ U" 5       (       a0  [        UWR2                  R4                  5      (       a  [        U5      $ [        U[(        R6                  5      (       a  UR1                  5       $ [        U5      $ )Nr   )has_triton_packagec                  &    \ rS rSr% S\S'   S rSrg)1PythonWrapperCodegen.val_to_arg_str.<locals>.Shimi
  r   refc                    U R                   $ rB   )r  r   s    rN   __repr__:PythonWrapperCodegen.val_to_arg_str.<locals>.Shim.__repr__
  s    88OrP   rc   N)r   r   r   r   r   r  r   rc   rP   rN   Shimr  
  s    $rP   r  c              3  \   >#    U  H!  nT" [         R                  TU5      5      v   M#     g 7frB   rM  )rd   r  r  r   s     rN   rf   6PythonWrapperCodegen.val_to_arg_str.<locals>.<genexpr>
  s)     VTUq1@@qIJJTUs   ),)torch.utils._tritonr  rq   r[   r   r  rK   r  r\   r   ro   r   r  	dataclassr  rk   r  _ops
OpOverloadr   r   r  
MutableBoxr"   rq  languager   r  )r   r  type_r  rq   r  s   `    @rN   rN  #PythonWrapperCodegen.val_to_arg_str
  s>   :a""%%5::&&8OE4=))""$ $ #$ QVTUVV  5::0011&q))BIIr}}oFGG&&((!!jFOO4I4I&J&J7N2,,--&&((7NrP   c                :   UR                  5       nUR                  5       n[        UR                  5       5      n[        [        R
                  R                  U5      5      n[        UR                  5       5      nU R                  UR                  5       X#XFU5      $ rB   )
rl  rH   ro   r  r0   rC   r  r  r  rE   )r   rs  ru  r   r  allocation_shaper  s          rN   r  +PythonWrapperCodegen.make_buffer_allocation
  s    ""$  "foo'( !<!<V!DEv((*+##OOve=M
 	
rP   c           
     &   Uc  UnU R                  U5      nU R                  U5      nU R                  U5      n	UR                  S;   a  U SUR                   SU SU	 SU S3
n
OU SU SU	 SUR                   SU S3
n
Xx:w  a  U
S	U SU	 S3-   n
U
$ )
N)r  cudaxpuz = empty_strided_r  r  r   z = empty_strided(r,  r-  z.as_strided()rp   rk   )r   r}   ru  r   r  r  r*  r  codegen_allocation_shape_tuplecodegen_stride_tupler@  s              rN   r  $PythonWrapperCodegen.make_allocation
  s     #$"==eD)-)H)H*
&  $>>vF;;00 &)&++a12"'('  &)12"'( )!;;-yq:  @,':&;2>R=SSTUUC
rP   c                8    U R                  [        U5      5        g rB   )ry   r(  r  s     rN   make_comment!PythonWrapperCodegen.make_comment
  s    {4()rP   c           	     `    U R                    U SU U R                   SU R                   SU 3	$ )Nr      )r;  r<  rd  )r   new_nameold_namerd  s       rN   make_tensor_alias&PythonWrapperCodegen.make_tensor_alias
  s6    ,,zXJt{{m2dll^STU\T]^^rP   c                (    SUR                  5        3$ )Nr  )rE   )r   rs  s     rN   r  %PythonWrapperCodegen.make_buffer_free
  s    foo'())rP   c                8    SSR                  S U 5       5       3$ )Nr  r  c              3  $   #    U  H  ov   M     g 7frB   rc   )rd   r}   s     rN   rf   :PythonWrapperCodegen.make_free_by_names.<locals>.<genexpr>
  s     >s   )r   )r   names_to_dels     rN   make_free_by_names'PythonWrapperCodegen.make_free_by_names
  s    dii>>>?@@rP   c           	     `    U R                    U SU U U R                   SU R                   S3	$ )Nr   r6   reuse)rc  r<  rd  )r   r9  r8  del_lines       rN   codegen_exact_buffer_reuse/PythonWrapperCodegen.codegen_exact_buffer_reuse
  s@    ../zXJxjQUQ\Q\P]]_`d`l`l_mmsttrP   c                   UR                  5       UR                  5       :X  d   eUR                  5       nUR                  5       nSnU[        R                  R	                  5       ;  a  U(       a  SU R                  U5       3nUR                  5       UR                  5       :X  a4  UR                  5       UR                  5       :X  a  U R                  XEU5      $ U R                  XR                  5       UR                  5       SU R                  R                  5      nU R                   U SU U SU R                   S3$ )N;z; r   r   r6  rE  )rH   rE   r0   rC   r  r  r  r  rG  r  r[  ry   r;  rd  )r   r  r  r  r9  r8  rF  reinterpret_views           rN   r  &PythonWrapperCodegen.make_buffer_reuse
  s   }}#--/111<<><<>1773355*D11#678H<<>S\\^+0@CNNDT0T228xPP88!11d6G6G6Q6Q
 ,,z-=,>xj4<<.X^__rP   c                    U R                  [        UU R                   U SUR                  5        U R                   SU R
                   S35      5        g )Nr   r6  z alias)ry   r4   r;  rq  r<  rd  )r   r}   r  s      rN   r  0PythonWrapperCodegen.codegen_deferred_allocation   sS    <<.c$*@*@*B)CDKK=PRSWS_S_R``fg	
rP   c                B   UR                  5       nU[        R                  R                  ;   d>  X R                  ;   d/  [        U[        R                  [        R                  45      (       a  g U R                  R                  U5        [        UR                  5       [        R                  [        R                  45      (       a  UR                  5       (       d  g UR                  5       n[        U[        R                  5      (       a  g [        U[        R                   5      (       a  g [        U[        R"                  5      (       Ga   [        UR$                  [        R&                  5      (       d*   S[)        UR$                  5       SUR$                   35       eUR$                  R*                  n[        U[        R,                  5      (       d   [)        U5      5       eUR*                  n[        U[        R.                  5      (       d   [)        U5      5       eU R1                  U5        U R3                  [5        XX5      5        g [        U[        R6                  5      (       a  U R3                  [9        X5      5        g U R3                  [;        X5      5        g )Nzunexpected r   )rE   r0   rC   r  ru  r[   r   DonatedBufferSubgraphBufferr   get_defining_opExternKernelAllocMultiOutputshould_allocater  MutationLayoutSHOULDREMOVEr0  r  r  r"   rk   r  
StorageBoxr  codegen_allocationry   r  r  r  r  )r   rs  r}   r  boxinput_buffers         rN   rX  'PythonWrapperCodegen.codegen_allocation  s     AGG+++~~%&2#3#3R5F5F"GHH4 &&(%%r~~6  **,,'')fb;;<<fbmm,,fb0011fkk2+=+=>> d6;;/06;;-@> ++""Cc2==11<49<188LlBII66AS	A6##L1NN?4vNOfb1122NN1$?@|D12rP   c                   UR                  5       n[        U[        R                  [        R                  45      (       a  U R                  [        X5      5        g [        UR                  5       [        R                  5      (       a  U R                  [        X5      5        g U R                  U5      (       d  g U R                  R                  U5        U R                  [        X5      5        g rB   )rE   r[   r   InputBufferr  ry   r|  r  r  r"  	can_reuserv  r   r  )r   rs  r}   s      rN   codegen_free!PythonWrapperCodegen.codegen_free2  s      fr~~r/A/ABCCNN8D12f,,.0C0CDD NN-d;<~~f%%

t*489rP   c                2   UR                  5       nU[        R                  R                  ;   =(       d    U[        R                  R                  ;   =(       a:    [        [        R                  R                  U   [        R                  5      (       + =(       dz    U[        R                  R                  ;   =(       dV    U[        R                  R                  ;   =(       d2    U[        R                  R                  ;   =(       d    X0R                  ;   (       + $ rB   )rE   r0   rC   r  r  r[   graph_inputs_originalr   rP  r=  r>  never_reuse_buffersrv  )r   rZ  output_bufferr}   s       rN   r^  PythonWrapperCodegen.can_reuseF  s    $$&AGG+++ 
",,, "GG11$79I9I 
" qww(((
" qww222
" qww222
" zz!
 	
rP   c                    UR                  5       U R                  ;   =(       a.    U R                  UR                  5          UR                  5       :H  $ rB   )rE   rw  )r   rs  reused_buffers      rN   	did_reusePythonWrapperCodegen.did_reuseV  sC     OO, KFOO-.-2H2H2JJ	
rP   c                z   [        X5      (       d   eU R                  U5        U R                  R                  UR	                  5       5        U R
                  R                  UR	                  5       5        UR	                  5       U R                  UR	                  5       '   U R                  [        XU5      5        g rB   )	rX   rX  rv  r   rE   ru  rw  ry   r  )r   rZ  rd  s      rN   codegen_inplace_reuse*PythonWrapperCodegen.codegen_inplace_reuse^  s    $\AAAA-

|,,./=11340<0E0E0GM**,-y]CDrP   c                    [        U5      nX R                  ;   a  U$ U R                  R                  U5        U R                  U-   $ rB   )r   rj  r   r;  )r   r   r}   s      rN   codegen_unbacked_symbol_decl1PythonWrapperCodegen.codegen_unbacked_symbol_declf  sA    6{---K &&**40<<$&&rP   c                :  ^^^^ [        [        R                  R                  R                  U5      nU(       d  g UR                  5        HM  u  nmSU4S jjmUUUU4S jnU R                  U R                  U5       SU" 5        U R                   35        MO     g )Nc                  > US:X  a  U $ [        U5      S:  ai  [        US   [        5      (       aQ  [        US   [        R                  5      (       a/  T" U  SUS   R
                   SUS   R                   S3USS  5      $ [        US   [        5      (       a  T" U  SUS   R
                   S3USS  5      $ [        US   [        R                  5      (       a^  [        R                  R                  (       a   T" S	US   R                   S
U  S3USS  5      $ T" U  SUS   R                   S3USS  5      $ [        US   [        5      (       a  T" U  SUS   R                   S3USS  5      $ [        SU 35      e)Nrc   r   r   r1   r   r  r   r	  z	std::get<z>(r0  r1  z.__floordiv__(r%  )r   r[   r   pytreeSequenceKeyr}   r  r0   rC   rC  r   r'  r  )r  r&  gos     rN   rt  IPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go  s   b=K LA%"71:}=="71:v/A/ABB&'!*//!2!GAJNN3C1Ewqr{   
M::a
'8;WQR[II
F,>,>?? 77.. Ywqz~~&6ba@'!"+N  4&'!*..)9 ;WQR[I
  
K88 nWQZ5G5G4HJGTUTVKXX(+@	)JKKrP   c                   > [         R                  R                  (       a  [        T5      S:X  a`  TS   n T" TS   R	                  5       [        U [        R                  5      (       a"  [        U R                  5      S:w  a	  TSS  5      $ T5      $ [        TS   [        R                  5      (       d   eT" TTS   R                     R	                  5       TSS  5      $ T" TT5      $ )Nr1   r   )r0   rC   rC  r   rE   r[   r   rT  r-  rr  rs  r  )r@  rt  r&  r6  r  s    rN   go_outerOPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go_outer  s    77&&
 7|q(%aj  "#AJ//1)#r~~>>3s{{CSWXCX $ABK   ")	    *'!*f6H6HIIII!''!*.."9"B"B"DgabkRRk733rP   r   )r  r   r&  zpytree.KeyPath)	r   r0   rC   rI   	shape_envr   ry   rn  r<  )r   r6  r  unbacked_bindingsr  rw  rt  r&  s    ``   @@rN   (codegen_unbacked_symbol_defs_for_outputs=PythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputso  s     6GG&&(9
 ! ,113JAw
L<4 4. NN44Q78HJ<}Uu 4rP   c                  ^ ^^^ UU U4S jnUU U4S jn T R                  TR                  5        T R                  T R                   STR                   35        U" 5         [
        R                  n[
        R                  " TR                  5         TR                  R                  US9  S S S 5        U" 5         T R                  5         g ! , (       d  f       N&= f! T R                  5         f = f)Nc                   > [        TR                  R                  5      [        T5      :X  d   e[        TR                  R                  T5       H3  u  pTR	                  TR
                   U  SU TR                   35        M5     g rj  )r   rC   r  r   ry   r;  r<  )inner_inputouter_inputouter_inputsr   subgraphs     rN   _codegen_subgraph_prefixSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_prefix  sr    x~~223s<7HHHH,/++\-( ||n[M[M$++O-rP   c                   > [        TR                  R                  5      [        T5      :X  d   e[        TR                  R                  T5       H5  u  pTR	                  U SU R                  5        TR                   35        M7     g rj  )r   rC   r  r   ry   rq  r<  )inner_outputouter_outputouter_outputsr   r  s     rN   _codegen_subgraph_suffixSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_suffix  st    x~~334M8JJJJ.1,,m/* #nC(F(F(H'I$++W/rP    subgraph: )parent_graph)	r  rC   ry   rd  r}   r0   set_graph_handlercodegen_subgraphr5  )r   r  r  r  r  r  r  s   ````   rN   codegen_subgraph_by_inlining1PythonWrapperCodegen.codegen_subgraph_by_inlining  s    			'%%hnn5NNdll^;x}}oFG$&77L$$X^^4//!- 0  5 %&$$& 54 $$&s$   A<C C,C 
CC C/c           	        UR                   nUR                  n[        UR                  5       5      UR                   Vs/ s H  oUR
                  PM     sn-   nSR                  U5      [        U5      S:X  a  SOS-   nU Vs/ s H  oR                  5       PM     n	nSR                  U	5      [        U5      S:X  a  SOS-   n
U R                  SU SU S35        UR                  5        VVs/ s H  u  pU(       d  M  UPM     nnnU(       a#  U R                  SSR                  U5       35        U R                  S	U
 S
U SU S35        U R                  SU S35        gs  snf s  snf s  snnf )z'Generate code to call a graph partitionr  r1   r  rM  	partition	_args = [r1  r  r  z) = self.partitions[z](partition_args)zdel partition_argsN)input_deallocationoutput_nodesr   rB  symbol_inputsr}   r   r   rE   ry   r   )r   partition_idr  r  r  symbol_inputr  ro  rK   output_namesr  r}   
deallocaterA  s                 rN   codegen_partition_call+PythonWrapperCodegen.codegen_partition_call  sn    2DD+88-22452F2T2T9
2T,2T9
 
 ;'#k2Ba2G3RP4@ALDLA))L)C4E4JSPRS 	<.	&CD *<)A)A)C
)C%TzD)C 	 
 NNT$))L"9!:;< 	y,\N+l^SYZ	
 	|nE:;-9
 B
s   E"E'-E,>E,c                V    [        U5       Vs/ s H  nSU 3PM
     snU l        g s  snf )N
partition_)r  r  )r   num_partitionsr  s      rN   set_all_partition_names,PythonWrapperCodegen.set_all_partition_names  s*    BGBW#XBW3j$6BW#X #Xs   &c           	     t   SR                  U5      [        U5      S:X  a  SOS-   nSR                  U5      [        U5      S:X  a  SOS-   nU R                  UR                  R                   SU S35        U R                  SU SUR                  R                   SUR                  R                   S	35        g )
Nr  r1   r  rM  r  r1  r  z) = r  )r   r   ry   rC   r}   )r   r  r  outer_flattened_outputsouter_output_namesouter_input_namess         rN   ,codegen_subgraph_call_with_flattened_outputsAPythonWrapperCodegen.codegen_subgraph_call_with_flattened_outputs  s     "YY'>?./14C"
 !IIl3|$)Cr
 	(..--.i8I7J!LM 	"#4(;(;'<Ahnn>Q>Q=RRXY	
rP   c                x   SR                  U5      [        U5      S:X  a  SOS-   nU R                  UR                  R                   SU S35        [
        R                  R                  R                  5         U R                  U SUR                  R                   SUR                  R                   S	35        g )
Nr  r1   r  rM  r  r1  r   r  r  )r   r   ry   rC   r}   r0   	schedulerfree_buffers)r   r  r  outer_buffer_namer  s        rN   codegen_subgraph_call*PythonWrapperCodegen.codegen_subgraph_call"  s     IIl3|$)Cr
 	(..--.i8I7J!LM 	
&&( 	 !X^^%8%8$98>>;N;N:OvV	
rP   c                   U R                  UR                  5        U R                  S5        U R                  U R                   SUR                   35        [
        R                  nUR                  UR                  l        UR                  R                  U R                  ;  a  [
        R                  " UR                  5         [        R                  " SS5         UR                  R                  5       u  p4S S S 5        S S S 5        U R                  R                  UR                  R                  5        U R                  WR                  5        g g ! , (       d  f       Nb= f! , (       d  f       Nk= f)NrM  r  r  F)r  rC   ry   rd  r}   r0   rC  r~  r  r   patchr  r   rW  r   )r   r  r  subgraph_coder  s        rN   codegen_subgraph_common,PythonWrapperCodegen.codegen_subgraph_common3  s    !!(..1r$,,{8==/BCww%1%=%=">>d&F&FF $$X^^4\\"3U;'/~~'='='?$M < 5
 ,,001D1DE,,]-@-@A G
 <; 54s$   E(E;E(
E%	!E((
E6c                J    U R                  U5        U R                  XU5        g rB   )r  r  )r   r  r  r  s       rN   'codegen_subgraph_with_flattened_outputs<PythonWrapperCodegen.codegen_subgraph_with_flattened_outputsF  s&     	$$X.99$;	
rP   c                J    U R                  U5        U R                  XU5        g rB   )r  r  )r   r  r  r  s       rN   r  %PythonWrapperCodegen.codegen_subgraphN  s#     	$$X.""8;LMrP   c                   UR                  5       nU R                  U S[        UR                  5       35        UR                   Vs/ s H  o3R                  5       PM     nn[        R                  R                  (       aP  [        [        UR                  5      5       Vs/ s H
  oR SU S3PM     nnU R                  UR                  XF5        g U R                  UR                  XB5        g s  snf s  snf )N = [None] * r0  r1  )rE   ry   r   r  ro  rq  r0   rC   rD  r  r  r  r  )r   invoke_subgraphr}   r  r  r7  r  s          rN   codegen_invoke_subgraph,PythonWrapperCodegen.codegen_invoke_subgraphT  s    '')$|C0G0G,H+IJK;J;Q;QR;QC--/;QR77(-c/2I2I.J(K(K1&!A(K   --((, !!/":":LO Ss   C/"C4c                   UR                  5       nUR                   Vs/ s H  o3R                  5       PM     nnUR                  R                  5       n[	        UR                  [
        R                  5      (       d  U S3nU R                  U S[        UR                  5       35        U R                  SU S35        U R                  [        XR                  R                  5      5        [        R                  R                  (       aP  [        [        UR                  5      5       Vs/ s H
  ob SU S3PM     nnU R!                  UR                  XG5        OU R#                  UR                  XB5        U R                  [%        U 5      5        U R                  S5        U R                  [        XR&                  R                  5      5        [        R                  R                  (       aP  [        [        UR                  5      5       Vs/ s H
  ob SU S3PM     nnU R!                  UR&                  XG5        OU R#                  UR&                  XB5        U R                  [%        U 5      5        g s  snf s  snf s  snf )Nr#  r  r   rB  r0  r1  zelse:)rE   operandsrq  	predicater[   r   ShapeAsConstantBufferry   r   r  r  true_subgraphrC   r0   rD  r  r  r  r0  false_subgraph)r   conditionalr}   r  r  r  r7  r  s           rN   codegen_conditional(PythonWrapperCodegen.codegen_conditionald  s   ##%;F;O;OP;OC--/;OP));;=	+//1I1IJJ$+W-I$|C0C0C,D+EFGYKq)*(/H/H/N/NOP775:3{?R?R;S5TU5TvQqc^5TMU--))< !!+";";\P'-.w(/I/I/O/OPQ775:3{?R?R;S5TU5TvQqc^5TMU--**L !!+"<"<lQ'-.9 Q V Vs   I8-I=Jc                   UR                  5       nUR                   Vs/ s H  o3R                  5       PM     nnUR                   Vs/ s H  o3R                  5       PM     nnU R	                  U S[        U5       35        [        U5       H  u  pgU R	                  U SU SU 35        M      / [        [        U5      5       Vs/ s H
  ob SU S3PM     snQUQnU S3/n	[        U5      n
U
S [        U5       nU R	                  S5        U R	                  [        XR                  R                  5      5        [        R                  R                  (       a  U R                  UR                  X5        OU R                  UR                  X5        U R	                  SU	S    S	35        U R	                  [!        U 5      5        U R	                  [        XR"                  R                  5      5        [        R                  R                  (       a  U R                  UR"                  X5        OU R                  UR"                  X5        U R	                  [!        U 5      5        g s  snf s  snf s  snf )
Nr  r0  z] = r1  _cond_resultzwhile True:zif not r   z: break)rE   carried_inputsrq  additional_inputsry   r   r  r  r   r  cond_subgraphrC   r0   rD  r  r  r0  body_subgraph)r   
while_loopr}   r  outer_carried_inputsouter_additional_inputsr7  inpcond_outer_inputscond_outer_outputsbody_outer_inputsbody_outer_outputss               rN   codegen_while_loop'PythonWrapperCodegen.codegen_while_loop  sQ   ""$/9/H/H 
/H!!#/H 	  
 0:/K/K#
/K!!#/K 	  #
 	$|C0D,E+FGH 45FANNdV1QCtC512 6
&+C0D,E&FG&Fas!n&FG
$
 "&l34 
 //J5I1JK}%(/G/G/M/MNO77--((*; 88((*; 	(+,G4	
 	'-.(/G/G/M/MNO77--((*; 88((*; 	'-.c 
#
 Hs   I&I+I0c                     [        U SS 5      (       a  g [        U [        5      (       a  U $ [        R                  R
                  R                  U 5      nUc  U$ [        U5      $ ! [         a     g f = f)Nr  )r  r[   r   r0   rC   
_shape_env_maybe_evaluate_staticr  )r   r   s     rN   statically_known_int_or_none1PythonWrapperCodegen.statically_known_int_or_none  sl    	q.$// !S!!''$$;;A>C{
s8O 		s!   A% A% -A% 
A% %
A21A2c                r    / nU  H.  n[         R                  U5      nUc    g UR                  U5        M0     U$ rB   )r  r  r   )lstr  r   nums       rN   %statically_known_list_of_ints_or_none:PythonWrapperCodegen.statically_known_list_of_ints_or_none  s<    A&CCAFC{MM#	 
 rP   c                0    [         R                  U 5      S L$ rB   )r  r  )r  s    rN    is_statically_known_list_of_ints5PythonWrapperCodegen.is_statically_known_list_of_ints  s     !FFsKSWW	
rP   c                H    [         R                  U R                  5       5      $ rB   )r  r  r  rs  s    rN   r  4PythonWrapperCodegen.static_shape_for_buffer_or_none  s    #IIOO
 	
rP   c                0    [         R                  U 5      S L$ rB   )r  r  r  s    rN   !can_prove_buffer_has_static_shape6PythonWrapperCodegen.can_prove_buffer_has_static_shape  s    #CCFKSWWWrP   ).r|  r{  rU  rQ  r  r  ru  r  r~  rV  rm  rd  r  rn  r  r;  rc  r<  rv  rW  rP  r{   r\  r^  rz   r_  rZ  ra  r>  rk  rb  rf  rg  r}  re  rX  rw  r`  r]  rY  rh  rj  ri  r[  ry  ry   rB   )r  r   r  r   r  Optional[PythonWrapperCodegen]r  $Optional[ir.GraphPartitionSignature]r$  )r}   r   r  r   r   r  )rW  r   )r  TritonMetaParamsr   r   r   r  r   z>dict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr]]r   zlist[IRNode])r  r  r  )r=  r   r  r   r   r   r  )r=  r   r   r  )r  r  r   r  r  r(   r   r  )rK   zir.FallbackKernelr   r  )rK   rY  )rK   re  r   r  )r   r   r@  r   rA  r   r^  r  ru  r   r   r  )F)r|  r   rp  r   r}  zCallable[[], Sequence[str]]r~  z<Union[torch._ops.OpOverload, torch._ops.HigherOrderOperator]r  r,  r  zSequence[ir.Buffer]r   r  )r  Callable[..., None]r   zIterator[Callable[..., None]])r  r   )r}   r   r   r  r  r  )r  zsympy.Symbol)r   r   rJ   r   r   r   )r   r   r   r   )r5  r   r}   r   r  r   r   r   )r  zSequence[Expr]r   r   )ry   r  r   r   )r  r   )rK   zir.MultiOutput)NTN)
rs  r   r  r   r  r   r  r   r  r   )rs  r   r  r   r  r   )rV  r   )r   z"list[list[Union[int, sympy.Expr]]])rs  r   rY  r   )rD  r   rC   r>   r   r  )r  r6   )rs  r   )rs  r  )rM  )rs  r}  )rA  r  )r9  r   r8  r   rF  r   )r  r  r  r  r  r   )r}   r   r  zir.ReinterpretViewr   r  rs  r  )rZ  r  rd  r  )r6  r   r  r   rz  z,Optional[dict[sympy.Symbol, pytree.KeyPath]]r   r  )r  r   r  zir.GraphPartitionSignature)r  r   )r   r   r   r   rA  supports_cachingr   r.  r  rl  rs  ro  r  rq  r%   r  r  r  r  r  r  r  r  r  r  r  r  r  r  rp  r  ry  r  r  r5  r  r2  r  r  r  r  r  r  r   r$  r*  r]  r:  rp  r\  rb  rg  rk  rr  ry  r  r  r  r   contextmanagerr  r  r  r  r  r  r  r  r  r  r  r  r  r  r3  rp   r  r  r  r  r  r(  rE  r  rL  rQ  r  rW  r  r  rF  r  r  r  ru  r  r  r  r  r  r  rj   r  r  r  ry   r<  r  rN  r  r  r3  r:  r  rB  rG  r  r  rX  r_  r^  rh  rk  rn  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  s   @rN   r  r  F  s    ]#~ 
 FJ	&&$& 7& C	& &'<;Az
   " + +	! 
 

$	G$
%S$	(
.8)6(-.10J
/+7,5
5$:,:8 8 
8;; ;  	;
 ; ; 
;,U
&<
	V	V  	V .		V
 R	V  	V %	V 
	V0 ! !,O
b&SP:

B%(K(K (K -	(KT'?R5 @D W CG +.&'6
(  ' 
:>S
4*[Yz, #'(,

 
  	

 
 &
& FJ'*6C  #'(,!! !  	!
 ! &!.2I; 2I;V"A"A+8A	A
%N:.>85W

4
4SGj' !-
-
h !f(f(P !, F
 DH@*_*Au` 
(3T:(
 
E'NN N H	N
 
N`+'Z<< 9<BY
$
"B&
NP /B3/j     
 

 
 

 X XrP   r  c                     ^  \ rS rSrSr S     SU 4S jjjrSS jrSS jrS rS r	S r
SS	 jrSS
 jrSS jrSS jr  SS jrSS jrSS jrSU 4S jjr\SS j5       r\SS j5       rSrU =r$ )r  i  z
A wrapper codegen that generates code for a subgraph. For most of the
methods, we rely on the implementation in the PythonWrapperCodegen. But we
override a few functions to produce cleaner code (like avoiding writing
imports twice in the output code)
c                F   > Xl         X l        X0l        [        TU ]  5         g rB   )r  r  r  r   r   )r   r  r  r  r   s       rN   r   %SubgraphPythonWrapperCodegen.__init__  s"     +,$8!rP   c                &    U R                   U l        g rB   )r  rk  r   s    rN   rl  1SubgraphPythonWrapperCodegen.set_launcher_fn_name  s     !% 2 2rP   c                    g rB   rc   r   s    rN   ro  )SubgraphPythonWrapperCodegen.write_header   r  rP   c                    g rB   rc   rI  s     rN   r  2SubgraphPythonWrapperCodegen.add_benchmark_harness  r  rP   c                    g rB   rc   rI  s     rN   rE  6SubgraphPythonWrapperCodegen.benchmark_compiled_module  r  rP   c                    g rB   rc   r   s    rN   r  5SubgraphPythonWrapperCodegen.write_async_compile_wait	  r  rP   c                6    U R                   R                  5       $ rB   )r  r  r   s    rN   r  /SubgraphPythonWrapperCodegen.next_kernel_suffix  s    ""5577rP   c                    g rB   rc   r  s     rN   r  2SubgraphPythonWrapperCodegen.generate_after_suffix  r  rP   c                \    U R                   R                  SU R                   S35        SnU$ )Nz
            def z(args):
            r1   )rX  r   rk  r  s     rN   r  >SubgraphPythonWrapperCodegen.write_launcher_fn_call_get_indent  s<    &&' (	

 rP   c                    gr   rc   r   s    rN   r  4SubgraphPythonWrapperCodegen.get_wrapper_call_indent  s    rP   c                    U R                   =n(       a6  UR                  UR                   Vs0 s H  n[        U5      U_M     sn-  nU$ [        R
                  R                  nU$ s  snf rB   )r  input_nodesr  r   r0   rC   r  )r   rf  r  ro  s       rN   r  -SubgraphPythonWrapperCodegen.get_graph_inputs  sm     11191**#,#:#:.#:aA	#:. F
  WW))F.s   A&c                   U R                   =n(       aL  [        UR                  R                  5       5      UR                   Vs/ s H  o"R
                  PM     sn-   nU$ [        R                  R                  nU$ s  snf rB   )	r  r   r  rB  r  r}   r0   rC   r  )r   rf  r  namess       rN   r  2SubgraphPythonWrapperCodegen.get_graph_input_names*  su    11191..33566?6M6M:6Ml!!6M: E
  GG--E:s   A<c                |    U R                   =n(       a  UR                  nU$ [        R                  R                  nU$ rB   )r  r  r0   rC   r  )r   rf  r  s      rN   r  .SubgraphPythonWrapperCodegen.get_graph_outputs3  s;    11191,,G  gg++GrP   c                   > UR                  5       nU R                  =n(       a  X#R                  ;   a  g [        TU ]  U5        g rB   )rE   r  r  r   rX  )r   rs  r}   rf  r   s       rN   rX  /SubgraphPythonWrapperCodegen.codegen_allocation:  s=     222I2@U@U8U "6*rP   c                8    U R                   R                  5         g rB   )r  r  r   s    rN   r  5SubgraphPythonWrapperCodegen.write_triton_header_onceD  s     	446rP   c                8    U R                   R                  5         g rB   )r  r  r   s    rN   r  =SubgraphPythonWrapperCodegen.write_get_raw_stream_header_onceM  s     	<<>rP   )rk  r  r  r  rB   )r  r   r  r  r  r  r$  r  r  r  r  r  r  r  )r   r   r   r   rA  r   rl  ro  r  rE  r  r  r  r  r  r  r  r  rX  r%   r  r  r   r  r  s   @rN   r  r    s     FJ	 - C	 3
8		G	+ 7 7 ? ?rP   r  )rK   r  r   r  )rT   r  rU   r  )NN)r}   r   r   zlist[triton.Config]r   zlist[TritonGrid]rl   r  r   r   r   ztuple[str, str]r  )
__future__r   r   r   r  r   r  r  r  r  r  r  r  	itertoolsr   r   typingr   r   r   r	   r
   r\   r   r  
torch._opstorch.utils._pytreeutils_pytreerr  r   r  torch._dynamo.utilsr   r   #torch._inductor.codegen.debug_utilsr   $torch._inductor.codegen.multi_kernelr   %torch._inductor.runtime.runtime_utilsr   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   torch.fx.noder   torch.utils._ordered_setr    torch.utils._sympy.singleton_intr   torch.utils._sympy.symbolr   r   rM  r   r   r   	codecacher    r!   r"   runtimer#   runtime.hintsr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   virtualizedr0   r  r2   r3   r4   r5   r6   r7   	cpp_utilsr8   triton_utilsr9   r:   r;   collections.abcr<   r=   rq   rC   r>   wrapper_fxirr?   	getLoggerr   logdoprintr  ro   ru  r   r   r  r  r  r  rO   rX   r4  r   r  r   r   r   r"  r   r   r@   r  r(  r0  r<  rP  rX  rd  r|  r  r  r  r  r  r  r  r  r  r  r"  r)  rC  rt  Liner  r  rc   rP   rN   <module>r5     s   "    
      	  " @ @     $ $ & 6 C A ;  . / 9 : ( ( ' ( ' ,       P P 2%) ! u{{C56299l*+
]OT12 @ S> 	%UZZ
 #
%&2B1CU3PS8_1T(UU
 /3*.d&
d& d& d& ,	d&
 (d& d&NJ&Z   * **X X
 2 2 2 ++ + + 1{ 1 1 "@K "@ "@J?; ? 
7K 
7 
7 5+ 5 5@ 	({ 	( 	( /[ / /> 5; 5 5* ; ; ;2 ,% , ,@ 6, 6 62 /( / /& )" ) ).(! (
 ![ ! !: )8^ )8 )8X 4 4 4 #0k #0 #0L 	5+ 	5 	5 
,-\&X7 \&X~Lp?#7 p?rP   