
    7h                   &   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKJrJr  S SKJrJrJrJrJrJrJr  \(       a  S SKJr  S SKJr  S SKrS SKrS SKrS SKJ r J!r!  S SK"J#r#J$r$  S S	K%J&r&J'r'  S S
K(J)r)  S SK*J+r+  S SK,J-r-J.r.J/r/  S SK0J1r1  SSK2J3r3J4r4J5r5J6r6J7r7  SSK8J9r9  SSK:J;r;J<r<J=r=  SSK>J?r?  SSK5J@r@JArAJBrBJCrC  SSKDJErEJFrF  SSKGJHrHJIrI  SSK6JJrJJKrKJLrLJMrMJNrN  SSKOJPrP  SSKQJRrRJSrS  SSKTJUrUJVrV  SSKWJXrX  SSKYJZrZJ[r[J\r\J]r]J^r^J_r_J`r`JaraJbrbJcrcJdrdJereJfrfJgrgJhrh  SSKiJjrj  \R                  " \l5      rm\R                  R                  \lS5      rp\R                  R                  \lS5      rq\rS   rs\R                   " S  S!5      5       ru\R                   " S" S#\u5      5       rv " S$ S5      rw " S% S&5      rxS@S' jry " S( S)5      rz        SAS* jr{ " S+ S,\w5      r| " S- S.\w5      r} " S/ S0\w5      r~    SBS1 jr        SCS3 jr " S4 S5\w5      r " S6 S7\5      r " S8 S9\w5      r SD       SES: jjr\R                   " S; S<5      5       r\GR                  " 5       r " S= S25      r " S> S?5      rg)F    )annotationsN)Counterdefaultdict)AnyCallableGenericOptionalTYPE_CHECKINGTypeVarUnion)Sequence)
ModuleType)countersdynamo_timed)LambdaFuturePyCodeCache)get_metric_tableis_metric_table_enabled)free_symbols
OrderedSet)free_symbol_is_typesymbol_is_typeSymT)
has_triton   )commsconfigdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)count_flops_fxcountable_fx)get_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout
NoneLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)
green_textred_text)SimplifyIndexing)cache_on_selfcmpdevice_need_guardget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsGraphPartitionMapIndentedBufferis_collectiveis_cudagraph_unsafe_opis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitsympy_product)Vfusionloop_orderingBaseSchedulerNodec                      \ rS rSr% S\S'   S\S'   S\S'   \R                  " \S9rS	\S
'   \R                  " \	S9r
S\S'   SS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSrg) SchedulerBufferQ   	Scheduler	schedulerz	ir.BuffernodeOptional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr5   
mpi_bufferc                D    U R                   nUc   eUR                  5       $ N)rT   get_name)selfops     S/var/www/fran/franai/venv/lib/python3.13/site-packages/torch/_inductor/scheduler.pydefining_op_name SchedulerBuffer.defining_op_name[   s#    ~~{{}    c                @    [        U R                  R                  5      $ rZ   )hashrR   namer\   s    r^   __hash__SchedulerBuffer.__hash__`   s    DIINN##ra   c                   [        5       nU R                  5       nUR                  U S[        U R                  5      R
                   35        UR                  U SU R                  R                   35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        [        U R                  5      S::  a0  UR                  U SU R                   35        UR                  5       $ UR                  U S35        UR                  S5         U R                   H  nUR                  U S35        M     S S S 5        UR                  S	5        UR                  5       $ ! , (       d  f       N/= f)
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])rA   r[   	writelinetyperR   __name__layoutget_aliasespformatget_mutationslenrW   indentgetrawvalue)r\   resultrd   users       r^   	debug_strSchedulerBuffer.debug_strc   s   !}}D6DO$<$<#=>?D6DII,<,<+=>?v[9I9I9K1L0MNOv]74;M;M;O3P2QRStzz?avYtzzl;< !!## vZ01q! JJD$$vQZ0 ' " S!!!##	 "!s   *(F;;
G	c                6    U R                   R                  5       $ rZ   rR   r[   re   s    r^   r[   SchedulerBuffer.get_namew       yy!!##ra   c                0   U R                   c   eU R                   R                  5       (       d  g U R                   R                  5       (       dV  U R                   R                  5       (       d7  [	        U R                   R                  5       [        R                  5      (       a4  [        R                  R                  R                  U R                   5        g [        [        R                  S5      (       a  U R                  5       [        R                  R                  ;   a  [        R                  R                  U R                  5          nXR                   R"                  ;   a$  U R                   R"                  U   R                   nO#U R                   R$                  U   R                   n[        R                  R                  R'                  UU R                   5        g [        R                  R                  R                  U R                   5        g )Nargs)rR   should_allocateget_inputs_that_alias_outputget_mutation_names
isinstanceget_output_specr    CommBufferLayoutrI   graphwrapper_codecodegen_allocationhasattrkernelr[   inplace_update_buffersrQ   name_to_donated_buffername_to_bufcodegen_inplace_reuse)r\   input_buffer_nameinput_buffers      r^   allocateSchedulerBuffer.allocatez   sc   yy$$$yy((** II2244yy++--$))335r7J7JKKGG  33DII> AHHf%%188#B#BB !" ? ? P NN$I$II#~~DD% $   $~~99:KLQQGG  66		
 GG  33DII>ra   c                &   U R                   c   e[        U R                   R                  [        R                  5      (       d  [        U R                   5      (       a  gU R                   H$  n[        UR                   [        5      (       d  M$    g   gNFT)rR   r   ro   r    r3   rE   rW   
OutputNode)r\   uses     r^   can_freeSchedulerBuffer.can_free   sm    yy$$$dii&&66:SII;
 ;
 ::C#((J//  ra   c                4   0 nU Hr  n[        UR                  5      U;   a?  UR                  U[        UR                  5         5      U[        UR                  5      '   M[  X2[        UR                  5      '   Mt     [        UR	                  5       5      U l        g rZ   )idrR   mergelistvaluesrW   )r\   rW   rv   r   s       r^   	set_usersSchedulerBuffer.set_users   sm    &(C#((|v%'*yy3881E'Fr#((|$'*r#((|$	 
 &--/*
ra   c                T    U R                   c   eU R                   R                  5       $ rZ   )rR   r   re   s    r^   rp   SchedulerBuffer.get_aliases   s%    yy$$$yy5577ra   c                T    U R                   c   eU R                   R                  5       $ rZ   )rR   r   re   s    r^   rr   SchedulerBuffer.get_mutations   %    yy$$$yy++--ra   c                R    U R                   R                  5       R                  5       $ rZ   )rR   r   
get_devicere   s    r^   r   SchedulerBuffer.get_device   s    yy((*5577ra   )rW   Nreturnstrr   intr   Noner   bool)rW   rV   r   r   r   zSequence[str]r   Optional[torch.device])rn   
__module____qualname____firstlineno____annotations__dataclassesfieldr   rW   r5   rX   r_   rf   rx   r[   r   r   r   rp   rr   r   __static_attributes__ ra   r^   rN   rN   Q   sv    
O,,'--dCE>C.9.?.?3/J+ 
$$($?B
+8.8ra   rN   c                  $    \ rS rSr% SrS\S'   Srg)SchedulerDonatedBuffer   NrS   rT   r   )rn   r   r   r   rT   r   r   r   ra   r^   r   r      s    /3K,3ra   r   c                     \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S\S
'   S\S'   SBS jrSCS jrSDS jrSDS jrSDS jr	SES jr
SDS jrSFS jr      SGS jrSHS jrSIS jrSJS jrSKS jr      SLS jrSFS jrSMS jrSMS jrSFS jrSFS jr    SNS  jrSDS! jrSDS" jr\SMS# j5       r\SMS$ j5       r\SJS% j5       r\SJS& j5       rSOS' jr SPS( jr!SQS) jr"SRS* jr#SJS+ jr$SJS, jr%SJS- jr&SJS. jr'SJS/ jr(SJS0 jr)SJS1 jr*SSS2 jr+SJS3 jr,SFS4 jr- ST     SUS5 jjr.\SVS6 j5       r/\SVS7 j5       r0\SVS8 j5       r1      SWS9 jr2      SXS: jr3\SYS; j5       r4\SZS< j5       r5S[S= jr6S\S> jr7\8    S]S? j5       r9S@r:gA)^rL      z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]groupdependencies.ReadWritesread_writesOrderedSet[Dep]unmet_dependenciesr   	min_order	max_orderr6   mpi_nodec                     Xl         S U l        g )Nc                     / $ rZ   r   )r   kwargss     r^   <lambda>,BaseSchedulerNode.__init__.<locals>.<lambda>   s    Bra   )rQ   debug_device_str)r\   rQ   s     r^   __init__BaseSchedulerNode.__init__   s    $-& 	ra   c           	     P   Xl         [        5       U l        [        [           " 5       U l        SU l        UR                  5        Vs/ s H  n[        U R                  UU S9PM     snU l	        U R                   Vs0 s H  o3R                  5       U_M     snU l        g s  snf s  snf )NF)rQ   rR   rT   )rR   r   	ancestorsr   
last_usagewrittenget_outputsrN   rQ   outputsr[   outputs_by_name)r\   rR   outputbufs       r^   _init_from_node!BaseSchedulerNode._init_from_node   s    ,0	*4,$
   **,/
 - .. 
 -/
 ,0<<<
+7CLLNC<<
/
<
s   B;B#c                V    [        U 5      R                   SU R                  5       < S3$ )Nz(name=)rm   rn   r[   re   s    r^   __repr__BaseSchedulerNode.__repr__   s'    t*%%&fT]]_,?qAAra   c                P   U R                  5       n[        5       nUR                  U S[        U 5      R                   S[        [        U SS5      5      R                   SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
35        UR                  5          U R                  5        H"  nUR                  UR                  5       5        M$     SSS5        UR                  S5         UR                  U R                  5       5        UR'                  5       R)                  5       $ ! , (       d  f       N]= f! [          a    ["        R%                  SSS9   NOf = f)#Longer form printout for trace logsri   (rR   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        rk   Ignoring error in debug_str()Texc_info)r[   rA   splicerm   rn   getattrrq   r   writesr   readsrt   r   rx   rl   debug_str_extra	Exceptionlogwarningru   rstrip)r\   rd   r   outs       r^   rx   BaseSchedulerNode.debug_str   sv   }}

bd		QtGD&$$?@IIJ Kj))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 		
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   %7E36F 3
FF%$F%c                    g)N r   re   s    r^   r   !BaseSchedulerNode.debug_str_extra      ra   c                $    U R                  U 5      $ rZ   )r   re   s    r^   _debug_str_for_device'BaseSchedulerNode._debug_str_for_device  s    $$T**ra   c                   [        U R                  SS 5      nSn[        U[        R                  R
                  R                  5      (       a$  SUR                  UR                  5       /SSS9-   nOe[        U[        R                  R
                  R                  5      (       a2  SUR                  UR                  5       UR                  5       /SSS9-   nU  U 3$ )Ndatar   z, F)shorten	multiline)r   rR   r   torch	_inductorr    	Pointwise
str_helperget_size	Reductionget_reduction_sizeget_reduction_type)r\   
maybe_datadata_strs      r^   debug_str_short!BaseSchedulerNode.debug_str_short
  s    TYY5
j%//"4"4">">??j33$$&'% 4  H 
EOO$6$6$@$@AAj33..0*2O2O2QR 4  H
 z""ra   c                p    [         R                  SU U R                  U R                  R                  5        g )Nz(%s: unmet_dependencies = %s, writes = %s)r   infor   r   r   re   s    r^   log_detailsBaseSchedulerNode.log_details  s,    6####		
ra   c                    g rZ   r   )r\   self_dep	other_deps      r^   reorder_loops_by_dep_pair+BaseSchedulerNode.reorder_loops_by_dep_pair!  s     	ra   c                X    U R                  U R                  R                  U5      5        g rZ   )set_read_writesr   renamer\   renamess     r^   update_mutated_names&BaseSchedulerNode.update_mutated_names&  s!    T--44W=>ra   c                X    U R                  U R                  R                  U5      5        g rZ   )r  r   	with_readr\   deps     r^   add_fake_depBaseSchedulerNode.add_fake_dep)  s!    T--77<=ra   c                B    [        S U R                  5        5       5      $ )Nc              3  n   #    U  H+  oR                  5       =(       d    UR                  5       v   M-     g 7frZ   )rp   rr   ).0r   s     r^   	<genexpr>=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>-  s*      
@ROO4!2!2!44@Rs   35)anyr   re   s    r^   has_aliasing_or_mutation*BaseSchedulerNode.has_aliasing_or_mutation,  s%     
@D@P@P@R
 
 	
ra   c                f    Xl         U R                   R                  U l        U R                  5         g rZ   )r   r   r   
prune_deps)r\   rws     r^   r  !BaseSchedulerNode.set_read_writes1  s&    "&"2"2"8"8ra   c                b   ^ U R                  5       n[        U4S jU 5       5      nX1-
  U l        g )Nc              3  F   >#    U  H  nTR                  X5      v   M     g 7frZ   )get)r&  kmutation_real_names     r^   r'  3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>:  s      !U1"4"8"8">">   !)used_or_aliased_buffer_namesr   r   )r\   future_used_buffersr4  used_bufferss     ` r^   set_last_usage BaseSchedulerNode.set_last_usage6  s-     88:!!U!UU&<ra   c                J    U R                    H  nUR                  5         M     g rZ   )r   r   )r\   r   s     r^   mark_runBaseSchedulerNode.mark_run=  s    <<CLLN  ra   c                    [        S [        R                  " U R                  R                  U R                  R
                  5       5       5      $ )Nc              3  :   #    U  H  nUR                   v   M     g 7frZ   rd   r&  r!  s     r^   r'  6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>B  s      
W HHW   )r   	itertoolschainr   r   r   re   s    r^   used_buffer_names#BaseSchedulerNode.used_buffer_namesA  s?     
 t'7'7'='=t?O?O?V?VW
 
 	
ra   c                >  ^ [        5       m[        R                  " U R                  R                  U R                  R
                  5       Vs/ s H  nUR                  PM     nn[        U5      S:  a  UR                  5       nTR                  U5        [        R                  R                  R                  U5      (       aD  UR                  U4S j[        R                  R                  U   R                  5        5       5        [        U5      S:  a  M  T$ s  snf )Nr   c              3  8   >#    U  H  nUT;  d  M  Uv   M     g 7frZ   r   )r&  alias
used_namess     r^   r'  ABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>R  s(      "5 J.	 E"5s   
	)r   rE  rF  r   r   r   rd   rs   popaddrI   r   name_to_bufferr2  extendr   )r\   r!  depsrL  s      @r^   r7  .BaseSchedulerNode.used_or_aliased_buffer_namesG  s    &0l
 !t'7'7'='=t?O?O?V?VW
W HHW 	 
 $i!m((*CNN3ww%%))#.. !"!7!7"224"5 	 $i!m 
s   Dc                N   ^  [        U 4S jT R                   5       5      T l        g )Nc              3  t   >#    U  H-  nUR                   TR                  R                  ;  d  M)  Uv   M/     g 7frZ   )rd   rQ   available_buffer_namesr&  r!  r\   s     r^   r'  /BaseSchedulerNode.prune_deps.<locals>.<genexpr>\  s0      -
.xxt~~DDD C.s   (8	8r   r   re   s   `r^   r-  BaseSchedulerNode.prune_deps[  s#    ", -
..-
 #
ra   c                   ^ ^ SU 4S jjm[        U4S jT R                  R                   5       5      nT R                  T R                  R	                  U5      5        g )Nc                   > [        U [        5      (       d  gTR                  R                  U R                     R                  5       nU[        R                  R                  ;   $ NF)	r   r*   rQ   r   rd   r_   rI   r   removed_operations)r!  op_namer\   s     r^   should_prune7BaseSchedulerNode.prune_weak_deps.<locals>.should_pruned  sI    c7++nn00:KKMGagg8888ra   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7frZ   r   r&  r!  r`  s     r^   r'  4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>j  s      
1C\#5FCC1   !	!r!  r'   r   r   )r   r   r   r  remove_reads)r\   	to_remover`  s   ` @r^   prune_weak_deps!BaseSchedulerNode.prune_weak_depsb  sN    	9  
++11
 
	 	T--::9EFra   c                D    [        XU R                  R                  5        g rZ   )_prune_redundant_depsrQ   r   )r\   name_to_fused_nodes     r^   prune_redundant_deps&BaseSchedulerNode.prune_redundant_depso  s     	d8R8RSra   c                T    U R                   c   eU R                   R                  5       $ rZ   )rR   get_operation_namere   s    r^   r[   BaseSchedulerNode.get_namet  r   ra   c                "    U R                  5       $ rZ   r[   re   s    r^   get_first_name BaseSchedulerNode.get_first_namex  s    }}ra   c                B    [        S U R                  5        5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frZ   rt  r&  rR   s     r^   r'  8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>}  s     G6Fd--//6F   )r   	get_nodesre   s    r^   get_operation_names%BaseSchedulerNode.get_operation_names{  s    Gdnn6FGGGra   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frZ   rt  r&  r   s     r^   r'  5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>  s     ALS,,..Lr{  )r   r   re   s    r^   get_buffer_names"BaseSchedulerNode.get_buffer_names  s    ADLLAAAra   c                B    [        S U R                  5        5       5      $ )Nc              3  d   #    U  H&  n[        U[        5      =(       a
    [        US S9v   M(     g7f)T)disallow_fp32_opsNr   SchedulerNoder"   r&  ns     r^   r'  ABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>  s6      
 & q-( G+AFG%s   .0allr|  re   s    r^   can_codegen_in_low_precision.BaseSchedulerNode.can_codegen_in_low_precision  s%     
 ^^%
 
 	
ra   c                B    [        S U R                  5        5       5      $ )Nc              3  f   #    U  H'  n[        U[        5      =(       a    [        U5      v   M)     g 7frZ   r  r  s     r^   r'  @BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  s,      
% q-(K-H-KK%s   /1r  re   s    r^   r"   -BaseSchedulerNode.can_codegen_without_upcasts  s#     
^^%
 
 	
ra   c                    U /$ rZ   r   re   s    r^   r|  BaseSchedulerNode.get_nodes  s	    vra   c                    U R                   $ rZ   )r   re   s    r^   r   BaseSchedulerNode.get_outputs  s    ||ra   c                     U R                   U   $ rZ   )r   )r\   buf_names     r^   
get_outputBaseSchedulerNode.get_output  s    ##H--ra   c                T    U R                   c   eU R                   R                  5       $ rZ   )rR   r   re   s    r^   r   BaseSchedulerNode.get_device  s%    yy$$$yy##%%ra   c                V    U R                  5       nUS L=(       a    UR                  S:H  $ Ncpu)r   rm   r\   devices     r^   is_cpuBaseSchedulerNode.is_cpu  s'    "T!:fkkU&::ra   c                b    U R                  5       nUS L=(       a    [        UR                  5      $ rZ   )r   rD   rm   r  s     r^   rD   BaseSchedulerNode.is_gpu  s'    "T!9fV[[&99ra   c                    gr]  r   re   s    r^   is_reductionBaseSchedulerNode.is_reduction      ra   c                    gr]  r   re   s    r^   is_split_scanBaseSchedulerNode.is_split_scan  r  ra   c                    gr]  r   re   s    r^   is_templateBaseSchedulerNode.is_template  r  ra   c                    gr]  r   re   s    r^   	is_externBaseSchedulerNode.is_extern  r  ra   c                    gr]  r   re   s    r^   
is_foreachBaseSchedulerNode.is_foreach  r  ra   c                    gr]  r   r\   read_deps     r^   can_inplaceBaseSchedulerNode.can_inplace  r  ra   c                    gr]  r   re   s    r^   has_side_effects"BaseSchedulerNode.has_side_effects  r  ra   c                X  ^  SSK Jn  [        T [        5      (       a  [        R
                  (       a  [        R                  R                  T R                  5       [        R                  5      (       a  [        [        R                  [        R                  R                  R                   R"                  5      (       a  [%        [        R                  SS5      b  ['        [        R                  S5      (       d  gT R(                  [        R                  R*                  -  T R,                  R.                  -  nSU 4S jjnT R1                  5        GHQ  nUR2                  nUc   eUR5                  5       (       aV  UR7                  5       (       dA  UR9                  5       (       d,  UR;                  5       [        R                  R<                  ;   a  M  T R>                  R@                   GH  nURB                  T R,                  RD                  ;   a$  T R,                  RD                  URB                     nO/T R,                  RF                  RI                  URB                  5      nU(       d  M  [        R                  RJ                  RM                  UT 5      (       d  M  [        URN                  [P        5      (       a  M  URR                  c   eURR                   Vs/ s H%  nUR2                  R;                  5       U;  d  M#  UPM'     n	n[U        U	5      S:X  d  GM3  U	S   RV                  (       d  GMJ  U	S   R2                  T L d  GM_  UR2                  c  GMo  [        UR2                  RY                  5       [Z        R\                  [Z        R^                  [Z        R`                  45      (       a  GM  URN                  (       am  [        URN                  R2                  [Z        Rb                  [Z        Rd                  45      (       a*  [U        UR2                  R7                  5       5      S:  a  GMF  U" UR2                  UR2                  5      (       d  GMk  U" U5      (       d  GM{  [        R                  Rf                  Ri                  UR;                  5       UR;                  5       5        [        [        R                  [        R                  R                  R                   R"                  5      (       an  [        R                  Rj                  Rm                  UR;                  5       5        [        R                  Rj                  Rm                  UR;                  5       5        UR;                  5       [        R                  Rn                  UR;                  5       '     GMO     GMT     gs  snf )	zf
Decide if there should be inplace updates for the node
and record the decision in the active kernel.
r   )can_match_buffer_size	mutationsNr   c                  >^ U R                   R                  T5      nU R                  5       m[        5       nU R                   H  nUR
                  n[        U[        5      (       d  M&  UR                  5       U R                   R                  ;  d  U R                   R                  U5      ULa  Mn  UU4S jUR                  R                  5        5       -  n[        U5      S:  d  M    g   g)Nc              3  L   >#    U  H  nUR                   T:X  d  M  Uv   M     g 7frZ   rA  )r&  or  s     r^   r'  ^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>  s&      Evv) AEs   $	$r   FT)rQ   get_fused_noder[   r   rW   rR   r   rL   ru  rm  r   reads_and_writesrs   )buf_to_be_inplaced
fused_noderR  rw   	user_noder  r\   s        @r^   single_index_in_fused_nodeKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node  s    
 ,55DDTJJ)224H %/LD*00 II	!)->?? ,,.-77JJK)33BB9M%&  &22CCE 
 t9q= ' 1* ra   r   )r  rN   r   r   )8codegen.wrapperr  r   r  r   inplace_buffersrI   r   has_featurer   r#   INPLACE_BUFFERSr   r  r  codegensimd
SIMDKernelr   r   r   r^  rQ   completed_operationsr   rR   r   r   r   r[   removed_buffersr   r   rd   r   r   r2  r   	can_reuserT   NopKernelSchedulerNoderW   rs   r  r   r    r3   r2   MutationLayoutSHOULDREMOVEFallbackKernelr1   r   make_inplacer  rO  r   )
r\   r  inconsequential_nodesr  r   buf_noderead	input_bufxremaining_usess
   `         r^   decide_inplace_update'BaseSchedulerNode.decide_inplace_update  s   
 	; t]++&&##DOO$5~7U7UVVqxx)@)@)E)E)P)PQQ188[$7C &)) NNgg(()nn112 	 	D ##%CxxH''',,..88::..00<<>QWW%<%<<((..99 E EE $ E Edii PI $ : : > >tyy II I,,66y$GG&y'<'<>TUU$??666 "+&!0A66??,4II !0 # & N+q0*1-999*1-22d:%NN6 *%NN::< " " 4 4 " = =! ! &11 * ) 5 5 : :!#!2!2BNN C! ! !$INN$O$O$Q RUV V1)..#((KK6yAA
 2293E3E3GX%HHeoo&=&=&B&B&M&M  HH..2293E3E3GHHH..223<<>B &..0 77G q / &0&s   "V'V'c                b   [         R                  (       d  g U(       a  U R                  (       a  g U R                  c   eU R                  R	                  5       n/ nU GH&  nUR
                  S:X  a  M  UR                  S5        UR                  S5        SUR
                   SUR                   3nSUR                  ;   a  USUR                  S    3-   nUR                  U5        SUR                  ;   d  M  UR                  S    nUR                  S	5      S
   nUR                  SUR                  SS5      R                  SS5      R                  SS5      -   5        UR                  S5        UR                  S5        GM)     [        U5      S:X  a  g UR                  U5        SU l        g )Nr   r   z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|{z{{}z}}r   \z#pragma CMT END ORIGINr   T)r   comment_originr   rR   get_originsr]   appendtargetmetasplitreplacers   
writelines)	r\   buffer	only_onceorigins	out_linesr  op_info_strr  stack_trace_last_lines	            r^   codegen_originating_info*BaseSchedulerNode.codegen_originating_info>  s    $$yy$$$))'')	AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(9(9#(>r(B%  "+33C>WS$'WT4()   !9:  $- 0 y>Q 	)$ra   c                "    U R                  SSS9$ )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implre   s    r^   get_read_write_buffers_sizes.BaseSchedulerNode.get_read_write_buffers_sizesj  s    55t 6 
 	
ra   c                "    U R                  SSS9$ )NTFr  r  re   s    r^   get_read_buffer_sizes'BaseSchedulerNode.get_read_buffer_sizesp  s    55u 6 
 	
ra   c                "    U R                  SSS9$ )NFTr  r  re   s    r^   get_write_buffer_sizes(BaseSchedulerNode.get_write_buffer_sizesv  s    55 6 
 	
ra   c                L    [        U R                  XS9R                  5       SS9$ )Nr  r   )start)sumget_read_write_buffer_accessesr   )r\   r  r   s      r^   r  3BaseSchedulerNode.get_read_write_buffers_sizes_impl|  s1     //+ 0 fh	
 	
ra   c                  ^ ^^^^^ [        T [        5      (       a  0 $ [        T [        5      (       a!  [        T R                  [        5      (       a  0 $ [        T [        5      (       af  [        T R                  [
        R                  5      (       a=  T R                  R                  [        R                  R                  R                  L a  0 $ SS jm[        T [        5      (       a@  T" [        T R                  5       S   5      [        T R                  5       S   5      -  5      mO[        S5      m[         R"                  " [$        5      nU(       a:  T R&                  R(                   H   nX4R*                     R-                  U5        M"     U(       a:  T R&                  R.                   H   nX4R*                     R-                  U5        M"     U(       a&  [1        S T R&                  R(                   5       5      O	[1        5       nU(       a&  [1        S T R&                  R.                   5       5      O	[1        5       nSU 4S jjm[        T [2        5      (       a  [1        UU 4S jU 5       5      nXg-
  nXW-
  n0 nXV-   H  n	[5        U4S	 jX9    5       5      mU	[6        R8                  R:                  ;   a  [6        R8                  R:                  U	   n
O>U	[6        R8                  R<                  ;   a  [6        R8                  R<                  U	   n
OM      SUUU U4S
 jjmT" U
5      nX;  a  XU	'   M  X==   U-  ss'   M     U$ )a  
Counting the number of bytes accessed for a kernel is
surprisingly tricky. In particular, there is a differentiation
between 'theoretical' memory accesses and practical memory
accesses. For example, a layernorm kernel may actually access an
input 3 times, but in theory, it only needs to access its input
once (and may be optimized to do so through say, persistent
reductions)

Another example is that even though a buffer is passed in, we may
not access the entire buffer. This may occur if we are accessing
a slice of the buffer. Another tricky case is for indirect
indexing, where the amount of bytes accessed depends on the
values of the input.

What this function aims to compute is the memory accesses for
worst-case inputs, best-case optimization. What this means is
that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

1. Numel in ranges multiplied by number of deps the buffer has
2. The buffer size

Returns memory accesses per buffer.
c                R    [         R                  R                  R                  U SS9$ )Nr   fallback)rI   r   sizevars	size_hint)ss    r^   try_size_hintGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint  s"    77##--a!-<<ra   r   r       eAc              3  8   #    U  H  oR                   v   M     g 7frZ   rA  rB  s     r^   r'  CBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s     B+ACxx+A   c              3  8   #    U  H  oR                   v   M     g 7frZ   rA  rB  s     r^   r'  r    s     C+BCxx+Br  c                   > TR                   R                  U    R                  n[        S U 5       5      n[	        U[        U5      -
  5      S:  $ )Nc              3  8   #    U  H  oR                   v   M     g 7frZ   rR   )r&  rw   s     r^   r'  \BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>  s     !>))r  r   )rQ   r   rW   r   rs   )r   snodesrW   buf_usesr\   s       r^   is_materializedIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized  sG    NN..s399E!!>!>>Hx*V"44599ra   c              3  \   >#    U  H!  nT" UTR                   5      (       a  M  Uv   M#     g 7frZ   r"  )r&  r!  r$  r\   s     r^   r'  r    s#      )%_S$++-Nvs   ,	,c              3  (   >#    U  H  nTv   M	     g 7frZ   r   )r&  r!  
node_numels     r^   r'  r    s     $R;QCZ;Qs   c                R  > U (       d  g[        U [        R                  5      (       a  U R                  5       $ [        U R                  [
        5      (       a  TR                  R                  U R                  5          R                  nSnU H  n[        UR                  [        5      (       d   e[        UR                  R                  [        5      (       a8  UR                  R                  5        H  nUT" UR                  5      -  nM     M    g   U$ [        U R                  [        R                  5      (       a#  [        U4S jU R!                  5        5       5      $ T	" [#        U R%                  5       5      5      n['        U R)                  5       5      [+        TU5      -  $ )Nr   c              3  n   >#    U  H*  nT" [         R                  R                  U5      5      v   M,     g 7frZ   )rI   r   
get_buffer)r&  mut_nameget_buf_bytess     r^   r'  ZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>   s/      (@H &agg&8&8&BCC(@   25)r   r    TorchBindObjectr.  ro   r2   rQ   r   r[   rW   rR   rL   r1   r   r3   r  r   rH   r  r>   	get_dtypemin)
r   rW   totrw   	sched_buf	buf_elemsbuf_accessed_elemsr.  r\   r  s
         r^   r.  GBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes  sG    c2#5#566,,..

,=>> !NN66s||~FLLEC %)$))5FGGGG%diinnkBB-1YY-B-B-D	 #}Y^^'D D .E $% !& J

BMM:: (+(>(>(@  
 !.mCLLN.K LI)#--/:S*I>  ra   )r  z
sympy.Exprr   r   )r   r   r"  Sequence[BaseSchedulerNode]r   r   )r   z<Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]r   r   )r   r  ExternKernelSchedulerNoderR   r1   r    r  op_overloadr  _prims	rng_primsgraphsafe_run_with_rng_stater  rH   
get_rangesr   collectionsr   r   r   r   rd   r  r   r   FusedSchedulerNoder  rI   r   rP  graph_inputs)r\   r  r   buf_accessesr!  r   r   r  buf_byte_accessesr  r   	buf_bytesr7  r.  r$  r)  r  s   `           @@@@@r^   r  0BaseSchedulerNode.get_read_write_buffer_accesses  s   6 d233Id566:II{<
 <
 It677499b&7&788		%%||%%BBC I	= dM**&doo/23 1! 456J
 SJ"..t4''--XX&--c2 . ''..XX&--c2 /
  B4+;+;+A+ABB 	  C4+;+;+B+BCC 		:
 d.//( )%) O -F+E,.H!$$R<;Q$R!R177111gg,,X6QWW111gg**84!Q!! !F &c*I0.7(+!+y8+c 'f ! ra   c                   U R                   c  g U R                   R                  5       nUc  g [        U5      (       d  g [        U5      n[        R
                  R                  R                  U4SS9S   n[        S   S==   U-  ss'   U$ )Nr   r  inductor
flop_count)	rR   get_origin_noder.   r-   rI   r   r  
size_hintsr   )r\   fx_nodeflopsresolved_flopss       r^   estimate_flops BaseSchedulerNode.estimate_flops  s    99))++-?G$$w'))44eX4J1M\*n<*ra   c                   U R                  5       S   R                  5       S   nUR                  R                  5       n[	        [        U5      5      (       d  g[        U R                  5      (       aA  [        U R                  [        R                  5      (       d   e [        U R                  5      $ [        U R                  5      (       a  gUR                  R!                  5       n [#        5       n[%        U5      S-  nUS::  a  ['        SU 35      eUS::  a  ['        SU 35      e U R+                  5       nUS:X  d  Uc  U R-                  5       U-  $ SnU R-                  5       n	U	c  SOU	n	X-  U-  S-  n
X-  n[/        X5      $ ! [         a  n[        R                  U5         SnAgSnAf[         a  n[        R                  U5         SnAgSnAff = f! [(         a     gf = f)z2
Returns estimated op runtime in nanoseconds (ns)
r   Nl    J)z-gpu_memory_bandwidth cannot be <= 0, but got z"gpu_flops cannot be <= 0, but got g      ?r  )r|  r   rR   r   rD   r/   rB   r   r    IRNoder&   
ValueErrorr   r  	TypeErrorrG   maybe_get_dtyper?   r=   AssertionErrorr   rO  r  max)r\   r   ro   edtypegpu_memory_bandwidth	gpu_flops	flops_estfactorcounted_bytescompute_timetransfer_times               r^   get_estimated_runtime'BaseSchedulerNode.get_estimated_runtime"  s   
 nnq!--/2))+of-.. ##dii3333
7		BB TYY
 ((*	#4#6 )%069I $q($CDXCYZ  A~$'I)%UVV 
 '')	>Y.4469MMM 99;*2*Y6#=%< <//_    2  		s7   F	 *A G 	
GF--G:GG
G)(G)c                    g rZ   r   re   s    r^   get_template_node#BaseSchedulerNode.get_template_nodec      ra   c                0    U R                  5       nUc   eU$ rZ   rd  )r\   templates     r^   get_template_node_or_throw,BaseSchedulerNode.get_template_node_or_throwf  s!    ))+###ra   c                `    [        S [        U 5       5       5      nU SU nX   nXS-   S nX#U4$ )zA
For the list of nodes, get the prologue, template, and epilogue
c              3  X   #    U  H   u  pUR                  5       (       d  M  Uv   M"     g 7frZ   r  )r&  ir  s      r^   r'  CBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>r  s     P,<DAaa,<s   *	*Nr   )next	enumerate)nodestemplate_indexprologuetemplate_nodeepilogues        r^   get_prologue_template_epilogue0BaseSchedulerNode.get_prologue_template_epiloguek  sH     PIe,<PP.)-!+-.00ra   )
r   r   r   rR   r   r   r   rQ   r   r   N)rQ   rP   r   r   )rR   ir.Operationr   r   r   )r   z	list[str]r   r  r(   r  r(   r   r   r  dict[str, str]r   r   )r!  r'   r   r   r   )r.  r   r   r   r8  OrderedSet[str]r4  r}  r   r   r   r  rm  dict[str, BaseSchedulerNode]r   r   r   r9  )r   zSequence[SchedulerBuffer])r  r   r   rN   r   r  zdependencies.Depr   r   T)r  rA   r  r   r   r   r   )r  r   r   r   r   r   )r  r   r   r   r   zdict[str, int]r   z
int | None)r   floatr   zOptional[ir.TemplateBuffer])r   zir.TemplateBuffer)rs  list[BaseSchedulerNode]r   zJtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]]);rn   r   r   r   r   r   r   r   rx   r   r   r  r  r  r  r"  r*  r  r:  r=  rG  r7  r-  ri  rn  r[   ru  r:   r}  r  r  r"   r|  r   r  r   r  rD   r  r  r  r  r  r  r  r  r  r  r  r	  r  r  rO  ra  rd  rj  staticmethodrx  r   r   ra   r^   rL   rL      sj   BB(('' NN''

&B*2+#
!.7	
?>


=#2=HV=	=
(
GT">T	T
. H H B B 
 
 
 
.&;:@F 9=*$*15*	*X 
 

 
 

 
 


!
37
	
J!!J!37J!	J!X   >0 >0@
 1&1	S1 1ra   c                  P    \ rS rSr% / SQrS\S'   S\S'   SS jrSS jrSS	 jrS
r	g)	WhyNoFuseiz  name1name2reasonr   r   r  ztuple[Any, ...]r   c                X    UR                  5       U l        UR                  5       U l        g rZ   )r[   r  r  r\   node1node2s      r^   r   WhyNoFuse.__init__  s    ^^%
^^%
ra   c                F    Xl         X l        [        R                  U 5        g rZ   )r  r   
fusion_logdebug)r\   r  r   s      r^   __call__WhyNoFuse.__call__  s    	ra   c                p    SU R                    SU R                   S3U R                  U R                  -  -   $ )Nzcannot fuse z with ri   r  re   s    r^   __str__WhyNoFuse.__str__  s6    djj\

|2>KK$))#
 	
ra   )r   r  r  r  Nr  rL   r  rL   r   r   )r  r   r   r   r   r   r   )
rn   r   r   r   	__slots__r   r   r  r  r   r   ra   r^   r  r  z  s#     5IK
&

ra   r  c                    [        U [        [        45      (       a  [        U [        S9n [
        R                  " U SS9nSU;   a  S[        R                  " US5       3$ U$ )Nkey   )rt   r       )	r   r   setsortedr   pprintrq   textwraprt   )objrv   s     r^   rq   rq     sU    #
C())Sc"^^C*Fv~HOOFG4566Mra   c                  @    \ rS rSrSS jrS	S jrS
S jrSS jr\rSr	g)r   i  c                &    [        U/5      U l        g rZ   rY  r   s     r^   r   OutputNode.__init__  s    ",cU"3ra   c                    gr]  r   re   s    r^   r  OutputNode.is_reduction  r  ra   c                    g)Nr   r   re   s    r^   r   'OutputNode.get_inputs_that_alias_output  r   ra   c                    g)NOUTPUTr   re   s    r^   r[   OutputNode.get_name  s    ra   )r   N)r!  r)   r   r   r   r   r   )
rn   r   r   r   r   r  r   r[   r   r   r   ra   r^   r   r     s    4 Hra   r   c                  ^ ^^^^ [         R                  " 5       mT R                   HU  n[        U[        5      (       a  M  TUR
                     R                  5       nTTU   R                  5       ==   S-  ss'   MW     SUUUU 4S jjm[        U4S jT R                   5       5      nU(       a?  T R                  U-
  T l        T R                  T R                  R                  U5      5        gg)aU  
Prunes weakdeps intended for mutation ordering
on an upstream fused node if after fusion there is another dependency
on the fused upstream node, making the weakdep redundant

In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
be incrementally removed, enabling other fusions, ensuring they are fused in order.
r   c                   > [        U [        5      (       aI  TU R                     R                  5       nTTU   R	                  5          S:  nTU   T:H  nU=(       d    U$ g)Nr   F)r   r*   rd   r_   r[   )r!  r_  is_redundantis_self_depr   name_to_dep_countrm  rR   s       r^   r`  +_prune_redundant_deps.<locals>.should_prune  se    c7##!#((+<<>G,-?-H-Q-Q-STWXXL -W5=K.;.ra   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7frZ   r   rc  s     r^   r'  (_prune_redundant_deps.<locals>.<genexpr>  s      .,s2C.re  Nrf  )r@  r   r   r   r*   rd   r_   r[   r   r  r   rg  )rR   rm  r   r!  r_  deps_to_pruner  r`  s   ```   @@r^   rl  rl    s     '2&9&9&;&&#w''!#((+<<>G09BBDEJE '

 
  .. M "&"9"9M"IT--::=IJ ra   c                  J   ^  \ rS rSrSU 4S jjrSS jrS	S jrS	S jrSrU =r	$ )
r:  i  c                   > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        g rZ   superr   r   r  get_read_writesr\   rQ   rR   	__class__s      r^   r   "ExternKernelSchedulerNode.__init__  5    #T"T1134ra   c                V    U R                  5        S[        U R                  SS 5       3$ )Nz.node.kernel = python_kernel_name)r[   r   rR   re   s    r^   r   )ExternKernelSchedulerNode.debug_str_extra  s*    --/"/'$))EY[_2`1abbra   c                    gNTr   re   s    r^   r  #ExternKernelSchedulerNode.is_extern  rf  ra   c                    U R                   c   e[        U R                   S5      =(       a    U R                   R                  5       $ )Nr  )rR   r   r  re   s    r^   r  *ExternKernelSchedulerNode.has_side_effects  s6    yy$$$tyy"45V$)):T:T:VVra   r   rQ   rP   rR   rz  r   r   r   r   )
rn   r   r   r   r   r   r  r  r   __classcell__r  s   @r^   r:  r:    s    5
cW Wra   r:  c                  ,   ^  \ rS rSrSU 4S jjrSrU =r$ )r  i  c                   > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        g rZ   r  r  s      r^   r   NopKernelSchedulerNode.__init__  r  ra   r   r  )rn   r   r   r   r   r   r  r  s   @r^   r  r    s    5 5ra   r  c                    ^  \ rS rSr% S\S'   S\S'         SU 4S jjr  S     SS jjr  S     SS jjr      SS	 jrS S
 jr	S!S jr
      S"S jrS#S jrS$S jrS%S jrS%S jrS%S jrS&S jrS'S jr    S(S jrS)S jr S*   S+S jjr\S,S j5       r\S,S j5       rS-S jr\S.S j5       rSrU =r$ )/r  i  z tuple[Sequence[sympy.Expr], ...]_sizesr4   _bodyc                f   > [         TU ]  U5        U R                  U5        U R                  5         g rZ   )r  r   r   _compute_attrsr  s      r^   r   SchedulerNode.__init__  s,    
 	#T"ra   c                    [        U R                  [        R                  [        R                  45      (       d   eU R                  R                  UUS9u  U l        U l        U R                  R                  5       nU R                  R                  U5      R                  nX4" U R                  5      4U l        [        R                  (       + =(       d    [        UR                   5      (       + n[        U R                  [        R                  5      (       a)  U R#                  U R                  R%                  US95        g U R#                  [&        R$                  " U R                  /U R                  Q7SU065        g )Nextra_indexing_constraintsrecompute_sizes_body_func)	normalizer  )r   rR   r    ComputedBufferTemplateBuffersimplify_and_reorderr  r  get_device_or_errorrQ   get_backendgroup_fnr   r   loop_ordering_after_fusionrD   rm   r  extract_read_writesr   )r\   r  r  r  r  should_normalizes         r^   r  SchedulerNode._compute_attrs  s9   
 $))b&7&79J9J%KLLLL"&))"@"@'A&? #A #
TZ
 ..0>>--f5>>ht{{34
  &@@@ 
KKI
 E
 dii!2!233  		--8H-I   00JJ!%8Hra   c                $    U R                  UUS9  g )Nr  )r  )r\   r  r  s      r^   recompute_size_and_body%SchedulerNode.recompute_size_and_body  s    
 	'A&? 	 	
ra   c                n   [        S U R                  R                   5       5      nU R                  [        R
                  " U R                  /U R                  Q7SU06R                  U5      5        U R                  R                  U 5        U(       a!  SSKJn  UR                  R                  5         g g )Nc              3  `   #    U  H$  n[        U[        [        45      (       d  M   Uv   M&     g 7frZ   )r   r*   r)   rB  s     r^   r'  5SchedulerNode.refresh_dependencies.<locals>.<genexpr>'  s$      0
1CZgwEW5XCC1s   .	.r  r   SIMDScheduling)r   r   r   r  r   r  r  r  r  pointwise_read_writesclear_cachecodegen.simdr  candidate_tilingscache_clear)r\   r  need_clear_tiling_cache	fake_depsr  s        r^   refresh_dependencies"SchedulerNode.refresh_dependencies"  s    
 &0 0
++110
 &
	 	,,

![[4=i	"	
 	""..t4"4 ,,88: #ra   c                    U R                   R                  U5      U l         U R                   R                  U l        U R	                  SSS9  g )NFTr  r  )r  reorder_iter_loopssizesr  r  )r\   	new_orders     r^   apply_new_loop_order"SchedulerNode.apply_new_loop_order>  sA    ZZ22

 jj&&!!E4!Pra   c                    U R                   R                  5       U l         U R                   R                  U l        U R	                  SSS9  g )NTFr  )r  merge_loopsr  r  r  re   s    r^   r  SchedulerNode.merge_loopsF  s<    ZZ++-
jj&& 	!!D%!Pra   c                   S nU R                   S   n[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       aP  [        =R
                  S-  sl        [        R                  SU R                  5       U5        U R                  U5        g [        R                  SU R                  5       5        g )Nr   r   z"Reorder loops for %s with order %szEDon't reordering %s because we can not decide the suitable loop order)
r  rs   num_varsdecide_loop_order_to_matchr!   num_loop_reorderingloop_ordering_logr  r[   r  )r\   r  r  r  
self_sizess        r^   r  'SchedulerNode.reorder_loops_by_dep_pairR  s     	[[^
z?h//E93E3EE ;;IFI''1,'##4dmmoy %%i0##Wra   c                N   U R                  5       nU SU R                  S    3U SU R                  S    3U SU R                   3/nU R                  R	                  5        H  n[        U[        5      (       a  M  UR                  n[        R                  R                  U5      n[        U[        R                  5      (       a  Mf  UR                  U S[        UR                  5       35        M     [        U R                   ["        5      (       aS  UR                  SU S35        UR                  [$        R&                  " U R                   R)                  5       S	5      5        U R*                  c   eUR-                  U R/                  5       5        S
R1                  U5      $ )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r  r   )r[   r   r  r   r  r   r*   rd   rI   r   r,  r    r1  r  rq   ro   r  r4   r  rt   rx   rR   rQ  r   join)r\   rd   linesr!  r  r   s         r^   r   SchedulerNode.debug_str_extraf  sM   }}f$TZZ]O4f'

17fIdkk]+

 ##446Cc7++88gg((2!#r'9'9::LLH:Z

8K7L!MN 7 djj(++LL6${34LL)=)=)?HIyy$$$T//12yyra   c                    U R                   $ rZ   )r  re   s    r^   r?  SchedulerNode.get_ranges|      {{ra   c                    [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  R                  5       5      $ Nztype(self.node)=)r   rR   r    r  r  rm   r   r	  re   s    r^   r  SchedulerNode.is_reduction  s^    $))b&7&79J9J%KLL 	
tDII !	
L DII00233ra   c                b   [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  [        R                  5      =(       a.    [        U R                  R                  [        R                  5      $ r  )r   rR   r    r  r  rm   r   	SplitScanre   s    r^   r  SchedulerNode.is_split_scan  s|    $))b&7&79J9J%KLL 	
tDII !	
L $))R%6%67 
JIINNBLL=
 	
ra   c                J    [        U R                  [        R                  5      $ rZ   r   rR   r    r  re   s    r^   r  SchedulerNode.is_template  s    $))R%6%677ra   c                p    [        U R                  [        R                  5      (       a  U R                  $ S $ rZ   r  re   s    r^   rd  SchedulerNode.get_template_node  s'    &tyy"2C2CDDtyyN$Nra   c                f    U R                  5         U R                  5         U R                  U5        g rZ   )r  r=  r  )r\   
index_varss     r^   runSchedulerNode.run  s#    ""$Z ra   c                (   U R                   n[        [        [        U5      5      [        [        [        U5      5      :X  d   e[	        [        [        R                  R                  U5      [        R                  R                  U5      5      5      nU$ rZ   )	r  r  maprs   dictziprE  rF  from_iterable)r\   r  r  
var_rangess       r^   ranges_from_index_vars$SchedulerNode.ranges_from_index_vars  sp     3sE?#s3sJ+?'@@@@--j9--e4

 ra   c                   U R                  U5      n [        R                  " [        [        R                  " 5       U5      5         [        R
                  R                  U 5         U R                  " U6   S S S 5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f! [         a"    [        R                  SU R                  5        e f = f)NzError in codegen for %s)r'  rI   set_ops_handlerr9   get_ops_handlerr   set_current_noder  r   r   fatalrR   )r\   r  r&  s      r^   r  SchedulerNode.codegen  s    00<
	!!"213D3D3F
"ST))$/

J' 0 UT// UT  	II/;	sA   3B)  B&B6B>B) 
B	B
B&"B) &B) ),Cc                    U(       a  U R                   O[        U R                   5      u  p#[        R                  " U R                  U[
        R                  R                  /[        U5      -  /S9$ )zL
Get the memory dependencies in either the pointwise or the reduction axes.
)hidden_args)	r  reversedr   r  r  sympySZerors   )r\   	pointwise
keep_sizesignore_sizess       r^   "pointwise_or_reduction_read_writes0SchedulerNode.pointwise_or_reduction_read_writes  sR     3<4;;$++AV 
//JJ
%'',,#lBS1S0T
 	
ra   c                     U R                  SS9$ )z8
Get the memory dependencies in the non-reduction axes.
Tr5  r8  re   s    r^   r  #SchedulerNode.pointwise_read_writes  s    
 666FFra   c                     U R                  SS9$ )z4
Get the memory dependencies in the reduction axes.
Fr;  r<  re   s    r^   reduction_read_writes#SchedulerNode.reduction_read_writes  s    
 666GGra   c                (   U R                  5       (       a  g[        S U R                  5        5       5      (       a  g[        U R                  R
                  5      S:X  a  [        U[        R                  5      (       a  [        [        U R                  R
                  5      5      n[        U[        R                  5      (       d   S[        U5      < 35       eUR                  UR                  :H  =(       a    UR                  UR                  :H  $ g)NFc              3  @   #    U  H  oR                  5       v   M     g 7frZ   )rp   r  s     r^   r'  ,SchedulerNode.can_inplace.<locals>.<genexpr>  s     ?,>S  ,>r{  r   ztype(write_dep)=)r  r)  r   rs   r   r   r   r   r(   rq  iterrm   indexsize)r\   r  	write_deps      r^   r  SchedulerNode.can_inplace  s    ?D,<,<,>???t&&'1,l,,2
 2
 T$"2"2"9"9:;Ii)?)?@@WEUT)_DVBWW@>>Y__4X)..9XXra   c                8   [        5       n[        U R                  [        5      (       a  U R                  R	                  5        H  nUR
                  S:X  d  M  UR                  S:X  d  M'  SUR                  ;   a  UR                  S   S:X  d0  [        UR                  5      S:X  d  Me  UR                  S   S:X  d  Mz  UR                  SUR                  ;   a  UR                  S   O)[        UR                  5      S:  a  UR                  S	   OS
5        M     U$ )Ncall_methodstoremode
atomic_add   r  rd      r   r   )r   r   r  r4   r|  r]   r  r   rs   r   rO  )r\   buffers_store_as_atomic_addrR   s      r^   _get_atomic_add_buffers%SchedulerNode._get_atomic_add_buffers  s    7A|#djj(++

,,.GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr / +*ra   )r  r  r   )rQ   rP   rR   z+Union[ir.ComputedBuffer, ir.TemplateBuffer]r   r   NN)r  z*Optional[tuple[dict[Any, Any], list[Any]]]r  zOptional[Callable[..., Any]]r   r   )r  r   r  r   r   r   )r  zSequence[int]r   r   r   r{  r   )r   Sequence[Sequence[sympy.Expr]]r   r  )r  Sequence[sympy.Expr]r   r   )r  rT  r   zdict[sympy.Expr, sympy.Expr])r  rT  r   r   r  )r5  r   r   r   )r   r   r  r  )rn   r   r   r   r   r   r  r  r  r  r  r  r   r?  r  r  r  rd  r  r'  r  r8  r:   r  r?  r  rQ  r   r  r  s   @r^   r  r    sk   ,,O : 
	 RVBF$N $@ 
	D RVBF
$N
 $@
 
	
;;8<;	;8Q
Q!.7	( ,4
8O!
8	%
 !%	
	
	 	
 G G H H + +ra   r  c           	     z  ^  T R                   nT R                  [        R                  R	                  U Vs/ s H  o"R
                  PM     sn5      5        [        U 4S j[        R                  " U Vs/ s H  o"R                  PM     sn6  5       5      T R
                  R                  -
  T l        g s  snf s  snf )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7frZ   rd   r  )r&  r!  group_snodes     r^   r'  2refresh_group_node_dependencies.<locals>.<genexpr>  s/      
Pxx{;;== CP   "2	2)
r"  r  r   
ReadWrites
merge_listr   r   unionr   r   )rY  r"  r  s   `  r^   refresh_group_node_dependenciesr_    s     F**6+J6aMM6+JK
 	 
!'')O1*>*>)OP
 	

 
!
!
(
(	) " ,K *Ps   B34B8rP   c                   [        U [        [        45      (       d   eX l        Xl        S U l        [        R                  " U Vs/ s H  o3R                  c  M  UR                  PM     sn6 U l        [        U 5        [        S U R                   5       5      U l        [        S U R                   5       5      U l        U R                  5        Vs0 s H  oDR                  5       U_M     snU l        g s  snf s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7frZ   r   r&  r  s     r^   r'  "init_group_node.<locals>.<genexpr>       H5G5Gr  c              3  8   #    U  H  oR                   v   M     g 7frZ   )r   rc  s     r^   r'  rd    re  r  )r   rA  GroupedSchedulerNoder"  rQ   rR   r   r^  r   r_  r3  r   rW  r   r   r[   r   )rY  rQ   r"  r  r   s        r^   init_group_noderh    s    
 k$68L#MNNNN%K&,,%	Av!+!++v	AK $K0H[5G5GHHKH[5G5GHHK'2'>'>'@#'@'@#K 
B#s   C4C4C9c                    ^  \ rS rSr% SrS\S'   \      SS j5       r\S S j5       r	      S!S jr
S"U 4S jjr\S#S	 j5       rS#S
 jr\S$S j5       rS%S jrS#S jrS#S jr      S&U 4S jjr\S$S j5       r\S$S j5       rS'S jrS#S jr\S(S j5       r\S(S j5       r\S(S j5       r\S)S j5       rS*S jr\S(S j5       rS+S jrS,S jrS-S jrS#S jr Sr!U =r"$ ).rA  i  z
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be fused together. The way it does this is by maintaining
its unmet dependencies as the union of its constituent nodes.
r  r"  c           	        UR                   UR                   L d   e[        U[        [        45      (       d   eUR	                  5       (       Ga  [        U[
        5      (       Ga  [        UR                  [        5      (       d   e[        UR                  R                  5      S:X  d   e[        [        [        UR                  R                  5      5      [        5      (       d   e[        [        UR                  R                  5      5      R                  nUR                  5        Vs/ s H  oDR	                  5       (       d  M  UPM     nn[        U5      S:X  d   eUS   n[        UR                  R                  5      S:X  d   e[        [        UR                  R                  5      5      n[        U[         5      (       d   e[#        [!        X7R$                  UR&                  UR(                  UR*                  5      /5      UR                  l
        O[        U[        [        45      (       d   e[-        [.        R0                  " UR                  5       UR                  5       5      5      nU " UR                   U5      $ s  snf )Nr   r   )rQ   r   r  rA  r  r:  rR   r1   rs   r   r   rq  rD  r)   rd   r|  r(   r   rE  	var_namesrF  rL  r   rE  rF  )	clsr  r  rd   rR   template_nodesrv  writers  s	            r^   fuseFusedSchedulerNode.fuse  s    %//111%-1C!DEEEE:e5N#O#O ejj+6666u((//0A555d4(9(9(@(@#ABGLLLLU..5567<<D/4/@W/@tDTDTDVd/@NW~&!+++*1-M}00778A===m77>>?@EeY////'1kk5??EJJ

(E$ em5G%HIIIIY__U__%68IJK5??E**! Xs   ,JJc                    [        [        S S U R                  5        5       5      5      n[        U5      S:X  a  g [	        U5      nU$ )Nc              3     #    U  HA  nUR                  5       (       d  UR                  5       (       d  M/  UR                  5       v   MC     g 7frZ   r  r  rO  ry  s     r^   r'  4FusedSchedulerNode.estimate_flops.<locals>.<genexpr>E  =       0''))T^^-= *D'')) 0
   .AAr   r   filterr|  rs   r  r\   fpsrets      r^   rO  !FusedSchedulerNode.estimate_flops?  K      $ 0	
 s8q=#h
ra   c                   U R                  5       (       a  g S nU R                   Hh  n[        U[        5      (       d   eUb<  [	        U5      [	        UR
                  S   5      :w  a  [        R                  S5          g UR
                  S   nMj     S nUc   e[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       d%  [        R                  SU R                  5       5        g [        =R                  S-  sl        [        R                  SU R                  5       U5        U R                   H+  n[        U[        5      (       d   eUR                  U5        M-     [        U 5        g )Nr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %s)r  r"  r   r  tupler  r  r  rs   r  r  r[   r!   r  r  r_  )r\   r  r  r	  snoder  s         r^   r  ,FusedSchedulerNode.reorder_loops_by_dep_pairQ  sH    
[[Ee]3333%%
*;uU\\RS_?U*U!''G aJ ! 	%%%z?h//E93E3EE ;;IFI##a ##q(#;T]]_i	
 [[Ee]3333&&y1 ! 	(-ra   c                ~   > [         TU ]  U5        [        XU5        / U l        [	        US S9R
                  U l        g )Nc                4    [        U R                  5       5      $ rZ   )r   r  r  s    r^   r   -FusedSchedulerNode.__init__.<locals>.<lambda>z  s    s1>>3C/Dra   r  )r  r   rh  rW   rW  r   r\   rQ   r"  r  s      r^   r   FusedSchedulerNode.__init__v  s6    #0%'
%DEKK
ra   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf N_r  r"  r[   r\   r  s     r^   r[   FusedSchedulerNode.get_name|  +    xxt{{;{!{;<<;   :c                <    U R                   S   R                  5       $ Nr   r"  r[   re   s    r^   ru  !FusedSchedulerNode.get_first_name      {{1~&&((ra   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf rZ   r   r^  r"  r  r  s     r^   r  #FusedSchedulerNode.get_buffer_names  0    !L1"4"4"6!LMM!L   <c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ rZ   r"  rQ  r   r\   rv   rR   s      r^   r   FusedSchedulerNode.get_outputs  /    (*KKDMM$**,-  ra   c           
        [        U R                  5       VVs/ s H+  u  pU R                  5        SU SUR                  5        3PM-     nnnU R                  S   R                  nUb  UR                  U R                  5       5        [        R                  " SR                  U5      R                  5       S5      $ s  snnf )Nz.snodes[z] =
r   r   r  )rr  r"  r[   rx   rR   rQ  r   r  rt   r  r   )r\   ro  rR   r  s       r^   r   "FusedSchedulerNode.debug_str_extra  s     %T[[1
1 }}xs%0@/AB1 	 
 {{1~""LL3356tyy/668&AA
s   2B=c                l    U R                    Vs/ s H  oR                  5       PM     nnU  SU 3$ s  snf )Nz
, snodes: )r"  r  )r\   rR   
snodes_strs      r^   r  "FusedSchedulerNode.debug_str_short  s8    9=E**,
Ez*.. Fs   1c                   > [         TU ]  X5        [        5       n[        U R                  5       H/  nUR                  X5        UR                  UR                  5        M1     g rZ   )r  r:  r   r1  r"  updater   )r\   r8  r4  rR   r  s       r^   r:  !FusedSchedulerNode.set_last_usage  sQ    
 	2G 0:|T[[)D 3H&&t7 *ra   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf rZ   )r   r^  r"  rG  r  s     r^   rG  $FusedSchedulerNode.used_buffer_names  s0    !MA"5"5"7!MNN!Mr  c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf rZ   )r   r^  r"  r7  r  s     r^   r7  /FusedSchedulerNode.used_or_aliased_buffer_names  s5    8<D1,,.D
 	
Dr  c                    U R                   $ rZ   r'  re   s    r^   r|  FusedSchedulerNode.get_nodes  r  ra   c                T    [        U 5      R                   SU R                  5        S3$ )Nz(nodes=r   r   re   s    r^   r   FusedSchedulerNode.__repr__  s'    t*%%&gdmmo->a@@ra   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frZ   )r  rc  s     r^   r'  2FusedSchedulerNode.is_reduction.<locals>.<genexpr>  s     9[>>##[r{  r)  r"  re   s    r^   r  FusedSchedulerNode.is_reduction  s    9T[[999ra   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frZ   )r  rc  s     r^   r'  3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>  s     :k??$$kr{  r  re   s    r^   r   FusedSchedulerNode.is_split_scan  s    :dkk:::ra   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frZ   rn  rc  s     r^   r'  1FusedSchedulerNode.is_template.<locals>.<genexpr>  s     8Kq==??Kr{  r  re   s    r^   r  FusedSchedulerNode.is_template  s    8DKK888ra   c                x    U R                    H*  nUR                  5       (       d  M  UR                  5       s  $    g rZ   )r"  r  rd  r\   rR   s     r^   rd  $FusedSchedulerNode.get_template_node  s3    KKD!!--//   ra   c                     U R                   S   $ r  )r   re   s    r^   r   FusedSchedulerNode.get_device  s    zz!}ra   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frZ   )r*  rc  s     r^   r'  >FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s     EA--//r{  r  re   s    r^   r*  +FusedSchedulerNode.has_aliasing_or_mutation  s    EEEEra   c                    [         erZ   NotImplementedErrorr  s     r^   r  'FusedSchedulerNode.update_mutated_names      !!ra   c                    [         erZ   r  )r\   rd   s     r^   r"  FusedSchedulerNode.add_fake_dep  r  ra   c                    [         erZ   r  r  s     r^   r  FusedSchedulerNode.can_inplace  r  ra   c                X   U R                  5       nSR                  S U R                   5       5      n[        5       nUR	                  U S[        U 5      R                   SU SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
35        UR                  5          U R                  5        H"  nUR	                  UR                  5       5        M$     SSS5        UR                  S5         UR	                  U R!                  5       5        UR)                  5       R+                  5       $ ! , (       d  f       N]= f! ["         a    [$        R'                  SSS9   NOf = f)r   rj   c              3  L   #    U  H  n[        U5      R                  v   M     g 7frZ   )rm   rn   r  s     r^   r'  /FusedSchedulerNode.debug_str.<locals>.<genexpr>  s     F+QQ 0 0+s   "$ri   r   r   r   r   r   r   z.outputs = [
            Nrk   r   Tr   )r[   r  r"  rA   r   rm   rn   rq   r   r   r   r   rt   r   rx   rl   r   r   r   r   ru   r   )r\   rd   node_typestrr   r   s        r^   rx   FusedSchedulerNode.debug_str  sx   }}xxF$++FF

bd		Q|n -j))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 	
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   )7E7:F 7
FF)(F))r   rW   r  rL   r  rL   r   rA  r  r{  rQ   rP   r"  r  r   r   r   r  r   zlist[SchedulerBuffer]r~  r  r   r  )r   torch.devicer|  )rd   r'   r   r   r  )#rn   r   r   r   __doc__r   classmethodro  r:   rO  r  r   r[   ru  r  r   r   r  r:  rG  r7  r|  r   r  r  r  rd  r   r*  r  r"  r  rx   r   r  r  s   @r^   rA  rA    s    $#+%+.?+	+ +B  "#.!#..7#.	#.JL = =) N N	B/8#28HV8	8 O O 
 

A : : ; ; 9 9   F F
"""* *ra   rA  c                  z  ^  \ rS rSr% Sr    SS jr    SS jr\SS j5       r\      SS j5       r	   S             SU 4S jjjr
\    SS j5       r\    SS	 j5       r\rS
\S'   \    SS j5       r\    SS j5       rSS jrSS jrS S jrS!S jrS"S jrS#S jr    S$S jrSrU =r$ )%ForeachKernelSchedulerNodei  z
This is a schedular node that consists of a set of scheduler nodes that
has no data dependencies among them and can be executed in parallel.
c                    UR                  5        H@  nUR                  5       U R                  ;   d  M#  U R                  UR                  5          s  $    g rZ   )r   r[   read_to_node)r\   producerr   s      r^   get_consumer_subnode_for3ForeachKernelSchedulerNode.get_consumer_subnode_for  sG     '')C||~!2!22((88 * ra   c                   [         [           " 5       nUR                  R                   H  nUR                  U R
                  R                  ;  a  M)  U R
                  R                  UR                     R                  5       nX@R                  ;   d  Mk  UR                  U R                  U   5        M     [        U5      S:X  a  [        [        U5      5      $ g Nr   )r   rL   r   r   rd   rQ   r   r_   name_to_noderO  rs   rq  rD  )r\   consumer	producersrd	node_names        r^   get_producer_subnode_for3ForeachKernelSchedulerNode.get_producer_subnode_for  s     013	&&,,Bwwdnn88822277;LLNI---d//	:; - y>QY((ra   c                  ^ [        TU5      nTR                  5       (       a  UR                  5       (       a  [        R                  " [        T5      m[        R                  " [        U5      n[        TR                  5      [        UR                  5      :H  nU(       d  U" S5        U=(       a3    [        U4S j[        TR                  UR                  5       5       5      $ UR                  5       (       ar  TR                  5       (       a	  U" S5        g[        R                  " [        U5      nUR                  T5      nUb  UR                  R                  TU5      $ U" S5        gTR                  5       (       aq  UR                  5       (       a	  U" S5        g[        R                  " [        T5      mTR                  U5      nUb  TR                  R                  Xb5      $ U" S5        g[        S5      e)	Nzforeach do not have same lengthc              3  ^   >#    U  H"  u  pTR                   R                  X5      v   M$     g 7frZ   )rQ   can_fuse)r&  lrr  s      r^   r'  6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>$  s.      )ADA ""++A11A   *-zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  r  typingcastr  rs   r"  r  r$  r  r  rQ   r  r  rV  )rl  r  r  whyforeach_matchconsumer_subnodeproducer_subnodes    `     r^   r  #ForeachKernelSchedulerNode.can_fuse  s   (+  X%8%8%:%:{{#=xHH{{#=xHH0C4HHM 56  S )A) &    ""$$&&n {{#=xHH'@@J+))228=MNNGH  ""$$&&n {{#=xHH'@@J+))223CNNGHf
 	
ra   c           	     `   UR                  5       (       d  UR                  5       (       d   eUR                  5       (       a4  [        R                  " [        U5      nUR                  nUR
                  nO3[        R                  " [        U5      nUR                  nUR
                  nS nS nUR                  5       (       a  UR                  5       (       a  [        R                  " [        U5      n[        R                  " [        U5      n[        UR                  UR                  5       VVs/ s H  u  px[        R                  Xx5      PM     n	nnGO?UR                  5       (       a  [        R                  " [        U5      nUR                  U5      n
/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     OUR                  5       (       a  [        R                  " [        U5      nUR                  U5      n/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     O[        S5      eU " UR                  U	UUUUS9$ s  snnf )NzTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)r  r  r  r  r  r  r$  r"  rA  ro  r  r  r  rV  rQ   )rl  r  r  r  r  r  r  r  r  fused_nodesr  rR   new_noder  s                 r^   ro  ForeachKernelSchedulerNode.fuseJ  s\    ""$$(;(;(=(===  {{#=xHH(0(J(J%&66O{{#=xHH(0(J(J%&66O  X%8%8%:%:{{#=xHH{{#=xHH  AADA #''-A  K   ""{{#=xHH'@@JK"KK +166tFH"*K&&x0&&t, (   ""{{#=xHH'@@JK"KK +166xFH"*K&&x0&&t, ( !f  &?##+
 	
Ks   0!J*c                  >^  0 T l         0 T l        Ub  Ucv  [        TT ]  X5        U H_  nUR                  R
                   H  nUT R                   UR                  '   M     UR                  5        H  n	UT R                  U	'   M     Ma     GOUT l        UT l	        S T l
        / T l        T R                  [        R                  R                  UR                  UR                  /5      5        [!        U 4S j[         R"                  " UR$                  UR$                  5       5       5      T R                  R&                  -
  T l        [)        UR*                  UR*                  /5      T l        [-        UR.                  UR.                  /5      T l        UR1                  5       (       a  [3        U[4        5      (       d   eXEpO[3        U[4        5      (       d   eXTpU
R6                  T l        T R6                  R9                  UR6                  5        U
R                  T l        UR                  5        H  n	UT R                  U	'   M     T R                   VVVs0 s H(  oR:                  R=                  5         H  u  pX_M	     M*     snnnT l        UT l        US   RA                  5       nU(       d   eU[B        RD                  " S5      444T l#        [         [H        RJ                  RL                     " 5       T l'        UT l(        g s  snnnf )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7frZ   rX  rW  s     r^   r'  6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>  s5        xxt'<'<'>>	 C r[  r   combo_kernel))r  r  r  r   r   r   rd   r}  rQ   r"  rR   rW   r  r   r\  r]  r   r^  r   r   r3  r   rW  r   r  r   r  r   r  r   itemsr  r   r2  Exprr   r  fxNoder  r  )r\   rQ   r"  r  r  r  r  rR   r  rd   foreach_node
other_noder  r3  vr  r  s   `               r^   r   #ForeachKernelSchedulerNode.__init__  s    +"5GY/ ,,22D37D%%dii0 3 !446D.2D%%d+ 7	  'DN DKDI)+DJ  ''22 ,,k.E.EF  )//#668V8V   ""))* # !+"7"79N9N!OPDN +"7"79N9N!OPDN%%''!+/IJJJJ+6j!+/IJJJJ+6j)33DNNN!!*"6"67 , 9 9D"668*4!!$' 9 #'++@"-:O:O:U:U:W$!:W+@D  *C&%%'v

> :<>?
!%((--02.@s   /Lc           	     x   U Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       aW  [        R                  S[	        U5      U Vs/ s H+  oDR
                  c  M  UR
                  R                  5       PM-     sn5        U Vs/ s H"  n[        U[        [        45      (       a  M   UPM$     nnU Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       a  [        R                  S[	        U5      5        U Vs/ s H  n[        U[        5      (       a  M  UPM     nnU Vs/ s H  o"R                  5       (       d  M  UPM     nnU(       a   [        R                  S[	        U5      U5        U Vs/ s H  o"U;  d  M
  UPM     nnU$ s  snf s  snf s  snf s  snf s  snf s  snf s  snf )Nz/ComboKernels: %d external nodes are filtered %sz+ComboKernels: %d foreach nodes are filteredz0ComboKernels: %d template nodes are filtered: %s)
r   r:  r   r  rs   rR   r  r  r  r  )rl  rs  r  externrR   filtered_nodesforeach_nodesrm  s           r^   combinable_nodes+ForeachKernelSchedulerNode.combinable_nodes  s|    #OUj4M&N!UOIIAF5;UVTyy(&&(VU 
a"8:S!TU  	 
 &
%!A7Q)RA~ 	 
 IICSEWX%
%!Z;U-VA~ 	 
 &4G^}}!^GIIBN#
 &4O^7N!^O9 P
 V




 H PsR   FFF#FF#/F#;F(F(
F-'F-3F2F2	F7F7c           
         U R                  5       n/ nSnU H=  nUR                  [        S[        U5      U5       Vs/ s H	  nXEXS-    PM     sn5        M?     U$ s  snf )zC
Returns a list of lists of nodes that are to be grouped together.
   r   )_topological_sort_nodesrQ  rangers   )rQ   sorted_nodesgrouped_nodesmax_num_nodesrs  ro  s         r^   &_default_group_nodes_for_combo_kernelsAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels  ss     !88:!E   #1c%j-@@ a/0@ " s   A
4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelsc                    U [         l        g rZ   r  r  )custom_group_algorithms    r^   %set_group_algorithm_for_combo_kernels@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels  s    
 # 	#Dra   c                ,    [         R                  U 5      $ rZ   r  rQ   s    r^   group_nodes_for_combo_kernels8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels  s     *KKIVVra   c                    [         erZ   r  re   s    r^   r=  #ForeachKernelSchedulerNode.mark_run  r  ra   c                    [         erZ   r  re   s    r^   r  "ForeachKernelSchedulerNode.codegen  r  ra   c                    gr  r   re   s    r^   r  %ForeachKernelSchedulerNode.is_foreach!  rf  ra   c                ,    [        U R                  5      $ )z]Returns a list of nodes which comprise the combo kernel.
These nodes may be vertically fused.)r   r"  re   s    r^   get_subkernel_nodes.ForeachKernelSchedulerNode.get_subkernel_nodes$  s     DKK  ra   c                t    [        [        R                  R                  S U R                   5       5      5      $ )ziReturns all nodes contained in this kernel, unpacking fused nodes
into their constituent scheduler nodes.c              3  @   #    U  H  oR                  5       v   M     g 7frZ   )r|  rc  s     r^   r'  7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>,  s     1UA++--r{  )r   rE  rF  r%  r"  re   s    r^   r|  $ForeachKernelSchedulerNode.get_nodes)  s(     IOO111U1UUVVra   c                <    U R                   S   R                  5       $ r  )r"  ru  re   s    r^   ru  )ForeachKernelSchedulerNode.get_first_name.  s    {{1~,,..ra   c                    [        XU R                  R                  5        U R                   H  nUR	                  U5        M     g rZ   )rl  rQ   r   r"  rn  )r\   rm  rR   s      r^   rn  /ForeachKernelSchedulerNode.prune_redundant_deps1  s5     	d8R8RSKKD%%&89  ra   )r   r  r   r   r   r  rR   r  r   r  rQ   r"  r   r  rW   )r  rL   r   rS   )r  rL   r   rS   r  rL   r  rL   r   r   )r  rL   r  rL   r   r  )NNF)rQ   rP   r"  r  r  r   r  rS   r  rS   r  r   r   r   rs  r  r   r  )rQ   rP   r   list[list[BaseSchedulerNode]])r  r  r   r   r   r   r   r  r  r   r  )rn   r   r   r   r  r  r  r  r  ro  r   r  r  r  r  r   r  r#  r=  r  r  r,  r|  ru  rn  r   r  r  s   @r^   r  r    s   
)	$)	$& ,
 ,
\ >
(>
4E>
	#>
 >
J 4837 %F/F/ (F/ $(	F/
 1F/ 1F/ F/ 
F/ F/P +	  B 	& * 	/ & ( / 
 T
	
 
 WW	&W W
""!
W
/:">:	: :ra   r  c                     ^  \ rS rSr% SrS\S'   \SS j5       rSU 4S jjrSS jr	SS jr
\SS	 j5       rSS
 jr\SS j5       rSS jr\SS j5       rSS jr\SS j5       rSrU =r$ )rg  i:  a'  
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be *grouped* together (it does not allow another node to be scheduled
in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
Fusion will still happen among the nodes within each GroupedSchedulerNode.
At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
r  r"  c                   ^ US   R                   m[        U4S jU 5       5      (       d   eU " TU5      nU H   nUTR                  UR                  5       '   M"     UTR                  UR                  5       '   U$ )Nr   c              3  >   >#    U  H  oR                   TL v   M     g 7frZ   r"  )r&  rR   rQ   s     r^   r'  .GroupedSchedulerNode.create.<locals>.<genexpr>I  s     B64>>Y.6s   )rQ   r  rm  r[   )rl  r"  grouped_snoder  rQ   s       @r^   createGroupedSchedulerNode.createF  su    1I''	B6BBBBBIv.E=JI(()9: AN	$$]%;%;%=>ra   c                <   > [         TU ]  U5        [        XU5        g rZ   )r  r   rh  r  s      r^   r   GroupedSchedulerNode.__init__P  s    #0ra   c                   U R                    H)  nXR                  R                  UR                  5       '   M+     U R                  R                  U R                  5       	 U R                  R	                  U R                   5      $ )zw
Do fusion among nodes within this GroupedSchedulerNode,
and then unpack this GroupedSchedulerNode into regular nodes.
)r"  rQ   rm  r[   
fuse_nodes)r\   r  s     r^   unpackGroupedSchedulerNode.unpackT  s\    
 [[EBGNN--enn.>? !NN--dmmo>~~((55ra   c                    U R                  U R                  R                  U5      5        U R                  R	                  U5        g rZ   )r  r   r  r   rO  )r\   fake_deps     r^   r"  !GroupedSchedulerNode.add_fake_dep^  s5    T--77AB##H-ra   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf r  r  r  s     r^   r[   GroupedSchedulerNode.get_nameb  r  r  c                <    U R                   S   R                  5       $ r  r  re   s    r^   ru  #GroupedSchedulerNode.get_first_namef  r  ra   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf rZ   r  r  s     r^   r  %GroupedSchedulerNode.get_buffer_namesi  r  r  c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ rZ   r  r  s      r^   r    GroupedSchedulerNode.get_outputsm  r  ra   c                    [        [        S S U R                  5        5       5      5      n[        U5      S:X  a  g [	        U5      nU$ )Nc              3     #    U  HA  nUR                  5       (       d  UR                  5       (       d  M/  UR                  5       v   MC     g 7frZ   rs  ry  s     r^   r'  6GroupedSchedulerNode.estimate_flops.<locals>.<genexpr>y  ru  rv  r   rw  ry  s      r^   rO  #GroupedSchedulerNode.estimate_flopss  r}  ra   c                    U R                   $ rZ   r'  re   s    r^   r|  GroupedSchedulerNode.get_nodes  r  ra   c                    gr]  r   )rl  r  r  s      r^   r  GroupedSchedulerNode.can_fuse  s     ra   r   )r"  r  r   rg  r  r9  )rH  r'   r   r   r   r  r  r  r  r6  )rn   r   r   r   r  r   r  r?  r   rE  r"  r:   r[   ru  r  r   rO  r|  r  r   r  r  s   @r^   rg  rg  :  s     $# 16. = =) N N  "  ra   rg  c           
     0  ^ ^ [         R                  SUU 4S jj5       n[        [        [	        [        T S   5      5      5      5      n[        U5      S:  a  U Vs/ s H  nT U   PM
     snm [        R                  (       a  UR                  US9  U$ s  snf )zu
A heuristic to decide loop iteration orders.  This has not been well
tuned and may be something we should autotune.
c                z  > TU    S:X  d	  TU   S:X  a  [        TU    S:H  TU   S:H  5      $ T Vs/ s H  n[        X    5      PM     nnT Vs/ s H  n[        X!   5      PM     nn[        S [        X45       5       5      n[        S [        X45       5       5      nXV:  a  gXe:  a  g[        X5      $ s  snf s  snf )Nr   c              3  F   #    U  H  u  pUS :H  =(       d    X:  v   M     g7fr   Nr   r&  sl_asl_bs      r^   r'  5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  $      
7VDAI$$7V   !c              3  F   #    U  H  u  pUS :H  =(       d    X!:  v   M     g7fr]  r   r^  s      r^   r'  ra    rb  rc  r  )r;   absr  r$  )	abslstride_len_astride_len_ba_firstb_firstr  stride_lengthss	          r^   	index_cmp"pick_loop_order.<locals>.index_cmp  s    8q=E!HMuQx1}eAh!m44 .<<^rBE
^<-;<^rBE
^<  
7:<7V
 
  
7:<7V
 
  1y# =<s   B3B8r   r  )rf  r   rg  r   r   r   )		functools
cmp_to_keyr   r1  r  rs   r   pick_loop_orderssort)rm  r  priority_idxrn  orderpis   ``    r^   pick_loop_orderrw    s      4 %N1$5 6789E
<17CD|.,|D

y
!L Es   Bc                  d    \ rS rSr% S\S'   SrS\S'   SrS\S'   SS jrSS	 jrSS
 jr	SS jr
Srg)NodeUseri  $Union[BaseSchedulerNode, OutputNode]rR   Fr   r  is_weakc                v    [        U R                  R                  5       U R                  U R                  45      $ rZ   )rc   rR   r[   r  r{  re   s    r^   rf   NodeUser.__hash__  s+    TYY'')4+;+;T\\JKKra   c                    [        U[        5      =(       aa    U R                  5       UR                  5       :H  =(       a9    U R                  UR                  :H  =(       a    U R                  UR                  :H  $ rZ   )r   ry  r[   r  r{  r\   others     r^   __eq__NodeUser.__eq__  s[    uh' .5>>#33.  E$5$55. -		
ra   c                6    U R                   R                  5       $ rZ   r{   re   s    r^   r[   NodeUser.get_name  r}   ra   c                    U R                   UR                   L d   e[        U R                   U R                  =(       a    UR                  U R                  =(       a    UR                  5      $ rZ   )rR   ry  r  r{  r  s     r^   r   NodeUser.merge  sP    yyEJJ&&&II2!2!2LL*U]]
 	
ra   r   Nr   )r  objectr   r   r   )r  ry  r   ry  )rn   r   r   r   r   r  r{  rf   r  r[   r   r   r   ra   r^   ry  ry    s3    
..K GTL
$
ra   ry  c                  N  ^  \ rS rSr% SrS\S'   SLS jrSLU 4S jjrSMS jr\	SNS j5       r
\
R                  SOS	 j5       r
SPS
 jrSQS jrSRS jrSPS jrSPS jrSPS jr    SSS jrSTS jrSUS jrSPS jrSPS jrSSS jrSPS jr    SVS jr      SWS jr      SXS jrSPS jrSYS jr      SZS jrS[S jr    SSS jr S\S]S jjr!S^S  jr"    S_S! jr#      S`S" jr$      S`S# jr%      S`S$ jr&        SaS% jr'      SbS& jr(ScS' jr)        SdS( jr*S`S) jr+      S`S* jr,        SeS+ jr-SfS, jr.SgS- jr/      SbS. jr0    ShS/ jr1    SiS0 jr2SPS1 jr3SPS2 jr4SPS3 jr5SjS4 jr6SkS5 jr7SlS6 jr8SmS7 jr9      SnS8 jr:ScS9 jr;  SoS: jr<    SpS; jr=      SqS< jr>      SrS= jr?    SsS> jr@    SSS? jrA    SSS@ jrB    SSSA jrC  StSB jrDSPSC jrE      SuSD jrFSPSE jrGS^SF jrH    SvSG jrISwSH jrJSxSI jrKSPSJ jrLSKrMU =rN$ )yrP   i  z
A Scheduler is a graph of BaseSchedulerNodes. It is responsible for
optimizations such as fusion, reorder, and graph partition.
zdict[Dep, int]_Scheduler__dep_size_hint_cachec                p    [        S5         U R                  U5        S S S 5        g ! , (       d  f       g = f)NzScheduler.__init__)r   _initr\   rs  s     r^   r   Scheduler.__init__  s#    ./JJu 0//s   '
5c           
       >^  [         TT ]  5         0 T l        T [        R                  l        0 T l        [        [        5      T l	        [        R                  " 5       T l        [        5       T l        [        / [        R                  R                  R!                  5       Q[        R                  R"                  R!                  5       Q[        R                  R$                  R!                  5       Q5      T l        U Vs/ s H  nT R)                  U5      PM     snT l        T R-                  5         T R&                  R/                  [        R                  R"                  R!                  5       5        T R*                   H  nUR1                  5         M     T R3                  5       T l        T R*                   Vs0 s H  o"R7                  5       U_M     snT l        T R*                   VVs0 s H*  o3R;                  5         H  oDR7                  5       U_M     M,     snnT l        T R8                  R?                  5       T l         0 T l!        0 T l"        [F        RH                  " T R*                  T R<                  T R@                  5      T l        T RK                  5         T RM                  T R*                  5      T l        T RO                  5         T R*                   Vs0 s H  o"R7                  5       U_M     snT l         T RQ                  5         [R        =RT                  [W        T R*                  5      -  sl*        SSK,J-nJ.n  U" T R*                  5        [W        T R*                  5      T l/        T Ra                  5         T RM                  T R*                  5      T l        [        [b        [d        [d        4      " 5       T l3        [h        Rj                  b%  [h        Rj                  " T R*                  5      T l        T Rm                  T R*                  5      T l        [h        Rn                  b%  [h        Rn                  " T R*                  5      T l        T Rq                  5         T Rs                  5         [h        Rt                  (       a  T Rw                  S S9  [h        Rx                  (       a  SSK=J<n  U" T R*                  T R<                  T R@                  [        [        R                  R                  R!                  5       5      [        [        R                  R}                  5       5      5      T l        [h        R~                  (       a%  [F        R                  " T R*                  5      T l        T R                  5         [        R                  Rh                  R                  (       a@  T R                  T R*                  5      T l        T R                  T R*                  5      T l        T R                  5         U" T R*                  5        [        R                  R                  T R*                  5        T R                  5         [        5       T lK        0 T lL        [        S5      R                  U 4S j5        g s  snf s  snf s  snnf s  snf )Nr   )log_ir_post_fusionlog_ir_pre_fusion)num_ck_nodesr   )reorder_for_peak_memorygraph_statsc                 ^   > T R                   T R                  [        T R                  5      S.$ )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesrs   rs  re   s   r^   r   !Scheduler._init.<locals>.<lambda>`  s%     33+/+>+>*-djj/ra   )Or  r   r  rI   r   rQ   backendsrq  _post_grad_graph_counterr  rE  count_graph_partition_counterr   r  rB  keys	constantstorchbind_constantsrV  create_scheduler_noders  update_zero_dim_cpu_tensorr  r-  get_donated_buffersr   r[   r  r   r   copyrm  r4  mutation_renamesr   decide_global_ordering_of_commscompute_dependenciestopological_sort_scheduledead_node_eliminationcompute_ancestorsr!   ir_nodes_pre_fusionrs   torch._inductor.debugr  r  r  create_foreach_nodesr  r   logged_slow_fusionr   _pre_fusion_custom_passrD  _post_fusion_custom_passr  finalize_multi_template_bufferscombo_kernelscreate_combo_kernel_nodesr  memoryget_output_names reorder_for_compute_comm_overlap$reorder_compute_and_comm_for_overlapprocess_grouped_nodesr  r  graph_partition&maybe_reorder_for_minimizing_partition,reorder_for_partition_with_simple_dependencycompute_last_usager  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_row)	r\   rs  r  rR   r   r  r  r  r  s	   `       r^   r  Scheduler._init  s   %'" <>"&'?"@(1(9%5?\!&0%%**,""'') ,,113'
# >CCUd003UC
'')##**177+<+<+A+A+CDJJDOO  $$& 	# &*ZZ;
%/JJL!OZ;
 -1JJ8
,6DBRBRBT3LLNCBTNJ8
 AE@Q@Q@V@V@X 35 13 ::JJ##

 	!!#33DJJ?
""$<@JJ"GJq::<?J"G ##s4::6#O$**%!$**o!!#33DJJ?
",U38_"="?))577

CDJ__TZZ0
**688DDJ,,.***= ))70

  ''177//44671773356DJ 22CCDJJODJ""$??!!11DDTZZPDJJJ4::VDJ!4::&	djj) 6@\! :<'//	
G D;
8
B #Hs   =Y/7Y4(1Y9Y?c                   0 n[         R                  R                   Hg  n[        [         R                  R                  U   [        R
                  5      (       d  M?  [        U [         R                  R                  U   S S9X'   Mi     U$ )N)rT   )rI   r   graph_inputs_originalr   r    DonatedBufferr   )r\   name_to_donated_bufrd   s      r^   r  Scheduler.get_donated_buffersg  sl     GG11D!''77=r?O?OPP,BGG11$7 $-#) 2 #"ra   c                6    [         R                  R                  $ rZ   rI   r   current_devicere   s    r^   r  Scheduler.current_devicer  s    ww%%%ra   c                .    U[         R                  l        g rZ   r  r  s     r^   r  r  v  s    !'ra   c                |    [         R                  R                  SS5      S:X  a  SSKJn  U" U R
                  SS9  gg)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr2  r  r  rs  )r\   r  s     r^   r  Scheduler.debug_draw_graphz  s1    ::>>:DASH+6 Ira   c                    [         R                  [        R                  5      (       a:  [         R	                  SU5        U R
                   H  nUR                  5         M     g g )Nz%s:)r   isEnabledForloggingINFOr  rs  r  )r\   labelrR   s      r^   debug_print_nodesScheduler.debug_print_nodes  sD    GLL))HHUE"

  " # *ra   c                P   UR                  5       c   S5       eUR                  5       (       a  [        X5      $ [        U[        R
                  [        R                  45      (       a  [        X5      $ [        U[        R                  5      (       a  [        X5      $ [        U5      e)Nz2All nodes passed to scheduling must have an origin)r  is_no_opr  r   r    r  r  r  ExternKernelr:  r  r  s     r^   r  Scheduler.create_scheduler_node  s    !- 	
@	
- ==??)$55r00"2C2CDEE ,,boo..,T88%d++ra   c                   [        5       n/ nU R                  R                  5       n[        R                  R
                  R                  5        H  nU Vs/ s H0  nXS;   d  M
  [        U R                  U   [        5      (       a  M.  UPM2     nnU(       d  MI  UR                  U5        U Vs/ s H  oPR                  U   PM     nn[        R                  S:  n[        U USUS9nUR                  U5        U H  nXR                  U'   M     M     U R                   V	s/ s H  oR!                  5       U;  d  M  U	PM     sn	[#        U5      -   U l        g s  snf s  snf s  sn	f )Nr   Fr  r  )r   rm  r  rI   r   listsr   r   r  r  r  r   combo_kernels_autotuner  r  rs  r[   r   )
r\   removed_node_namesfe_nodeskept_node_namesnamesrd   r"  r  fe_noderR   s
             r^   r  Scheduler.create_foreach_nodes  sK   .8l11668WW]]))+E "!D*  #4#4#4T#:<RS !   %%e,:?@%$''-%F@$;;a?O0*/ /	G OOG$07''- 1 ,8 "ZZ
'T==?BT+TDZ
N
5 A
s$   	E# EE-E E ;E c                  ^ ^^ ^! [        S5      n " U4S jS[        U   5      m[        R                  " T5      m T R                   H  nUR                  5        H  nUR                  5       nUR                  5        He  nUT ;   aD  UT ;   a>  T U   nT U   nXg-   nT R                  5        H  n	T U	   UL d
  T U	   UL d  M  UT U	'   M     MM  UT ;   a
  T U   T U'   M]  T U   T U'   Mg     M     M     SU!U 4S jjm!  S         SU U!4S jjjn
0 n[        R                  R                  R                  5        H  u  p[        U[        R                  5      (       a  UR                    H  nSX'   M	     M=  [        U["        R$                  5      (       d  M^  UR'                  5        Vs/ s H&  n[        U[        R                  5      (       d  M$  UPM(     nnU H  nUR                    H  nSX'   M	     M     M     T R                   GH  n[(        R+                  SUR,                  5        UR,                  c   e[/        UR,                  R1                  5       S S	9nU H=  n[        U[        R2                  5      (       d   eX;  d  M+  UR                  5       X'   M?     [/        UR,                  R5                  S
S9S S	9nU Hi  nX;   d   U SU 35       eX   =nc  M  T R6                  U   R                  5        H+  nUR9                  [;        UR                  5       5      5        M-     Mk     [=        UR>                  R@                  5      S:X  aQ  [C        [E        UR>                  R@                  5      5      =n(       a"  [        U[F        5      (       a  URH                  nOSnUR                  5        GH  n[=        URK                  5       5      S::  d   eURK                  5        H  nT!" U5      nU
" UU5        UR9                  [;        UUS95        T U   R                   H  nUR                  5       UR                  5       :X  a  M'  [        UR,                  [L        5      (       d   eUR,                  RO                  5        H:  nT!" U5      nUR9                  [Q        UUR                  5       S95        U
" UUS
S9  M<     M     M     GM     UR>                  RR                   H<  n[        U[P        5      (       a  M  U
" URT                  X"RW                  U5      5        M>     URY                  T RZ                  5        UR                  5        H  nURK                  5        Hz  nUR                  5       T RZ                  T!" U5      '   UR                  5       T RZ                  U'   T R\                  R_                  UU5      T R\                  UR                  5       '   M|     M     GM     [        R                  Ra                  5        H4  n[(        R+                  SU5        U
" U[c        [;        U5      5      5        M6     [        R                  Rd                   H  nUR5                  S
S9 H  nX;   d   U SUR                  5        35       eX   =n(       d  M/  T R6                  U   RO                  5        H5  n[(        R+                  SUU5        U
" U[c        [;        U5      5      5        M7     M     M     T RZ                   H  nU[        R                  R                  ;   aF  U
" U[c        [;        U5      5      5        [        R                  Rf                  Ri                  U5        Mg  U[        R                  Rj                  ;   d  M  U
" U[c        [;        U5      5      5        M     [m        [        R                  R                  R                  5       5       VVs0 s H  u  noU_M
     nnn[        R                  Rf                   Vs/ s H  nUU   PM
     sn[        R                  l7        T R                   HF  nUR                  5        H/  nURq                  T UR                  5          R                  5        M1     MH     T Rr                   H.  nT Rr                  U   Rq                  T U   R                  5        M0     gs  snf s  snnf s  snf )zQ
Create dependency edges between nodes, handling aliasing and
mutation properly.
Tc                  P   > \ rS rSrSr  S     S	S jjrS
S jrSU 4S jjrSrg)1Scheduler.compute_dependencies.<locals>.DedupListi  a  
This data structure behaves like a list except it makes sure the
elements remain unique.
Normally one could use a OrderedSet/dict for this purpose however
the list in question gets elements appended as it is being
iterated over which means that we need to keep the list
semantics.
Nc                T    U=(       d    / U l         U=(       d
    [        5       U l        g rZ   )r  r   
membership)r\   r  r  s      r^   r   :Scheduler.compute_dependencies.<locals>.DedupList.__init__  s    
 #[b
","<
ra   c                    XR                   ;   a  g U R                  R                  U5        U R                   R                  U5        g rZ   )r  r  r  rO  )r\   	node_users     r^   r  8Scheduler.compute_dependencies.<locals>.DedupList.append  s3    /

!!),##I.ra   c                   > [         R                  " U R                  UR                  5      nU R                  UR                   Vs/ s H  o3U R                  ;  d  M  UPM     sn-   nT" XB5      $ s  snf rZ   )r   r^  r  r  )r\   r  new_membershipr  	new_items	DedupLists        r^   __add__9Scheduler.compute_dependencies.<locals>.DedupList.__add__  sc    !+!1!1$//5CSCS!T JJ${{**!t.FA{* 	 !;;*s   A0A0)r  r  rS  )r  zOptional[list[T]]r  zOptional[OrderedSet[T]]r   r   )r  r  r   r   )r  DedupList[T]r   r  )	rn   r   r   r   r  r   r  r  r   )r  s   r^   r  r    s@     ,06:=(= 4= 	=/< <ra   r  c                R   > U TR                   ;   a  T" TR                   U    5      $ U $ rZ   )r  )r  r  r\   s    r^   r  .Scheduler.compute_dependencies.<locals>.rename  s,    D)))d33A677Hra   c                N   > TT" U 5         R                  [        XU5      5        g rZ   )r  ry  )used_by_namer  r  r{  name_to_usersr  s       r^   add_user0Scheduler.compute_dependencies.<locals>.add_user  s'     &./669ra   Nzscheduling %sc                    U R                   $ rZ   rA  r  s    r^   r   0Scheduler.compute_dependencies.<locals>.<lambda>!	  s    AFFra   r  T)unbacked_onlyc                    U R                   $ rZ   rA  r  s    r^   r   r  ,	  s    RSRXRXra   z not in r   )rL  )mutating_buf)r{  zscheduling output %sz+scheduling output %s for unbacked symint %s)r  r   r   r   )FF)
r  r   r  rz  r  r   r{  r   r   r   ):r   r   r@  r   rs  r   r[   rp   r  rI   r   rB  r  r   r2  r  r   r    	TensorBoxr  r   r  rR   r  get_unbacked_symbol_defsSymbolget_free_symbol_usesr  r"  r)   rs   r   r   rq  rD  r(   rL  rr   rL   r  r*   r   rd   r  r  r  r4  r2  r  r   graph_outputsmutated_inputsrO  r  rr  mutated_input_idxsr   r   )"r\   r  rR   buf1	buf1_name	buf2_namelist1list2combinedr  r  unbacked_symbol_to_origin_noderd   valfsr  sym_sizeunbacked_symbol_defsunbacked_symbol_usesr  r   r!  	node_modealt_namerw   
other_namer  r  r   rE  	inp_namesr  r   r  s"   `                              @@@r^   r  Scheduler.compute_dependencies  s    CL	<
 	<> @K?V?V@
 JJD((* MMO	!%!1!1!3I M1i=6P -i 8 -i 8#(=#0#5#5#7C -c 2e ;#0#5#>5=c 2 $8 #m33@3Ki03@3Ki0 "4 + (	 	 !&!			;	 	 		
 	 	 MO&
 --335ID#uzz****B9=26 +C.. (+||~S~!Auzz9RA~S!Ann=A6: - " 6 JJDIIotyy1 99(((#)		224:J$  *!!U\\2222 :8<25 * $*		..T.BHX$  *: c"@!AB: 8::AG#003??A))'#,,.*AB  B * D$$++,1 d&6&6&=&=!>??S?sI..HH	 	 '')3,,./1444 # 1 1 3H%h/HXt,%%ghY&GH -h 7 = ===?dmmo=$)$))5FGGGG*.))*D*D*FJ)/
);J -- '
 P %ZtD +G !> !4 *, ((..!$00TYY.>.>t.DE / %%d&;&;< '') # 1 1 3H>AllnD))&*:;69llnD))(3//33HhG ++CLLN; !4 *I Z 002HII,h7Xz'(*;<= 3
 77((C--D-A: c"@"E"E"G!HI: 79919$($5$5a$8$I$I$K		I8UV !:gh6G+HI	 %L B ) ))Dqww+++z'$-89&&**40***z'$-89 * ,5QWW5I5I5N5N5P+Q
+QKE4%K+Q 	 
 )*(>(>&
(>IdO(>&
"
 JJD'')mCLLN;AAB *  //D''-77d8K8Q8QR 0q TX
&
s   '#c
c
"ccc                  ^	 / n[        U R                  5       GH  nSS jm	SnUR                  5        H  n[        U	4S jUR                   5       5      nU(       a]  [
        R                  SUR                  5       5        [        R                  R                  R                  UR                  5       5        M  SnM     UR                  5       (       + =(       a    U(       + nU(       d  UR                  U5        M  [
        R                  SUR                  5       5        [        R                  R                  R                  UR                  5       5        UR                  R                    H  nUR"                  U R$                  ;   d  M  U R$                  UR"                     R                  nU Vs/ s H2  oR&                  R                  5       UR                  5       :w  d  M0  UPM4     snU R$                  UR"                     l        M     GM     [)        [        U5      5      U l        U R                   H  nUR+                  5         M     gs  snf )	z 
Remove any nodes without users
c                ~    U R                   =(       d+    U R                  5       [        R                  R                  ;   $ rZ   )r{  r[   rI   r   r^  )rw   s    r^   can_eliminate_user;Scheduler.dead_node_elimination.<locals>.can_eliminate_user	  s&    ||Tt}}!'':T:T'TTra   Fc              3  4   >#    U  H  nT" U5      v   M     g 7frZ   r   )r&  ur"  s     r^   r'  2Scheduler.dead_node_elimination.<locals>.<genexpr>	  s     #M9a$6q$9$99   zremoved dead buffer: %sTzremoved dead operation: %sN)rw   ry  r   r   )r1  rs  r   r  rW   r   r  r[   rI   r   r  rO  r  r  r^  r   r   rd   r   rR   r   ri  )
r\   updated_nodesrR   active_buffersr   can_eliminater  rW   r%  r"  s
            @r^   r  Scheduler.dead_node_elimination	  s    TZZ(DU #N'') ##M399#M M II7HGG++//?%)N * !% 5 5 77N<NM $$T* 		6H**..t}}? ,,22DyyD$4$44 $ 0 0 ; A A',=',!0AT]]_0TAu=((39 3- )8 (=12
 JJD  " =s   4/I'Ic                   ^^^^ [         [           " 5       m[        5       m/ mSUUUU4S jjmU H  nUR                  5        H  nUTU'   M
     M!     U H  nT" U5        M     T$ )z/
Ensure nodes is in topologically sorted order
c                   > U T;  af  TR                  U 5        [        U R                  S S9 H*  nUR                  T;  a  M  T" TUR                     5        M,     TR	                  U 5        g g )Nc                    U R                   $ rZ   rA  )ds    r^   r   DScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>	  s    affra   r  )rO  r  r   rd   r  )r  r!  r  rv   seenvisits     r^   r2  2Scheduler.topological_sort_schedule.<locals>.visit	  sa    }!!"6"6<LMCxx|3 ,sxx01	 N
 a  ra   )r  rL   r   r   )r   rL   r#  r  )r\   rs  rR   rd   r  rv   r1  r2  s       @@@@r^   r  #Scheduler.topological_sort_schedule	  sj     +,.59V*,	! 	! D--/%)T" 0  D$K ra   c                D  ^  [        5       n[        U[        [        [        [
        45      (       a/  UR                   H  nUR                  UR                  5        M      O[        S[        U5       S35      eU 4S jU 5       n[        [        U 4S jU 5       5      5      $ )Nz+get_unmet_dep_nodes is not implemented for .c              3  ^   >#    U  H"  nTR                   U   R                  5       v   M$     g 7frZ   )r   r_   rW  s     r^   r'  1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>	  s(     XZc))#.??AAZr  c              3  B   >#    U  H  nTR                   U   v   M     g 7frZ   rm  )r&  r  r\   s     r^   r'  r8  	  s     Q=at66q9=s   )r   r   r  r:  r  rA  r   rO  rd   RuntimeErrorrm   r   )r\   r  
unmet_depsr!  unmet_dep_opss   `    r^   _get_unmet_dep_nodesScheduler._get_unmet_dep_nodes	  s    &0l
)&"	
 
 //sxx( 0 =d5k]!L  YZXJQ=QQRRra   c                   / n[         R                  U R                  S5      n0 nU R                   HQ  nU R                  U5      n[	        U5      X$'   U H*  nUR                  U/ 5      nUR                  U5        XsU'   M,     MS     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  UR                  U
5        U
 H9  nUR                  U/ 5       H  nX+==   S-  ss'   M     UR                  U5        M;     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  M  U(       a   S5       eU$ s  sn	nf s  sn	nf )zE
Sort nodes by their topological order, return a list of node lists.
r   r   zTopological sort failed!)	r#  fromkeysrs  r>  rs   r2  r  r  rN  )r\   ru  rs  childrenrR   rR  r!  cr  r	  zero_deg_nodesrw   s               r^   r  !Scheduler._topological_sort_nodes	  s,    djj!,#%JJD,,T2Dd)EKLLb) !   ).@a!@LL(#$LLB/DK1$K 0		! $ -2KKMDMDAQ!VaMND n 444y A Es   E)EE,Ec                j   0 nU R                    Hw  n[        5       nUR                   HB  nU R                  UR                     R                  5       nUR                  U5        X1U   -  nMD     X1UR                  5       '   X2l        My     [        U R                   5       H  u  pbXbl
        Xbl        M     g)z
Populate each node.ancestors
N)rs  r   r   r   rd   r_   rO  r[   r   rr  r   r   )r\   name_to_ancestorsrR   r   r!  dep_node_nameru  s          r^   r  Scheduler.compute_ancestors
  s    
 9;JJD)3I.. $ 0 0 : K K Mm,}==	 / 2;dmmo.&N  %TZZ0KE"N"N 1ra   c                   U R                    H  n[        R                  (       d  M  [        U[        [
        45      (       a)  UR                  5       (       d  [        R                  S:w  a  M`  UR                  5        H?  n[        U[        5      (       a  UR                  5       (       a  M/  UR                  5         MA     M     g )Nhalide)rs  r   r  r   r  rA  rD   cpu_backendr|  r  r  )r\   rR   r  s      r^   r  Scheduler.merge_loops
  s    JJD44 d]4F$GHHKKMMf&8&8H&D)!%775;L;L;N;N!!# * ra   c                z   [        SSSS9   [        S5       H  n[        U5      n[        R	                  SUS-   U5        U R                  U5      n[        U5      n[        R	                  SUS-   UU5        XC:X  d  US:X  d  Ml  [        R	                  SUS-   5          O   UsS	S	S	5        $ ! , (       d  f       g	= f)
z2
Combine eligible nodes into FusedSchedulerNodes.
zScheduler.fused_nodesT)log_pt2_compile_eventlog_waitcounter
   z/===== attempting fusion (%d/10): %d nodes =====r   z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)r   r  rs   r  r  fuse_nodes_once)r\   rs  ro  old_lennew_lens        r^   rD  Scheduler.fuse_nodes1
  s     #4QU
 2Ye*  EE
 ,,U3e*  TE	 %A$$Eq1u ' ( /
 
 
s   A4B,B,,
B:c                    / nU R                    H:  nUR                  [        U[        5      (       a  UR	                  5       OU/5        M<     Xl         g)z1
Unpack GroupedSchedulerNode into regular nodes.
N)rs  rQ  r   rg  rE  )r\   	new_nodesrR   s      r^   r  Scheduler.process_grouped_nodesN
  sF     .0	JJD!+D2F!G!GdV  
ra   c                    [        U5      S:  d   eUS   R                  5       nX l        U R                  U5      n[	        SSSS9   UR                  U5      sSSS5        $ ! , (       d  f       g= f)k
Benchmark fused list of nodes and return the execution time
in milliseconds on randomly generated inputs.
r   benchmark_fused_nodesTcompile_time_autotune_time_us)rO  dynamo_compile_column_usN)rs   r   r  r  r   r[  )r\   rs  r  backends       r^   r[  Scheduler.benchmark_fused_nodesY
  sm     5zA~~q$$&$""6*#"&%D

 007
 
 
   A""
A0c                    [        U5      S:  d   eUS   R                  5       nX0l        U R                  U5      n[	        S5         UR                  X5      sSSS5        $ ! , (       d  f       g= f)rZ  r   r[  N)rs   r   r  r  r   generate_kernel_code_from_nodes)r\   rs  benchmark_kernelr  r^  s        r^   rb  )Scheduler.generate_kernel_code_from_nodesk
  s_     5zA~~q$$&$""6*12::5S 322r`  c                    X l         U R                  U5      n[        S5         UR                  U5      sSSS5        $ ! , (       d  f       g= f)rZ  r[  N)r  r  r   benchmark_codegened_module)r\   moduler  r^  s       r^   rf  $Scheduler.benchmark_codegened_moduley
  s=     %""6*1255f= 322s	   >
Ac                  ^       SS jn[        U R                  5       GH#  u  p#[        U[        5      (       d  M  [        UR                  [
        R                  5      (       d  MH  UR                  n[        R                  R                  (       d  UR                  5       u  pVO[        S UR                   5       5      n[        U[        R                  R
                  R                  5      (       a  UR                  R!                  U5        M  UR#                  5       nUR$                  n[        U[
        R&                  5      (       d   eUR$                  n	[        U	[
        R(                  5      (       d   eUR*                  U	l        U" XI5        U R-                  U	5      n
XR                  U'   XR.                  UR1                  5       '   XR2                  UR1                  5       '   0 m[4        R6                  " UR8                  R:                  UR<                  5       HA  nU R>                  RA                  URB                  S5      =n(       d  M2  URB                  TU'   MC     SU4S jjnU" U
R<                  5      U
l        U" U
R8                  R:                  5      U
R8                  l        [E        U
RG                  5       URG                  5       5       H2  u  pXRH                  UR1                  5       '   URJ                  Ul%        M4     URL                  U
l&        URN                  U
l'        URP                  U
l(        GM&     g)aP  
Finalize a backing choice for MultiTemplateBuffers which did not already have a
choice finalized through fusion. In the case of an extern choice, this will result
in replacing the SchedulerNode.

If a MultiTemplateBuffer did not have any fusion opportunities, finalizing a choice
will force completion of compilation and benchmarking.
c                   UR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   eUR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   e[        R
                  R                  U	 X1l        [        R
                  R                  U	 XQl	        [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   g rZ   )r[   r   r   rq  rI   r   rP  rd   
name_to_opoperation_namebuffersrE  remove
operations)	orig_noder  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          r^   replace_operation_bufferKScheduler.finalize_multi_template_buffers.<locals>.replace_operation_buffer
  s_    !) 1 1 3%..0MmS11jARTW6X6XXX'::<$779LlC00Z@PRU5V5VVV&&'89)M""#34&2#77??((3DGGOO""8,$,AGGOOD!4<AGG""=177%%++I6DGG%%h/'/AGGt$/7AGG|,ra   c              3     #    U  H<  n[        U[        R                  R                  R                  5      (       d  M8  Uv   M>     g 7frZ   )r   r  r  select_algorithmExternKernelCaller)r&  timings     r^   r'  <Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>
  s7      *C) & % @ @ S S  #F*Cs
   7A	ANc                .   > [        U4S jU  5       5      $ )Nc              3  D   >#    U  H  oR                  T5      v   M     g 7frZ   )r  )r&  r!  r  s     r^   r'  QScheduler.finalize_multi_template_buffers.<locals>.rename_deps.<locals>.<genexpr>
  s     %Sdsjj1A&B&Bds    r   )rR  r  s    r^   rename_deps>Scheduler.finalize_multi_template_buffers.<locals>.rename_deps
  s    %%Sd%SSSra   )rp  zir.MultiTemplateBufferr  zir.OperationBufferr   r   )rR  r   r   r   ))rr  rs  r   r  rR   r    MultiTemplateBufferr   test_configs%force_extern_kernel_in_multi_templateget_min_choicerq  choice_timingsr  r  TritonTemplateCallerBasefinalize_as_triton_calleroutput_noder   
StorageBoxOperationBufferro   r  r  r[   rm  rE  rF  r   r   r   r4  r2  rd   r$  r   r   rW   r   r   r   )r\   rv  ro  rR   
multi_nodemin_node_unfusedr  out_tensorboxout_storage
out_buffernew_scheduler_noder!  	real_namer  new_outold_outr  s                   @r^   r  )Scheduler.finalize_multi_template_buffers
  s   	8-	89K	8	86 !,GA$..:		2114 4 "YY
**PP*4*C*C*E'$a'+*4*C*C	($ $OO&&??  II778HI 0 < < >+00!+r}}====(--
!*b.@.@AAAA$.$5$5
!(@%)%?%?
%K" 2

15G!!$--/2;M''8 $& $??$$**D,C,CC %)$;$;$?$?$$OOyO69hh(3	T 9D&999"5 8C&22888"..4 ),&224d6F6F6H)$G <C$$W%5%5%78$+MMGM	) 04~~",/3~~",04"-E -ra   c                &    [        S U 5       5      $ )Nc              3    #    U  H  n[        UR                  S 5      =(       a_    UR                  SL=(       aJ    [        UR                  R                  S5      =(       a#    UR                  R                  R                  S:H  v   M     g7f)r   Nscatter_moderM  )r   rR   r   r  r  s     r^   r'  ,Scheduler._any_atomic_add.<locals>.<genexpr>
  sp      

 	 AFFF# 9d"9^49 ((L89 s   B	B)r)  r\   	node_lists     r^   _any_atomic_addScheduler._any_atomic_add
  s     

 
 
 	
ra   c           	     v  ^ ^^^^^^^^^^^^^^^ [        S TT4 5       5      n[        R                  (       d  U(       d  gTR                  5       (       a-  [	        TR                  5       [        R                  5      (       a*  TR                  5       (       d  TR                  5       (       a  gTR                  5       nUS   R                  5       mT(       d   eTR                  S:X  a  gTR                  5       n[        [        R                  " XE5      5      nT R                  U5      (       a  gSSKJm  [%        TT5      mUS   R                  5       mTc   eSUU4S jjm[&        R(                  R*                  R-                  5       m    SUU 4S jjnU(       Ga  [        S	 TT4 5       5      (       Ga  TR                  5       SLmT(       a  TR                  5       OTR                  5       m[	        T[        R.                  5      (       d   eTR0                  nTR3                  5       u  n	mT(       a  T R5                  U5      OT R5                  U5      u  mn
/ mSn[7        UR9                  5       [:        R<                  " S
5      S9 H  u  p[	        U[&        R(                  R                  R>                  5      (       d  M:  T(       d-  [A        US5      (       a  URB                  TRB                  :w  a  Mn  UTT-   :  a    OTUS
-  nU[        RD                  :  a    O9TRG                  U5         TRI                  U/U" U5      Q75        SSS5        M     [K        T5      S:X  a  gSUUUUUUUU 4S jjnU$ U" U5      mU" U5      mU" U5      mSUUUUUUU U4S jjnU$ ! , (       d  f       GM'  = f)o
If config.benchmark_fusion is False, always return True.
Otherwise, return True if fusion can brings speedup.
c              3     #    U  HD  nUR                  5       =(       a(    [        UR                  5       [        R                  5      v   MF     g 7frZ   )r  r   rd  r    r  r  s     r^   r'  .Scheduler.speedup_by_fusion.<locals>.<genexpr>
  sD       
 $ MMO J1..0"2H2HIJ#s   AATr   r  CompilationErrorNc           
     z  > [         R                  [        R                  5      (       a  XU-   :  aE  [         R	                  STR                  5       TR                  5       [        X-   U -  S 5      5        g [         R	                  STR                  5       TR                  5       [        XU-   -  S 5      5        g g )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  r  r  DEBUGr  r  r7   r8   )ms_fusedms1ms2r  r  s      r^   
log_fusion/Scheduler.speedup_by_fusion.<locals>.log_fusion(  s    &&w}}55Ci'$$S..0..0"syH&<S%AC	 $$W..0..0 Hc	$:3#?A	 6ra   c                   > TR                  U SS9n[        R                  " U5      nTR                  5       (       d  S nX24$ TR	                  SUS9n[        U[        5      (       d   eX24$ )NT)rc  triton_)kernel_namesource_code)rb  r   loaduse_process_pooltritonr   r   )rs  src_codemodfutasync_compiler\   s       r^   compile_kernel3Scheduler.speedup_by_fusion.<locals>.compile_kernel;  s     ;; < H ""8,C 1133
 : $**yh*W!#|4444:ra   c              3  D   #    U  H  oR                  5       S Lv   M     g 7frZ   rh  r  s     r^   r'  r  J  s      %
7E!!-~s    r   r  allowed_prologue_inpsFc            	     0  > [        S5      n S n0 nT HU  u  p4n Ub  UR                  5         TR                  U5         TR                  UT	5      u  pxXrU'   Xp:  a  Un UnS S S 5        MW     T" U TT5        U TT-   :  a  Ub  TR                  U5        UTl        gg! [         a\  n[        R	                  [
        R                  5      (       a)  [        R                  ST
(       d  SOS[        U5      5         S nAM  S nAff = f! , (       d  f       GM  = f)NinfzException in compiling %s: %sru  rw  TF)r  rv   r   r  r  r  r  r  r   swap_as_triton_callerrf  r  _choice_timings)min_ms_fusedms_fused_choicenew_timingschoicefuture	mod_fusedrX  r  pathr  epilogue_fusionfuture_choicesr  r  r  r  r\   s            r^   benchmark_when_ready9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready  s   $U|"& 1?-FI!!-"MMO $99&A)-)H)H%v* /7F+#2+3L.4O BA 2@0 <c239-/2M88I1<J. 1 % !%227==AA&,, ?2A
z #A
 !! BAs#   B"D
D&AC==D
D	c                 (  >^^^^^^ SSK Jn    TS   TS   TS   4 H  nUc  M  UR                  5         M     TR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gTR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gTR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gT" TTT5        [        S5      (       a[  TTT-   :  aR  TT4TR                  ;  a@  TR                  R                  TT45        [        S5      R                  UUUUUU4S	 j5        TTT-   :  $ ! U  a     gT	 a  nS
[        U5      ;   a   S nAge S nAff = f)Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $   > TT TTTTTT T-   -  S.$ )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior   )r  r  r  path1path2
path_fuseds   r^   r   KScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>  s&    053605365?8@3;sSy3I%ra   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsr  rv   rf  mathisinfr   r  rO  r   r  r   )r  r  rX  r  r  r  r  r  r  r  r  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2r  r\   r  s      @@@@@@r^   r  r    s   ; *!,)!,/2 
 ?JJL  "&!@!@)!,f"JC zz#CD$!%!@!@)!,f"JC zz#DE$+/+J+J/2F,(Hj zz(++CD$xc2 0>>$c	1"EN$2I2II//33UENC(7?? 
 $cCi//+ ! ' .#a&8#s<   E* AE* 5;E* 1;E* -A<E* *F2F7FFF)r  r  r  r  r  r  r   r   )rs  r9  r   z)tuple[Optional[LambdaFuture], ModuleType]r   )&r)  r   benchmark_fusionr  r   rd  r    TritonTemplateBufferr  r|  r   rm   r   rE  rF  r  triton.compiler.errorsr  r  r  r  r  AsyncCompiler  r  r  r[  r  r  operator
itemgetterr  r   r   max_epilogue_benchmarked_choicesr  r  rs   )r\   r  r  is_multi_templatenode_list_1node_list_2node_list_fusedr  r  r  r  triton_choicesr  unfused_timer  r  r  r  r  r  r  r  r  r  r  r  r  r  s   ```            @@@@@@@@@@@@@r^   speedup_by_fusionScheduler.speedup_by_fusion
  sd       
 U^ 
 

 &&/@ u668":Q:QRR!!!! oo'Q**,v ;;%oo'y{HI
 00;u% #..0!!!	 	" 55BBD	.	6	 	  %
8=u~%
 "
 "
 $557tCO # ''),,. 
 j"*@*@AAAA (66N..0FAs # **;7//< C TVNN(.$$&H,?,?,B)$ "&%//*<*<*U*UVV ((?@@44
8X8XX39,!#!F$K$KK55f="))6*TN?4S*TU >=3)8 >"a'%! %!N (' !/{ ; .{ ;&4_&E#@ @D ('o >=s   6N((
N8	c                <    U R                   UR                  5          $ )z0Look up the node in Scheduler name_to_fused_node)rm  ru  r  s     r^   r  Scheduler.get_fused_node  s    &&t':':'<==ra   c                ,  ^ ^^^ [        U5      m[        R                  [        R                  5      (       aB  [        R                  S5        T H'  n[        R                  SUR                  5       5        M)     0 m      SUU 4S jjm      SUUU 4S jjnT R                  U5       H  u  pEU" XE5        T R                  U5      nT R                  U5      nT R                  XE5      (       d  MG  T R                  XE5      (       a  M_  T R                  XE5      n[        U5      (       a  XdU4TU'   XdU4TU'   M  U(       d  M  T" XE5        M     [        5       nTR                  5        Hx  u  pn
X;   a  M  UR                  U5        T R                  U	5      U	L d   eT R                  U
5      U
L d   eU" 5       (       d  MX  T R                  X5      (       a  Mp  T" X5        Mz     [        TS S9nT R!                  U5      nT R#                  U5        U$ )	z
Combine eligible nodes into FusedSchedulerNodes.

This relies on two key functions to control the logic:
    - self.can_fuse(): checks if a fusion is legal
    - self.score_fusion(): assigns priority to a given fusion
zfuse_nodes_once, candidates:z  %sc                  > [         R                  SU R                  5       UR                  5       5        U R                  5       nUR                  5       U:X  d   eTR	                  U5      R                  X5      nTR                  U 5        TR                  U5        TR                  U5        TR                  R                  UR                  5        Vs0 s H  oDR                  5       U_M     sn5        U$ s  snf )Nzfusing %s with %s)r  r  r[   r   r  ro  rn  rO  rm  r  r|  )r  r  r  node3r  r  r\   s        r^   fuse_two_nodes1Scheduler.fuse_nodes_once.<locals>.fuse_two_nodes  s     0%..2BENNDTU%%'F##%///$$V,11%?Eu%u%OOE"##**.3oo.?@.?u$.?@ L As   C8c                D  > TR                  U 5      T;   d  TR                  U5      T;   a  TR                  TR                  U 5      TR                  TR                  U5      S 5      5      nUc   eUu  p4nTR                  US 5        TR                  US 5        TR                  U5      UL d   eTR                  U5      UL d   eU" 5       (       a  TR                  X5      (       a  M  T" XE5        TR                  U 5      T;   a  M  TR                  U5      T;   a  M  g g rZ   )r  r2  rN  will_fusion_create_cycle)	r  r  pending_fusion
is_speedup	node_key1	node_key2r  pending_fusionsr\   s	         r^   resolve_pending_fusions:Scheduler.fuse_nodes_once.<locals>.resolve_pending_fusions  s    ##E*o=&&u-@!0!4!4''.#''(;(;E(BDI" &1113A0
y##It4##It4**95BBB**95BBB!||t'D'DU'R'Ry4' ##E*o=&&u-@ra   c                    U R                   $ rZ   rb  r  s    r^   r   +Scheduler.fuse_nodes_once.<locals>.<lambda>\  s    !++ra   r  )r  rL   r  rL   r   rL   r  )r   r  r  r  r  r  r  get_possible_fusionsr  r  r  r  callabler   rO  r  r  rn  )r\   rs  rR   r  r  r  speedupseen_pair_speedup_fnis_speedup_fnr  r  r  r  r  s   `          @@@r^   rR  Scheduler.fuse_nodes_once  s    !'""7==11;<#  )=)=)?@ $  	
	$	->		 	 	5$	5->	5	5 	52 !55e<LE $E1''.E''.E}}U**43P3P4 4 00>G$$.5e-DOE*.5e-DOE*u,) =, @J|3B3I3I3K/Mi4 $$]3&&y1Y>>>&&y1Y>>>t'D'D( ( y4 4L {(=>..u5!!%(ra   c                   [        U R                  5      nSn[        U R                  5      n[        R	                  SU5        [        [        R                  U 5      5       GH(  u  pV[        R                  U5      n[        U5      S:  a  M,  Ub  X1:  a    OU R                  U5      (       d  [        R	                  SU5        Md  US-  n[        R                  S:  n[        US   R                  USUS9n[        R                  S	[        U5      U5        U H  n	UR                  U	5        M     UR                  U5        U R                   R#                  UR%                  5        V
s0 s H  oR'                  5       U_M     sn
5        GM+     [)        US
 S9U l        U R+                  U R                  5      U l        [        R                  SUU[        U R                  5      5        U R-                  U R                  5        gs  sn
f )z
Groups parallel nodes
r   z2ComboKernels: Generating with num_ck_nodes = %s...rO  Nz)ComboKernels: Not speeding up %d-th groupr   Tr  z0ComboKernels: Combining %d nodes for %d-th groupc                    U R                   $ rZ   rb  r  s    r^   r   5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>  s    q{{ra   r  zDGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodes)r   rs  rs   r   r  rr  r  r#  r  speedup_by_combo_kernelr   r  rQ   r  rn  rO  rm  r  r|  r[   r  r  rn  )r\   r  r  r  num_nodes_orignumr  r  rY  rR   r  s              r^   r  #Scheduler.create_combo_kernel_nodesa  s    !,TZZ		FU'&DDTJ
NC 3CCINI9~!'E,@//	::		EsKQJE$;;a?O4!&&*. /	K HHBI
 """4( "OOK(##**4?4I4I4KL4Kq{*4KL7
< K-BC
33DJJ?
R

O		
 	!!$**- Ms   (H
c                L    U H  nUR                  U R                  5        M      g rZ   )rn  rm  )r\   rs  rR   s      r^   rn  Scheduler.prune_redundant_deps  s     D%%d&=&=> ra   c                  ^ ^	^
 / m	[         [        [        [        4      " 5       m
SU	U
U 4S jjn[        R                  " [
        5      nU HE  nT R                  U5      (       a  M  UR                  5        H  nX5   R                  U5        M     MG     UR                  5        H  nU" U5        M     [        R                  (       ak  [        R                  " [
        5      nU H,  n[        USS5      nU(       d  M  Xx   R                  U5        M.     UR                  5        H  nU" U5        M     T R                  T	5      m	T	R                  T R                  SS9  [         R#                  S[%        T	5      5        T	$ )zN
Helper to find all legal fusion opportunities, sorted by self.score_fusion()
c                  > [        U 5       H  u  pU US-   US-   [        R                  -     H  nX#4nUT;   a  M  TR                  U5        TR	                  X#5      (       a  TR                  U5        MH  UR                  5       (       d  UR                  5       (       d  Mt  TR	                  X25      (       d  M  TR                  X245        M     M     g r  )rr  r   )max_fusion_buffer_group_pairwise_attemptsrO  r  r  r  r  )rs  node1_indexr  r  r  possible_fusionsr1  r\   s        r^   check_all_pairs7Scheduler.get_possible_fusions.<locals>.check_all_pairs  s    &/&6""!Ok'FF'GE
 !.Cd{ HHSM}}U22(//4++--1A1A1C1CJ J )//?! '7ra   r   NT)r  reversezfound %d possible fusionsrs  r  r   r   )r   r  rL   r@  r   r   unfusable_noderG  r  r   r   aggressive_fusionr   *get_possible_fusions_with_highest_priorityrs  score_fusion_keyr  r  rs   )r\   rs  r  buffer_names_groupingrR   r   node_groupinggroup_groupingr   r  r1  s   `        @@r^   r  Scheduler.get_possible_fusions  sV    % 13D DEFH	@ 	@( !, 7 7 =D""4((--/%*11$7 0 
 399;MM* < ##(44T:Ngt45")006  "0!6!6!8. "9  JJ
 	$"7"7F4c:J6KLra   c                  ^ ^^^^ [         [           " 5       mSUUUU U4S jjmUR                  5       R                  R	                  5       UR                  5       R                  R	                  5       -  mUR
                  R                  R	                  5       UR
                  R                  R	                  5       -  T-
  m[        UU 4S jT 5       5      nU(       a  [        X5      " S5        U$ )zf
Finds whether there's a path from node1 to node2 (or vice-versa)
caused indirectly by other fusions.
c                ,  > [        U [        5      (       a~  U T;  ax  TR                  U 5        U R                  5       R	                  T5      (       a  g[        TU R                  -  5      =(       d#    [        UU4S jU R                  T-
   5       5      $ g)NFc              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7frZ   r:  r&  r  
found_pathr\   s     r^   r'  IScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>  s,      H!DA #4#:#:1#=>>!D   "%)r   rA  rO  r}  issubsetr   r   r)  )rR   combined_ancestorscombined_namesr%  r\   visiteds    r^   r%  6Scheduler.will_fusion_create_cycle.<locals>.found_path  s    $ 233G8KD!++-667IJJ !   ?@ C H!%2D!DH E  ra   c              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7frZ   r:  r$  s     r^   r'  5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>  s&     WDVqJt66q9::DVr'  zwill create cyclerR   rL   r   r   )r   rA  r}  _dictr  r   r)  r  )r\   r  r  cycler)  r*  r%  r+  s   `   @@@@r^   r  "Scheduler.will_fusion_create_cycle  s     /02	 	2 %%'--224'')//4467 	
 OO!!&&(5??+@+@+E+E+GG WDVWWe#$78ra   c                  ^ ^ SSK Jm      SU 4S jjnU" U5      nU" U5      n[        U4S jU 5       5      n[        U4S jU 5       5      nUR                  U5      nSn	U H  n
 U	[	        U
S   5      -  n	M     T R                  X5      n[        R                  R                  R                  U	S	U-  5      (       a  g
g! [
         a       gf = f)a  
Return true if fusing the two nodes can potentially increasing peak memory.

The implementation is more like a heuristic since we don't really know if we are at peak
or not when trying to fuse these two nodes. The order of nodes may change later which makes the
peak memory estimation hard.

Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
1. find all buffers read by each node with a single user. These buffers are supposed to
   be reused if we don't fuses these 2 nodes
2. find the intersection of these buffers for the two node and sum the total buffer size.
   If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
   Note that the extra memory allocation is not necessarily causing peak memory increase.
   This is just a heuristic.

We return true only if the saving for fusion can not trade off the extra memory allocation.
r   )buffer_reuse_keyc                P  > / nU R                   R                   H  nTR                  R                  UR                  5      nU(       d  M1  [        UR                  5      S:X  d  ML  UR                  R                  5       (       d  Mm  UR                  UR                  5        M     U$ r  )
r   r   r   r2  rd   rs   rW   rR   has_tensor_outputr  )rR   r   r  r   r\   s       r^   _find_single_user_inputsKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputs  sw     F&&,,&&**277333syy>Q.3883M3M3O3OMM#((+ - Mra   c              3  4   >#    U  H  nT" U5      v   M     g 7frZ   r   r&  r   r4  s     r^   r'  <Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>       #S]c$4S$9$9]r'  c              3  4   >#    U  H  nT" U5      v   M     g 7frZ   r   r:  s     r^   r'  r;    r<  r'  r   rO  F    T)rR   rL   r   zlist[ir.Buffer])r  r4  r   intersectionr   rS  score_fusion_memoryrI   r   r  statically_known_gt)r\   r  r  r7  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr  	bw_savingr4  s   `           @r^   can_fusion_increase_peak_memory)Scheduler.can_fusion_increase_peak_memory  s    * 	6	#		 1707##S]#SS##S]#SS*77G$C3s1v;. % ,,U:	 77//iPP  s   (C
CCc                    [        [        UR                  UR                  -
  5      [        UR                  UR                  -
  5      5      nUS:  $ )a  
This function prevents fusion for nodes that can increase memory
footprint. This problem is more common in horizontal fusion, where nodes
that are far apart in the original order get fused, lengthening the live
intervals of tensors. This is very evident in models with activation
checkpointing, where the recomputed nodes from different checkpointed
regions get fused and significantly increase the memory footprint.

The current attempt is a quick, possibly hacky, heuristic to prevent the
fusion of nodes that are far away in the original order.

A better but difficult to implement heurisitic would be to use live
intervals of the buffers, find region of peak pressure in the original
program and prevent fusion that crosses that peak region. We might need
special care or good approximation in this implementation, as fusion of
node changes live intervals, and re-computing live intervals and peak
memory after each fusion can introduce large compilation overhead.
@   )rW  re  r   r   )r\   r  r  proximity_scores       r^   are_long_distant_nodes Scheduler.are_long_distant_nodes2  sE    * %//12%//12
 ##ra   c                (   0 nUR                   R                  5        Vs0 s H  oUR                  U_M     nnUR                   R                  5        Vs0 s H  oUR                  U_M     nnU GH  n[        R                  R                  U5      n	Xh   n
Xx   n[        U
[        5      (       a  [        U[        5      (       d  S[        U
5       S[        U5       3XH'   Ms  U
R                  5       UR                  5       :w  a(  SU
R                  5        SUR                  5        3XH'   M  [        U
R                  5      [        UR                  5      :w  a  SXH'   M  U
R                  5       nUR                  5       nX:w  a  SU SU 3XH'   GM!  U
R                  5       UR                  5       :X  a  SU
 SU 3XH'   GMP  Sn[        U	[        R                  5      (       d  SU	R                    3nS	U
 SU S
U 3XH'   GM     [#        U5      $ s  snf s  snf )ze
Try to decide reasons why fusion fail due to no shared memory even though
there are common buffers.
znot MemoryDep: z v.s. zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r   zLayout: zUnknown reason: z. )r   r  rd   rI   r   r,  r   r(   rm   	get_numelrH   rF  
get_offsetnormalize_with_stride_orderr    r1  ro   r   )r\   r  r  common_buf_namesreasonsr!  node1_name2depnode2_name2depr  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                  r^   decide_fusion_fail_reason#Scheduler.decide_fusion_fail_reasonM  s    383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX(H''$$X.C$.G$.Ggy11GY9W9W%d7m_F4=/J !   "g&7&7&99'(9(9(;'<F7CTCTCVBWX !  W\\*mGLL.II$/!((*G((*G! '9	y$Q! 3356689 '=WIVG9$U! Jc2#5#566'

|4
"7)6'"ZLI U )\ 7|c YXs   H
Hc                8   [         R                  (       a  [        S X4 5       5      (       a  gUR                  R	                  5       nUR                  R	                  5       nX4-  nU(       d  gUR                  R                  5        Vs0 s H  ofR                  U_M     nnUR                  R                  5        Vs0 s H  ofR                  U_M     nn/ n	U Hw  n
Xz   nX   nUR                  5       UR                  5       :X  d  M/  U	R                  [        R                  R                  R                  UR                  5       SS9UU45        My     [        U	5      S:X  a  g[        U	[         R"                  " S5      S9u  pn[%        U[&        5      (       a  [%        U[&        5      (       d  gUR(                  UR(                  :w  a4  UR+                  5       UR+                  5       :X  a  U R-                  U5      $ gUR/                  5       (       d  UR1                  X5        OZUR/                  5       (       d  UR1                  X5        O3[2        R5                  SUR7                  5       UR7                  5       5        U R9                  X5      $ s  snf s  snf )z
Right now just greedily reorder the loop of node1 to be compatible with node2,
but ideally we should have some heuristics to reorder the loop for node2
to be compatible with node1 if that's more efficient.
c              3  @   #    U  H  oR                  5       v   M     g 7frZ   )r  r  s     r^   r'  >Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>  s      8
 .1HHJJr{  r   r  r  z?Don't reorder loops since both nodes are reductions: %s v.s. %s)r   r  r)  r   buffer_namesr  rd   rT  r  rI   r   r  r  rR  rs   rW  r  r  r   r(   r  r  dep_size_hintr  r  r  r  r[   r@  )r\   r  r  node1_buffer_namesnode2_buffer_namescommon_buffer_namesr!  rW  rX  
candidatesbuffer_namerY  rZ  _numels                 r^   !shared_data_after_reordering_loop+Scheduler.shared_data_after_reordering_loop  sI    00C 8
!&8
 5
 5
 "..;;="..;;=0E"383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX 
.K$1G$1G3356689 !!((2273D3D3FQR2S / z?a $'zx7J7J17M#N '9--Z5S5Sw///
   "g&7&7&99))'22 !!##++G=##%%++G=##Q   ''55e YXs   J?Jc                    [        U[        [        45      =(       a6    UR                  5       (       + =(       a    [	        UR
                  5      (       + $ )z.
Is this node unfusable under any conditions.
)r   r:  r  r  rF   rR   r  s     r^   r  Scheduler.unfusable_node  sD    
 t79OPQ C$$&&C7		BB	
ra   c                   UR                  5       [        R                  R                  ::  a  gUR	                  5       nUR                  5       nSnXEU-  :  a	  U" S5        g[        S UR                  5        5       5      nU[        R                  R                  R                  R                  4:X  a	  U" S5        gS	S jnU" UR                  5       R                  5      (       a  UR                  5       (       d	  U" S5        gg)
zD
Heuristics to avoid benchmarking predictably slow prologue fusions
T皙?z@prologue fusion will not increase amount of bytes read in kernelFc              3     #    U  HT  nUR                   c  M  UR                   R                  5         H#  nUR                  S:X  d  M  UR                  v   M%     MV     g 7f)Ncall_function)rR   r  r]   r  )r&  r  rX  s      r^   r'  EScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>   sS      
.vv  VV'')tt&	 AHH * .s   A,AAz\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsc                F    U R                   S:*  =(       a    U R                  $ )NrO  )itemsizeis_floating_point)rY  s    r^   low_prec_fpGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp  s    >>Q&B5+B+BBra   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)rY  ztorch.dtyper   r   )r}  rI   r   invoke_quant_opsr  r	  r  r|  r  opsatenconstant_pad_nddefaultrj  rY  r  )	r\   prologue_noderv  r  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr  rw  s	            r^   (check_prologue_fusion_heuristics_fusable2Scheduler.check_prologue_fusion_heuristics_fusable  s     ,,.!''2J2JJ"88:
#::< &)"'AABRS  
",,.
 
 uyy~~55==??n 	C @@BHHII!>>@@h ra   c                h  ^ XL a  g[        X5      nUR                  5       (       a4  U R                  UR                  5       5      R	                  X5      (       a  g[        U[        5      (       d  [        U[        5      (       a	  U" S5        g[        U[        [        45      (       a  UR                  5       (       d	  U" S5        g[        U[        [        45      (       a  UR                  5       (       d	  U" S5        gUR                  5       UR                  -  (       a	  U" S5        gUR                  5       (       Gac  [        R                  (       d	  U" S5        gUR                  5       (       d  UR                  5       (       a	  U" S5        gUR                  5       n[        U[        R                   5      (       d	  U" S	5        gUR#                  5       n[%        S
 UR&                   5       5      U-
  nUR)                  5       U-  (       a	  U" S5        gUR+                  5       (       d  UR+                  5       (       a	  U" S5        gUR-                  5       mTSS  HK  nUR/                  5       nU H2  n	[1        U4S jU	R2                   5       5      (       a  M)  U" S5            g   MM     [        U[4        5      (       d  U/O2UR6                   V
s/ s H  oR                  5       (       d  M  U
PM     sn
n[9        U5      S:X  d   eUS   n[9        TS   R:                  5      S:X  aU  [9        TS   R:                  S   R2                  5      S:X  a,  TS   R:                  S   R2                  S   R<                  UL d	  U" S5        gU R?                  XU5      (       d  gUR                  5       (       aH  UR+                  5       (       d*  UR                  5       (       d  [        R@                  (       d	  U" S5        gUR)                  5       [B        RD                  RF                  -  (       d0  UR)                  5       [B        RD                  RF                  -  (       a	  U" S5        gUR                  5       nUR                  5       nX:w  a
  U" SX5        gAU RI                  X5      nU[        RJ                  :  a&  [        RL                  (       a  U RO                  X5      n[P        RS                  [T        RV                  5      (       a4  [P        RY                  SUR[                  5       UR[                  5       U5        [B        R\                  R_                  XX/5      (       d  gUR                  5       UR                  -  (       a_  U Ra                  X5      =(       aG    [B        R\                  Ra                  XX/5      =(       a     U R                  U5      Ra                  X5      $ [B        R\                  Rc                  XX/5      =(       a     U R                  U5      Rc                  X5      $ s  sn
f )zR
Determine if it is possible to combine node1 and node2 into a
single fused node.
FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc              3  @   #    U  H  oR                  5       v   M     g 7frZ   rt  )r&  inps     r^   r'  %Scheduler.can_fuse.<locals>.<genexpr>R  s     E_c<<>>_r{  z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr  c              3  @   >#    U  H  oR                   T;   v   M     g 7frZ   r   )r&  rw   prologue_nodess     r^   r'  r  b  s     QytyyN:ys   z7template prologue can only fuse nodes with a single user   r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)z%s and %s has %s shared data)2r  r  r  r   can_fuse_multi_outputs_templater   rg  r:  r  r}  r   r   prologue_fusionr  rj  r    r  get_allowed_prologue_inpsr   inputsr  r*  r|  r   r  rW   rA  r"  rs   r   rR   r  r  rI   r   no_fuse_buffer_namesr@  score_fusion_memory_thresholdr  rk  r  r  r  r  r  r[   choicesr  can_fuse_verticalcan_fuse_horizontal)r\   r  r  r  ri  r  unsupported_prologue_argsrR   	node_outsr   r  template_snodestemplate_snoder  device2shared_data_scorer  s                   @r^   r  Scheduler.can_fuse  s    >%4#3#3$

)
)%
7$8 e122j'7
 7
 ABu8:PQRR%%''()u8:PQRR%%''()$$&8,-))01!!##u'8'8':':HI779Hh(?(?@@HI$,$F$F$H! EX__EE'( &
 %%'*CCQR--//53Q3Q3S3SPQ"__.N&s+ ,,.	$CQsyyQQQUV$ % , "%);<< !&AAaA 
 '1,,,,Q/N N2&../14r*2215;;<A"2&..q177:??>Q[ @@sSS**,,!!##))12""$qww'C'CC""$qww'C'CC56!!#""$,f> 44UB D DD11 $ F Fu T))'--88##.  !	 yy!!$uHH$$&8 &&u4 MII//UVM$$V,>>uL 9900U M""6*>>uLMC Bs   X/3X/c                   UR                  5       n[        X5      n[        [        5      nUR                   Ht  nU R
                  R                  UR                  UR                  5      n[        U[        5      (       a  U R                  XaU5      (       a  Ma  XW   R                  U5        Mv     UR                  R                   H  n[        U[        5      (       d  M  UR                  U R
                  R                  UR                  UR                  5      5      n	U	(       d  Mb  U	 H,  n
U R                  X5      (       d  M  U	R!                  U
5        M.     M     [#        S [$        R&                  R)                  UR+                  5       5       5       5      nX-  (       a	  U" S5        gUR-                  5       nU HJ  nU R.                  U   R1                  5       nXR2                  U   R4                  -  (       d  MB  U" S5          g   g)z
Check if it is legal to fuse a consumer (node2) into a producer (node1).

We can fuse them if all the reads of node2 either match
corresponding writes in node1, or are written by nodes that can
be scheduled before the fusion of node1 and node2.
c              3  :   #    U  H  nUR                   v   M     g 7frZ   rA  rB  s     r^   r'  .Scheduler.can_fuse_vertical.<locals>.<genexpr>  s      $
U HHUrD  zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r  r  r   r   r   r  r2  rd   r   r*   fusable_weak_depr  r   r   r(   fusable_read_and_writern  r   rE  rF  r%  r   r}  r   r_   rm  r   )r\   r  r  node1_buf_namesr  remaining_deps_by_namer!  rd   cd	remainingr  remaining_depsnode1_op_namesr_  s                 r^   r  Scheduler.can_fuse_vertical  s     002%7B47H++C((,,SXXsxx@D#w''D,A,A#e,T,T"(//4	 , ##**Bb),,.22%%))"''277;I y#B222::!((, $ + $ $
 445K5R5R5TU$
 

 +
 +,224"D&&t,==?G 7 7 @ J JJJ>?	 # ra   c                P  ^ UR                   UR                  5       ;  a  gUR                  R                   Vs/ s H!  nUR                   UR                  :X  d  M  UPM#     nn[        U5      S:w  a  gUS   m[        T[        5      (       d   e[        TR                  [        R                  5      (       a  gU R                  UR                     nUR                  R                   Vs/ s H  owR                   U:X  d  M  UPM     nn[        U4S jU 5       5      $ s  snf s  snf )NFr   r   c              3  $  >#    U  H  n[        U[        5      =(       ai    [        UR                  [        R
                  5      (       + =(       a9    UR                  TR                  :H  =(       a    UR                  TR                  :H  v   M     g 7frZ   )r   r(   r   rE  r   TMPrF  )r&  r  rn  s     r^   r'  -Scheduler.fusable_weak_dep.<locals>.<genexpr>  sn      

 '	 tY' ('

DHH==(

ekk)( 		UZZ'( 's   BB)rd   r  r   r   r  rs   r   r(   r   rE  r   r  r4  r   r  )	r\   weak_depr  r  rn  mutating_writesr  r  relevant_readss	       `    r^   r  Scheduler.fusable_weak_dep  s    == 6 6 88 **11
1zzX222 1 	 

 1$"%++++u{{DHH55++H,A,AB	"..44
4T		Y8ND4 	 
  

 '
 
 	
#

s   DD*D#D#c                8   [        U[        5      (       Gab  U R                  R                  UR                  UR                  5      nX2R                  :w  dR  [        UR                  [        R                  5      (       d)  [        UR                  [        R                  5      (       a  g[        R                  (       a:  UR                  UR                  :w  a   UR                  5       nUR                  5       nUR                  UR                  :H  =(       aa    [        UR                  5      [        UR                  5      :  =(       a/    UR                  S [        UR                  5       UR                  :H  $ [        U[        5      (       a  U R                  R                  UR                  UR                  5      nU R                  R                  UR                  UR                  5      nUR                   UR                   :X  a  UR                   b  X4:X  a  ggr   )r   r(   r  r2  rd   r   rE  r   r  r   r  r  r  rs   rF  r)   rL  )r\   r  rn  	read_name
write_names        r^   r   Scheduler.fusable_read_and_write  sh   dI&&--11$))TYYGI ZZ'&tzz488<<&u{{DHH==00T]]enn5T ~~') 

ekk) ?		Nc%**o5?II/EJJ0EJJ>
 g&&--11$))TYYGI..225::uzzJJ		UZZ'JJ*+ra   c                    SnXR                   ;  a6   UR                  5       (       d  UR                  5       nX R                   U'   U$ U R                   U   nU$ ! [         a     N-f = fr  )r  has_unbacked_symbolsnumbytes_hintKeyError)r\   r!  ress      r^   rd  Scheduler.dep_size_hint*  sy    000//11++-C /2&&s+ 
 ,,S1C
   	s   %A 
A&%A&c                B  ^  [        UR                  R                  5      [        UR                  R                  5      -   n[        UR                  R                  5      [        UR                  R                  5      -   n[	        X45      S-  [        X45      :  a  X4:  a  UnUnUnUR                  R                  UR                  R                  -   Vs/ s H9  nXbR                  R                  ;   d  XbR                  R                  ;   d  M7  UPM;     nn[        U 4S jU 5       5      $ UR                  R                  UR                  R                  -  UR                  R                  UR                  R                  -  -  n[        U 4S jU 5       5      $ s  snf )zV
The first term in our fusion score that estimates number of saved
memory operations.
r  c              3  F   >#    U  H  nTR                  U5      v   M     g 7frZ   rd  rW  s     r^   r'  0Scheduler.score_fusion_memory.<locals>.<genexpr>Q  s     ?$3t))#..$r6  c              3  F   >#    U  H  nTR                  U5      v   M     g 7frZ   r  rW  s     r^   r'  r  V  s!     I6Hs4%%c**6Hr6  )rs   r   r   r   r3  rW  r  )	r\   r  r  node1_dep_lennode2_dep_lentmpr!  rR  common_memory_depss	   `        r^   r@  Scheduler.score_fusion_memory:  sa    E--334s5;L;L;S;S7TTE--334s5;L;L;S;S7TT },q03}3TT, !,,22U5F5F5M5MMMC++111S<M<M<T<T5T M   ?$???#//558I8I8P8PP##e&7&7&>&>>
 I6HIIIs   6FFc                   [        U5      S:X  a  U$ 0 nU H  u  p4UR                  5       UR                  5       :X  d   eUR                  5       n[        U R                  U5      R	                  X45      5      nXb;  a  X44/X&'   Mo  X&   R                  X445        M     [        UR                  5       [        R                  " S5      S9S   n[        U5      S:  d   eU$ )Nr   r  r   )
rs   r   r   r  get_fusion_pair_priorityr  r3  r  r  r  )r\   r  "possible_fusions_group_by_priorityr  r  r  fusion_pair_priority&possible_fusions_with_highest_prioritys           r^   r  4Scheduler.get_possible_fusions_with_highest_priorityX  s    
  A%##  	+ -LE##%)9)9);;;;%%'F#&  (AA%O$  $MNL2H 3HOON - 25.446H<O<OPQ<R2

2. 9:Q>>>55ra   c                D    [         R                  R                  " U /UQ76 $ )z
Shim for list.sort(key=...)
)rI   r  score_fusionr  s     r^   r  Scheduler.score_fusion_keyx  s     yy%%d3U33ra   c                    [        [        R                  R                  5       5      n[	        U R
                  5       H9  nUR                  XR                  5        UR                  UR                  5        M;     g)zW
Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
N)
r   rI   r   r  r1  rs  r:  r4  r  r   )r\   r8  rR   s      r^   r  Scheduler.compute_last_usage  sV    
 ))A)A)CDTZZ(D 35L5LM&&t7 )ra   c                   [        U R                  [        R                  R                  -
  [        R                  R
                  R                  -
  5       GH  nXR                  ;   a[  U R                  U   nUR                  5       (       a5  [        R                  R
                  R                  UR                  5        Ml  Mn  U[        R                  R                  ;   d  M  [        R                  R                  U   n[        U[        R                  5      (       a+  [        R                  R
                  R                  U5        M  [        U[        R                  5      (       a  GM  UR                   n[        U[        R"                  5      (       a  UR%                  5       (       d   e[        R                  R
                  R                  UR                   5        GM     U R                  R'                  5         g)z*Free any buffers that are no longer neededN)r  r  rI   r   r  r   freedr   r   codegen_freerR   rB  r   r    r1  GeneratorStater   r  is_input_bufferclear)r\   rd   r   r  storages        r^   free_buffersScheduler.free_buffers  sU   %%gg%%&gg""(()
D
 '''&&t,<<>>GG((55chh? "---gg**40c2#5#566GG((55c:R%6%677!hhG"7BMM::w?V?V?X?XXGG((55gllC)
, 	!!'')ra   c                    U R                   R                  5        H  nUR                  5         M     U R                  5         g rZ   )r  r   flushr  )r\   r^  s     r^   r  Scheduler.flush  s.    }}++-GMMO .ra   c                   [        U[        5      (       d   e[        S   S==   S-  ss'   [        R                  " [        SS95         UR                  5         UR                  5         S S S 5        UR                  n[        U[        R                  5      (       d   S[        U5      < 35       eUR                  [        R                  R                  5        U R                  5         g ! , (       d  f       N= f)NrH  extern_callsr   F)increase_kernel_countztype(node)=)r   r:  r   rI   set_kernel_handlerr%   r  r=  rR   r    r  rm   r  r   r   r  )r\   scheduler_noderR   s      r^   codegen_extern_callScheduler.codegen_extern_call  s    .*CDDDD
 	^,1,!!&u"EF002##% G ""$00B[T$ZM2BB0QWW))* GFs   	!C++
C9c                |   [        UR                  5      (       a  UR                  c
   U S35       e[        R                  R                  U5        [        UR                  5      nUc  [        SUR                   35      e[        5       (       d  UR                  S:X  aN  [        R                  R                  U5      =nR                  S:  a  [        U[        R                  " 5       5      e[        UR                  5      (       a.  UR                  S:X  d  [!        [        R                  " 5       5      eU" U 5      $ )Nz( should have been normalized in loweringzUnsupported device type: cuda   mps)rD   rm   rE  rI   r   add_device_infor$   r;  r   r  r  get_device_propertiesmajorr+   inspectcurrentframer,   )r\   r  device_schedulingdevice_propss       r^   create_backendScheduler.create_backend  s    &++&&&,,*B 	
h>?	
B 	
'5fkkB$!:6;;-HII||v%%*ZZ%E%Ef%MM\TTWXX(w7K7K7MNN$$V[[E-A#G$8$8$:;; &&ra   c                    Uc   eXR                   ;  a  U R                  U5      U R                   U'   U R                   U   $ rZ   )r  r  r  s     r^   r  Scheduler.get_backend  s@    !!!&$($7$7$?DMM&!}}V$$ra   c                  ^  SU 4S jjnUR                  5        VVs0 s H?  nUR                  c  M  UR                  R                  5         H  nU" U5      U4S _M     MA     nnn[        UR	                  5       5      nU(       aJ  [        U[        R                  " S5      S9u  pg[        R                  R                  R                  U5        g g s  snnf )Nc                   > U TR                   ;  aM  TR                   R                  [        U R                  R                  5       VV s0 s H  u  pX_M	     sn n5        TR                   W    $ s  sn nf rZ   )r  r  rr  r   rs  )r  ro  r\   s     r^   	get_order*Scheduler.enter_context.<locals>.get_order  s^    ,,,$$++i>V,W>VdaQT>V,WX''** -Xs   	A.
r   r  )r  ztorch.fx.Noder   r   )r|  rR   r  r   r  rW  r  r  rI   r   r   enter_context)r\   rR   r  r  rX  r  r  lasts   `       r^   r  Scheduler.enter_context  s    	+ ^^%
%vv  VV'') q\1t# * % 	 
 w||~&'x':':1'=>GAGG  ..t4 
s
   C1Cc                   ^  U R                   U   R                  n[        U4S jU 5       5      =(       a#    XR                  ;  =(       a    XR
                  ;  $ ! [         a     gf = f)NFc              3  n   >#    U  H*  oR                   =(       d    UR                  5       T;   v   M,     g 7frZ   )r{  r[   )r&  rw   fused_node_namess     r^   r'  AScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>  s)     VPUC3C CCPUr0  )r   rW   r  r  r  r4  )r\   rd   r  rW   s     ` r^   $can_buffer_be_removed_through_fusion.Scheduler.can_buffer_be_removed_through_fusion  sj    	$$T*00E VPUVV 41114333	
  		s   A 
A('A(c                  ^  [        U[        5      (       a  [        U 4S jUR                   5       5      $ UR	                  5       (       d  gUR
                  c  g[        UR
                  [        R                  5      (       a  g[        UR
                  [        R                  5      (       a  g[        UR
                  SS5      (       a  g[        UR
                  5      (       a  gg)zBReturn True if we should partition the inductor graph on this nodec              3  F   >#    U  H  nTR                  U5      v   M     g 7frZ   )should_partition)r&  r  r\   s     r^   r'  -Scheduler.should_partition.<locals>.<genexpr>  s     Mt,,U33r6  TNunbacked_bindingsF)r   rA  r)  r"  rD   rR   r    
DeviceCopyConditionalr   rC   r  s   ` r^   r  Scheduler.should_partition  s    d.//MMMM{{}}99dii//dii004991488!$)),,ra   c                    0 nUR                  [        R                  R                  5        U R                   H4  nUR
                  R                  5        H  u  p4UR                  X'   M     M6     U$ )zf
Return a mapping from name strings to the corresponding graph inputs or
base scheduler node outputs.
)r  rI   r   rB  rs  r   r  rR   )r\   r  rR   rd   scheduler_buffers        r^   get_name_to_nodesScheduler.get_name_to_nodes  sd     UWAGG001JJD*.*>*>*D*D*F&%5%:%:" +G  ra   c           	        [        [        R                  R                  5       VVs0 s H  u  p#X2_M	     nnn[        [        R                  R	                  5       5       VVs0 s H  u  p#X2_M	     nnn/ [        R                  l        [        U5       H  u  pgUR                  (       a  M  / nUR                   H#  nUR                  UR                  U5      5        M%     / n	UR                   H1  n
U	R                  UR                  U
R                  5       5      5        M3     [        R                  R
                  R                  [        UUU	UR                  5      5        M     gs  snnf s  snnf )zj
computes a mapping from partition input/output indices to graph input/output
indices for each partition.
N)rr  rI   r   rB  r  partition_mapsskip_cudagraphinput_nodesr  r2  output_nodesr[   r@   constant_names)r\   
signaturesidxrd   name_to_graph_input_indexname_to_graph_output_indexpartition_id	signatureinput_mappingoutput_mappingrR   s              r^   compute_graph_partition_maps&Scheduler.compute_graph_partition_maps  s;    (11E1E'F%
'F)#DI'F 	" %
 (11I1I1K'L&
'L)#DI'L 	# &
 "$'0'<#L''
 M!--$$%>%B%B4%HI .  N!..%%&@&D&DT]]_&UV / GG""))! !",,	! (=%
&
s   E'"E-c                  ^^	^
 S	U	4S jjm	    S
U	U
4S jjm
    SU	4S jjm    SS jn[        5       R                  " U
4S jU 5       6 nUR                  " U4S jUR                  5        5       6   U" U5      n[        5       nU HG  n[        R                  R
                  R                  U5      nUR                  UR                  5        MI     [        [        U[        R                  " S5      S95      $ )a9  
Returns all symbol inputs which are required to be in scope to successfully
perform codegen for this graph partition, including:
- free symbols used in partition nodes
- free symbols in partition input/node shapes, strides, and offsets. This is needed
  for recording cudagraphs for tensors with dynamic shapes.
c                  > [        5       nU R                  5       n[        U[        R                  5      (       a  UR                  [        UR                  5      [        UR                  5      -  [        UR                  5      -  5        [        U[        R                  5      (       a!  UR                  T" UR                  5      5        U$ Ub
   SU 35       eU$ )Nz*Expect layout to be None but found layout=)r   maybe_get_layoutr   r    Layoutr  r   rF  strideoffsetr  r  )rR   free_symbol_usesro   get_layout_symintss      r^   r  GScheduler.get_graph_partition_symbol_inputs.<locals>.get_layout_symintsR  s    9C**,F&")),, '' -"6==12"6==12
 fb&C&CDD$++,>v}},MN
 $# ~ @I~ $#ra   c                <  > [        U [        5      (       a+  [        5       R                  " U4S jU R                   5       6 $ U R
                  c   eU R
                  R                  5       nUR                  " U4S jU R
                  R                  5        5       6   U$ )z
Gets symbols used in node.
c              3  4   >#    U  H  nT" U5      v   M     g 7frZ   r   )r&  r  get_scheduler_node_symbol_usess     r^   r'  fScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_uses.<locals>.<genexpr>l  s     U4U;;r'  c              3  4   >#    U  H  nT" U5      v   M     g 7frZ   r   )r&  ir_noder  s     r^   r'  r$  q  s     U=T'$W--=Tr'  )	r   rA  r   r^  r"  rR   r  r  r   )rR   r  r  r#  s     r^   r#  SScheduler.get_graph_partition_symbol_inputs.<locals>.get_scheduler_node_symbol_usesd  s     $ 233!|))UU  99(((#yy==?##UTYY=R=R=TU $#ra   c                   > [        U [        R                  5      (       a
  [        5       $ [        U [        R                  5      (       a  T" U 5      $ [        S[        U 5       35      e)z?
Gets symbols used in input node shapes, strides, and offsets.
zUnsupported input node type: )r   r    r1  r   rR  r  rm   )rR   r  s    r^   get_input_node_symbolsKScheduler.get_graph_partition_symbol_inputs.<locals>.get_input_node_symbolsu  sU     $ 2 233!|#D")),,)$// *,I$t**VWWra   c                &    [        S U  5       5      $ )z
Filters a set of symbols that are required for codegen. Skip symbols
that are always internal to kernels, such as SymT.TMP, SymT.INDEX,
and SymT.R0_INDEX.
c              3     #    U  HV  n[        U[        R                  [        R                  [        R                  [        R
                  45      (       d  MR  Uv   MX     g 7frZ   )r   r   SIZEFLOATUNBACKED_INTUNBACKED_FLOAT)r&  r  s     r^   r'  VScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols.<locals>.<genexpr>  sI       A!		

))++	  s   AA 	A r   )symbolss    r^   filter_symbolsCScheduler.get_graph_partition_symbol_inputs.<locals>.filter_symbols  s         ra   c              3  4   >#    U  H  nT" U5      v   M     g 7frZ   r   )r&  rR   r#  s     r^   r'  >Scheduler.get_graph_partition_symbol_inputs.<locals>.<genexpr>  s     Iyt,T22yr'  c              3  8   >#    U  H  u  pT" U5      v   M     g 7frZ   r   )r&  r  rR   r)  s      r^   r'  r6    s     N:Mwq$T**:Ms   rd   r  )rR   z	ir.IRNoder   OrderedSet[sympy.Symbol])rR   rL   r   r8  )rR   z0Union[ir.IRNode, sympy.Expr, ir.TorchBindObject]r   r8  )r2  r8  r   r8  )r   r^  r  rI   r   r  simplifyr  r   r  r  
attrgetter)r\   	partitionr  r3  candidate_symbolsr  r  symplified_sr)  r  r#  s           @@@r^   !get_graph_partition_symbol_inputs+Scheduler.get_graph_partition_symbol_inputsE  s    	$$	$#	$%	$ 	$"	XB	X%	X 	-	%	, 7Al6H6HIyI7
 	N+:K:K:MN	
 ++<=(2"A77++44Q7LJJ|001 #
 &(*=*=f*EFGGra   c           	       ^ ^ / n[        [        R                  R                  5       5      nT R	                  5       nSUU 4S jjm[        [        U5      [        U5      5       GHE  u  pg[        5       nU H,  n	UR                  U	R                  R                  5       5        M.     UR                  U5      n
[        R                  R                  U V	s/ s H  oR                  PM     sn	5      n[        UR                  UR                   -   Vs/ s H(  nT" UR"                  5      (       a  M  UR"                  PM*     sn5      U-
  n[        U 4S jU 5       5      n[        5       nU H  n	UR                  U	R$                  5        M      U Vs0 s H  nX;   d  M
  XU   _M     nnU Vs0 s H  nX;   d  M
  XU;   a  SOS_M     nnU Vs/ s H  nX;   d  M
  X;  d  M  UPM     nnU
R                  U5        [        U 4S jU
 5       5      n
U
 Vs/ s H  nT" U5      (       a  M  X_   PM     nnU Vs/ s H$  o[        R                  R&                  ;   d  M"  UPM&     nnT R)                  UU5      n[+        UUUUUU5      nUR-                  U5        UR/                  XJ-
  5      nGMH     USSS2   $ s  sn	f s  snf s  snf s  snf s  snf s  snf s  snf )	z
Gets signature for each graph partition, including input nodes, output nodes, and
whether deallocating an input within graph partition.
c                B  > TR                   R                  U S5      nUc  g[        UR                  R                  [
        5      (       aU  [        UR                  [        R                  5      (       a+  TR                  R                  U S5      =n(       a  T" U5      $ gg)z
Checks if buf_name is NoneLayout. Buffers with NoneLayout is not allocated
so graph partition should not take it as inputs or outputs.
NFT)	r   r2  r   rR   ro   r3   r    MutationOutputr4  )r  r   r  is_none_layoutr\   s      r^   rC  ?Scheduler.get_graph_partition_signature.<locals>.is_none_layout  s    
 ""&&x6C{#((//:66chh(9(9::!%!8!8!<!<Xt!LLIL))44ra   c              3  Z   >#    U  H   nTR                   R                  X5      v   M"     g 7frZ   r4  r2  r&  rd   r\   s     r^   r'  :Scheduler.get_graph_partition_signature.<locals>.<genexpr>  ,      /1D ''++D771   (+TFc              3  Z   >#    U  H   nTR                   R                  X5      v   M"     g 7frZ   rF  rG  s     r^   r'  rH  	  rI  rJ  Nr  )r  r   r   r   )r   rI   r   r  r  r$  r1  r  r   r  r?  r   r\  r]  r   r   r   rd   r   r  r>  r0   r  r^  )r\   
partitionsskip_cudagraphsr  unmet_output_namesr  r;  r
  output_namesrR   returned_output_namesr   r  partition_input_namesr  rd   r  input_deallocationextra_output_namesr  r  symbol_inputspartition_signaturerC  s   `                      @r^   get_graph_partition_signature'Scheduler.get_graph_partition_signature  s     
'(@(@(BC--/	 	( *-Z (?";*
%I -7LL!##D$8$8$=$=$?@ " %1$=$=>P$Q! '11<<.78id!!i8K  "-!2!2[5G5G!G!GA-aff5 !G  " %/ /1/ %!
 5?L !$++DOO< "
 21D' )4((1   2"1D' F&::dE1  " 2"1D' ,0,L 1  " "(();<$. /1/ %! 21D%d+ #"1   "7!6!''BSBS:S!6   !BB;M #:"# 12!6!<!<":"w*
~ $B$e 9$
""sT   J5
J:
.J:
	J?	J?+	K8K	K	K	 K	K'K5!KKc                   UR                   R                  5        VVs0 s H'  u  p#U[        R                  R                  ;  d  M%  X#_M)     nnnUR
                  R                  5        VVs0 s H'  u  p%U[        R                  R                  ;  d  M%  X%_M)     nnnUR                   Vs/ s H3  nUR                  5       [        R                  R                  ;  d  M1  UPM5     nnUR                   Vs/ s H%  nU[        R                  R                  ;  d  M#  UPM'     n	n[        UR                  UUUUR                  U	5      $ s  snnf s  snnf s  snf s  snf )z
Updates the partition signature by removing buffers specified in
V.graph.removed_buffers. See [Note: Removed Graph Partition Arguments]
)r  r  rI   r   r  rR  r  maybe_get_namer  r0   rT  r
  )
r\   r  rd   r  r  r  rR  rR   r  r  s
             r^   .clean_removed_buffer_from_partition_signatures8Scheduler.clean_removed_buffer_from_partition_signatures-  sR    !* 5 5 ; ; =
 =177222 DL = 	 
 '99??A
A	177222 DIA 	 
 "..
.""$AGG,C,CC . 	 
 "00
0177222 0 	 

 '##$$
 	
)






s/   $EE,$EE+0EE5"EEc                  ^ ^^	^
^^^ SSK m	[        5       m/ m/ m[        U5       VVs0 s H  u  p#X2_M	     snnmSUU	UUU 4S jjm
SU
U4S jjnU H8  n[        UR                  R
                  5      TU'   TU   S:X  d  M0  T
" U5        M:     / nSnU[        U5      :  a  T(       d  T(       a  T(       a5  T	R                  T5      u  psUR                  U5        U" U5        T(       a  M5  T(       a5  T	R                  T5      u  psUR                  U5        U" U5        T(       a  M5  US-  nU[        U5      :  a  T(       a  M  T(       a  M  U[        U5      :  a  [        S5      eU$ s  snnf )ad  
Reorder nodes to minimize the number of partitions via a bfs
topological sort. This is the optimal reordering such that the
number of partitions cannot be reduced further. This may be
sub-optimal for other metrics such as peak memory. This does not
change relative orders of two cudagraphable nodes, nor the
relative order of two non_cudagraphable nodes.
r   Nc                   > TU    U 4nTR                  U 5      (       a  TR                  TU5        g TR                  TU5        g rZ   )r  heappush)rR   node_with_indexcudagraphable_nodesheapqnode_to_indexnon_cudagraphable_nodesr\   s     r^   insert_pending_nodesHScheduler.reorder_for_minimizing_partition.<locals>.insert_pending_nodesd  sA    ,T2D9O$$T**6H2ODra   c                   > U R                   R                   H.  nTU   S:  d   eTU==   S-  ss'   TU   S:X  d  M&  T" U5        M0     g )Nr   r   )r   
succ_nodes)rR   	succ_noderd  node_to_indegrees     r^   update_indegreeCScheduler.reorder_for_minimizing_partition.<locals>.update_indegreek  sO    !]]55	'	2Q666 +q0+#I.!3(3	 6ra   r   z
                Failed to schedule, while loop ran too long when
                reordering for minimizing the num of partitions
                rR   rL   r   r   )	ra  r#  rr  rs   r   
pred_nodesheappopr  r;  )r\   rs  r  rR   rj  schedule	num_itersr  r`  ra  rd  ri  rb  rc  s   `       @@@@@@r^    reorder_for_minimizing_partition*Scheduler.reorder_for_minimizing_partitionQ  s_    	9=CEGI4=e4DE4Dys4DE	E 	E	4 	4 D%()A)A%BT"%*$T* 
 -/	#e*$#':)--(?@%% *)
 &--(;<%% &%
 NI #e*$##':': s5z!  ] Fs   E(c           	     R   SSK JnJn  [        [        R
                  R                  5       5      nU" UU R                  U R                  [        [        R
                  R                  R                  5       5      U5      u  pVU R                  U5      nU" XvU5      u  pXS-  :  a  U$ U$ )z`
Reorder nodes to minimize the number of partitions if this only slightly
increase peak memory.
r   )estimate_peak_memoryprepare_planning_inforp  )r  rt  ru  r   rI   r   r  r   rm  rB  r  rq  )
r\   rs  rt  ru  r  default_peak_memoryname_to_freeable_input_bufreordered_nodesreorder_peak_memoryr  s
             r^   r  0Scheduler.maybe_reorder_for_minimizing_partition  s     	H"177#;#;#=>:O##qww++0023;
7 ??F!5"

 s!::""ra   c                0   / n/ n/ nSS jnU H  nU R                  U5      nU(       a,  [        UR                  5      S:X  a  UR                  U5        MG  U(       a   U" U5      (       a  UR                  U5        Mn  UR                  U5        M     X#-   U-   $ )z
Reorder a node if it should be partitioned and has simple dependency:
1. move a partitioned node to the front if it has no dependency
2. move a partitioned node to the back if it is only used by OutputNode
3. otherwise do not reorder
c                    U R                  5        H8  nUR                   H%  n[        UR                  [        5      (       a  M$      g   M:     gr   )r   rW   r   rR   r   )rR   r   r   s      r^   only_output_userPScheduler.reorder_for_partition_with_simple_dependency.<locals>.only_output_user  s<    '')99C%chh
;;$ % * ra   r   r/  )r  rs   r   r  )r\   rs  frontmiddlebackr}  rR   r  s           r^   r  6Scheduler.reorder_for_partition_with_simple_dependency  s     *,*,(*	 D#44T:C(?(?$@A$ET"!&6t&<&<D!d#  ~$$ra   c                x   / nSn/ n/ nU R                    HW  nU R                  U5      nU(       a)  X&:w  a$  UR                  U5        UR                  U5        / nUnUR                  U5        MY     U(       a"  UR                  U5        UR                  U5        U R                  XS9nU R	                  U5        X4$ )zz
Given a list of BaseSchedulerNodes, split into a list of
graph partitions and compute partition input/output signatures.
T)rL  rM  )rs  r  r  rV  r  )r\   rL  r
  cur_partitionrM  rR   r  r  s           r^   r  Scheduler.graph_partition  s     +-
')JJD#44T:!C!!-0&&~6 "-N  &  m,"">277! 8 

 	))*5%%ra   c                    [        S5         [        R                  R                  R                  (       a  U R                  5       OU R                  U R                  5       sS S S 5        $ ! , (       d  f       g = f)NzScheduler.codegen)r   r  r  r   r  _codegen_partitions_codegenrs  re   s    r^   r  Scheduler.codegen  sO    -. ??))99 ((*]]4::. /..s   AA++
A9c                J   SSK Jn  [        R                  R                  n[        U R                  5      n[        R                  R                  5          [        R                  R                  SSU 3UUS9  U R                  U5        [        [        R                  R                  U5      (       d   eU R                  U5      nU[        R                  R                  l        [        R                  R                  R                  5         [        R                  R                  R                  [        R                  R                  5      u  pgSSS5        [        R                  R                  R!                  WR"                  5        [        R                  R                  R%                  XR5        [        R                  R                  R&                  R)                  UR*                   Vs/ s H  oR-                  5       PM     sn5        g! , (       d  f       N= fs  snf )z,Codegen a partition given its inputs/outputsr   )SubgraphPythonWrapperCodegenT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesN)r  r  rI   r   r   rq  r  set_current_wrapper_codeinit_wrapper_coder  r   rZ  r  write_prefixgenerateis_inferencedefine_subgraph_launcher_fnvaluecodegen_partition_call	allocatedr  r  r[   )	r\   r;  r  r  r  graph_partition_idpartition_coder  rR   s	            r^   _codegen_partition_wrapper$Scheduler._codegen_partition_wrapper  sz    	Bgg22!$"?"?@WW--/GG%%  *+=*>?$7%.	 &  MM)$ agg224PQQQQKKIVI8AAGG  5GG  --/ ! 4 4 = =agg>R>R SN- 00 	
889M9MN	334FR	&&--)2)?)?@)?]]_)?@	
7 0/8 As   C;H.H 
Hc                l   U R                  5       u  p[        X5       H\  u  p4[        U5      S:  d   S[        U5       35       eUR                  (       a  U R	                  U5        MK  U R                  X45        M^     [        U R                  5      n[        R                  R                  R                  U5        US:  as  [        R                  R                  c   eU[        [        R                  R                  5      :X  d.   SU S[        [        R                  R                  5       35       egg)z
Split nodes into partitions and codegen each partition into separate functions.
This allows further applying different optimizations (e.g., cudagraph) to
each function.
r   z5Each partition must have at least one node but found r   NzExpect z partition maps but got )r  r$  rs   r
  r  r  rq  r  rI   r   r   set_all_partition_namesr	  )r\   rL  r  r;  r  num_partitionss         r^   r  Scheduler._codegen_partitions#  s
    "&!5!5!7
$'
$? Iy>Q& GIGWX& ''i(//	E %@ d;;<	44^D A77))555!S)?)?%@@ .))A#aggF\F\B]A^_@ ra   c                   [         R                  (       a  SS Kn[        R                  " 5       n[        5       n[        U5       H  nUR                  S:X  a0  UR                  UR                  R                  R                  :X  a    OTUR                  UR                  4nXd;  d"   SUR                   SUR                   S35       eUR                  U5        M     S U l        U GHf  n[        R!                  ["        R$                  5      (       a4   [        R'                  SUR)                  5       UR+                  5       5        U R/                  U5        UR1                  5       =n(       Ga  XR                  :w  d*  UR3                  5       (       d  UR5                  5       (       a  U R7                  5         XR                  :w  a  U R                  (       aL  [9        U R                  R:                  5      (       a(  [<        R>                  R@                  RC                  5         Xl        [9        UR:                  5      (       aG  URD                  c   S5       e[<        R>                  R@                  RG                  URD                  5        U RH                  RK                  URL                  5        UR5                  5       (       aN  URO                  [Q        URS                  5       5      5      u  pnU RU                  U5      RW                  XU	5        GO1UR3                  5       (       a-  [X        RZ                  " [\        U5      nU R_                  U5        OURa                  5       (       aw  [X        RZ                  " [b        U5      nU RU                  U5      nS	S
K2J3n  S	SK4J5n  [m        XU45      (       a  UnO[o        S[;        U 5      < 35      eURq                  U5        Oc[m        U[r        [t        45      (       a!  U RU                  U5      Rw                  U5        O'[m        U[x        5      (       d   eUR{                  5         [         R|                  R~                  (       a  U RU                  U5      R                  5         U R                  RK                  UR                  5       5        U R                  RK                  UR                  5       5        [m        U[x        5      (       a  GM  UR1                  5       nUc  GM  UR:                  S:w  d  GM/  U RU                  U5      R                  5       (       d  GMV  U R7                  5         GMi     U R                  (       aL  [9        U R                  R:                  5      (       a(  [<        R>                  R@                  RC                  5         U R7                  5         g ! [,         a(    [        R'                  SUR)                  5       5         GNf = f)Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0zdevice should have an indexr   )CUDACombinedSchedulingr  ztype(self)=r  )Fr   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r1  rd   filename_dynamoconvert_frame__file__linenorO  r  r   r  r  r  r  r[   ra  r   r  r   r  r  r  r<   rm   rI   r   r   codegen_device_guard_exitrE  codegen_device_guard_enterr  r  r   rx  r   r|  r  codegen_templater  r  r:  r  r  r   codegen.cuda_combined_schedulingr  r  r  r   rV  codegen_combo_kernelrA  r  codegen_noder  r=  r  debug_sync_kernelcodegen_syncrV  r  r  r}  ready_to_flush)r\   rs  r  stackr1  framer  rR   r  ru  rv  rw  backend_r  r  r^  s                   r^   r  Scheduler._codegen?  sS   44.++-E7A|D!% JJ"22%--*E*E*N*NN~~u||4 ,U^^,<Aell^ LJ J
  ) #D..
IIO224 t$**v*111~~''''))JJL000**/@++000 0 ,,FFH*0'(55%||7V9VV7,,GGU%%,,T__=!!484W4W)*51   (99!X !!{{#<dC((.""{{#=tD++F3T8h9O(PQQ&G(KDJ=)9::,,T2D#5}"EFF  (55d;!$(>????}}..  (557''..t/D/D/FG%%,,T-E-E-GHd$:;;*&v-((0??AAJJLW Z #4T5H5H5M5M#N#N GG  ::<

U ! IIPs   3V==.W/.W/c                    US   R                  5       nU [        R                  l        X l        Uc   eU R                  U5      nUR                  U5      $ )rZ  r   )r   rI   r   rQ   r  r  benchmark_combo_kernel)r\   r  r  r^  s       r^   r   Scheduler.benchmark_combo_kernel  sU     1((* $!!!""6*--i88ra   c                2   [         R                  (       d  gUnUS   R                  5       nUb  UR                  S:X  a  gSSKJn  S/ pe[        U5       H  u  pxUR                  5       n	U R                  U	5      (       a  [        R                  S5         U R                  U	5      u  p[        R                  " U
5      (       a  [        R                  SU5          g	 XZ-  nUR                  U5        M      U R                  U5      u  pnX-
  S:  =(       d    US:  n[        R!                  ["        R$                  5      (       aS  X]:  d  U(       a$  [        R                  S['        X]-  S 5      5        O#[        R                  S[)        X]-  S 5      5        X-
  U:  =(       d    U$ ! U a0  nS
[        U5      ;   a  [        R                  S5         SnA  ge SnAff = f! U a/  nS
[        U5      ;   a  [        R                  S5         SnAge SnAff = f)r  Tr   Nr  r  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr  zCComboKernel benchmark: return True because of loop-carried variableg333333?z/can fuse (benchmark): fusing causes %sx speedupr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r  r   rm   r  r  rr  r|  r  r  r  r[  r  r  r   r  r  r  r  r7   r8   )r\   rs  subkernel_nodesr  r  r  
path1_listro  r  r  msr  rX  r  	ms2_clone_path2_listsmall_kernels                    r^   r
  !Scheduler.speedup_by_combo_kernel  s   
 ,, #..0 >V[[E1;rZ!/2HA)I ##I..  R55i@::b>>$$U ! " ICd#7 3:
	*.*E*Eo*V'CK ,9c	""7==11yL  E#)C2
   I	#0
 $44M $ *c!f4$$]     	&#a&0  Y 	s=   AF(6G! (G.$GGG!H'$HHHc                r    U R                   U   nUR                  c   eUR                  R                  5       $ rZ   )r   rR   
get_layout)r\   r  r   s      r^   get_buffer_layoutScheduler.get_buffer_layout  s5    x(xx###xx""$$ra   c                   U R                    H  nUR                  5       (       d  M  UR                  R                   H  n[        R
                  R                  R                  UR                  5      nU(       d  M?  [        U5      S:X  d  MP  [        UR                  [        [        45      (       a  Mw  UR                  5       / :X  d  M  [        R
                  R                  R!                  UR                  5        M     M     g r  )rs  rD   r   r   rI   r   rP  r2  rd   r/   r   ro   r3   r2   r  zero_dim_cpu_tensor_listrO  )r\   rR   r  r  s       r^   r  $Scheduler.update_zero_dim_cpu_tensor  s    JJD{{}} ,,22DWW3377		BF+F3u< *"MMJ8I+J! ! #OO-388<<TYYG 3 ra   )__dep_size_hint_cacher  rV  r  r  r  r  r  r4  r  r   r   rm  r  rs  r  r  r  )rs  zlist[ir.Operation]r   r   )r   z!dict[str, SchedulerDonatedBuffer]r   )r  r   r   r   r   )r  r   r   r   )rR   rz  r   rL   r7  )r  rL   r   r  )r   r8  rs  r9  r   tuple[float, str]rs  r9  rc  r   r   r   )rg  r   r  r  r   r  )r  r9  r   r   )r  rL   r  rL   r   zUnion[bool, Callable[[], bool]])rR   rL   r   rL   rZ   )r  zOptional[int]r   r   r  )rs  r  r   1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]r  rL   r  rL   r   r   )r  rL   r  rL   rU  z"Union[tuple[str], OrderedSet[str]]r   r   r  rL   r  rL   r   r   r/  )r~  rL   rv  rL   r  r  r   r   )r  r*   r  rL   r  rL   r   r   )r  r'   rn  r(   r   r   )r!  r'   r   r   )r  r  r   r  )rs  z+tuple[BaseSchedulerNode, BaseSchedulerNode]r   r   )r  r:  r   r   )r  r  r   BaseScheduling)r  r   r   r  rl  )rd   r   r  r  r   r   )r   ;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]])r  list[GraphPartitionSignature]r   r   )r;  PartitionTyper  r  r   r8  )rL  zlist[PartitionType]rM  z
list[bool]r   r  )r  r0   r   r0   )r   z9tuple[list[PartitionType], list[GraphPartitionSignature]])r;  r  r  r0   r   r   r  r9  r   z(tuple[float, float, list[Optional[str]]])rs  r  r   r   )r  r   r   z	ir.Layout)Orn   r   r   r   r  r   r   r  r  propertyr  setterr  r  r  r  r  r  r  r>  r  r  r  rD  r  r[  rb  rf  r  r  r  r  rR  r  rn  r  r  rI  rN  r^  rk  r  r  r  r  r  r  rd  r@  r  r  r  r  r  r  r  r  r  r  r  r  r  r>  rV  rZ  rq  r  r  r  r  r  r  r  r  r
  r  r  r   r  r  s   @r^   rP   rP     sb   
 *)z
x	# & & ( (7#,"HVSp(#T,	 6S(4#&$6:	808	8$T0TDHT	T
> 
>*6
>	
>g@R
z(&z(/@z(	(z(x>h,h	 hT..`?4 ,4 	:4 l,&,/@,	,\7&7/@7	7r$&$/@$	$6< < !< =	<
 
<|I6&I6/@I6	I6V
9(9 )9 	9
 
9vQMf3&3/@3	3j

(9
BS
	
J D J&J/@J	J<6 Q6	:6@4@4	4	8*4
'*%5$

+:
	
2	D '1' 
'ReH eH QeH 
"	eHN - @J 	& B"
0"
	 "
H?&? 
!?B& 
!>%,%	 %@&	B&@(
 (
 +(
 
	(
T8hT949	19I5V%
H Hra   c                  F  ^  \ rS rSrSU 4S jjrSS jrSS jr      SS jr      SS jr      SS jr	      SS jr
    SS	 jr        SS
 jr      SS jrSS jrSS jrSS jrSS jr    SS jrS S jr      S!S jr    S"S jrSrU =r$ )#r  i  c                .   > [         TU ]  5         Xl        g rZ   )r  r   rQ   )r\   rQ   r  s     r^   r   BaseScheduling.__init__  s    "ra   c                \    U R                   (       a  U R                   R                  5         g g rZ   )rQ   r  re   s    r^   free_buffers_in_scheduler(BaseScheduling.free_buffers_in_scheduler  s    >>NN'') ra   c                    [        5       $ )z0Return a set of .codegen.common.BackendFeature()r   r  s     r^   get_backend_features#BaseScheduling.get_backend_features   s
    |ra   c                    [         e)z?
Check whether node1 and node2 can be vertically fused or not.
r  r  s      r^   r   BaseScheduling.can_fuse_vertical$  
     "!ra   c                    [         e)zA
Check whether node1 and node2 can be horizontally fused or not.
r  r  s      r^   r  "BaseScheduling.can_fuse_horizontal,  r  ra   c                    g)aE  
A Multi-Output Template (referenced in #144012) is a template node
with MultiOutputLayout, and its output buffers are instances of MultiOutput.
In this context, we verify whether node1 represents the Multi-Output Template
and node2 corresponds to one of its outputs. If so, we further check if
backend supports this fusion.
Fr   r  s      r^   r  .BaseScheduling.can_fuse_multi_outputs_template4  s     ra   c                    UR                  5       (       d  UR                  5       (       a  [        R                  X5      $ [        R                  X5      $ )z
Fuse two nodes
)r  r  ro  rA  r  s      r^   ro  BaseScheduling.fuse@  sC     !1!1!3!3-225@@%**588ra   c                    [         e)zK
Process the iteration sizes in case a transformation needs to be applied.
r  )r\   r  s     r^   r  BaseScheduling.group_fnK  r  ra   c                    [         e)z
Given a template node, generate a kernel.

This function is only available for triton now. If the third-party backend behaves as a sub-class
of TritonScheduling, it can override it or reuse it.
r  )r\   rv  epilogue_nodesr  s       r^   r  BaseScheduling.codegen_templateS  s
     "!ra   c                    [         ez4
Generate a kernel given a list of pre-fused nodes.
r  )r\   rs  rc  s      r^   rb  .BaseScheduling.generate_kernel_code_from_nodesa  r  ra   c                    [         er  r  r  s     r^   r  BaseScheduling.codegen_nodei  
     "!ra   c                    [         e)zd
Generate synchronization code for the kernel. This method depends on the hardware characteristics.
r  re   s    r^   r  BaseScheduling.codegen_synco  r  ra   c                    g)z}
Check whether the backend is requesting the scheduler to flush the generated kernel.
If not supported, please return False.
Fr   re   s    r^   r  BaseScheduling.ready_to_flushu  s    
 ra   c                    [         e)zM
Flush the generated kernel and python wrapper code to the source code file.
r  re   s    r^   r  BaseScheduling.flush|  r  ra   c                    [         e)rZ  r  r  s     r^   r[  $BaseScheduling.benchmark_fused_nodes  
     "!ra   c                    [         e)zi
Benchmark a compiled module and return the execution time
in milliseconds on randomly generated inputs.
r  )r\   rg  s     r^   rf  )BaseScheduling.benchmark_codegened_module  s
    
 "!ra   c                    g)zt
Return an unsigned integer which represents the priority of this fusion pair.
The smaller is with higher priority.
r   r   r  s      r^   r  'BaseScheduling.get_fusion_pair_priority  s     ra   c                    [         e)z
Benchmark the list of nodes to combine and return the execution time
and memory copy time in milliseconds on randomly generated inputs.
r  r  s     r^   r  %BaseScheduling.benchmark_combo_kernel  r  ra   r"  )rQ   zOptional[Scheduler]r   )r  r  r   zOrderedSet[BackendFeature]r  r  )r  rT  r   z"tuple[tuple[sympy.Expr, ...], ...])rv  rL   r  r9  r  r9  r   zOptional[str]r  )rR   z(Union[FusedSchedulerNode, SchedulerNode]r   r   r   r  )rg  r   r   r  r  r  )rn   r   r   r   r   r  r  r  r  r  ro  r  r  rb  r  r  r  r  r[  rf  r  r  r   r  r  s   @r^   r  r    sH   #*"&"/@"	""&"/@"	"
&
/@
	
	9&	9/@	9		9"3"	+""(" 4" 4	"
 
""0"DH"	"""""0"	""&/@	"4"	1" "ra   r  )r  r   r   r   )rR   rL   rm  r  r   zdict[str, SchedulerBuffer]r   r   )rY  /Union[FusedSchedulerNode, GroupedSchedulerNode]r   r   )rY  r  rQ   rP   r"  r  r   r   )r   )rm  zlist[list[int]]r  rU  rt  ztuple[int, ...]r   z	list[int])
__future__r   r@  r   rp  r  rE  r  r  r  r  r  r  r  r  r   r   r   r   r   r	   r
   r   r   collections.abcr   typesr   r2  r  torch._inductor.async_compiletorch._dynamo.utilsr   r   torch._inductor.codecacher   r   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._ordered_setr   torch.utils._sympy.symbolr   r   r   torch.utils._tritonr   r   r   r   r   r    r!   analyze_preserves_zero_maskr"   codegen.commonr#   r$   r%   comm_analysisr&   r'   r(   r)   r*   excr+   r,   fx_utilsr-   r.   r/   r0   r1   r2   r3   	loop_bodyr4   r  r5   r6   runtime.runtime_utilsr7   r8   r  r9   utilsr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   virtualizedrI   	getLoggerrn   r   _logginggetArtifactLoggerr  r  r   r  	dataclassrN   r   rL   r  rq   r   rl  r:  r  r  r_  rh  rA  r  rg  rw  ry  r  r  rP   r  r   ra   r^   <module>r!     s   "         	     , R R R (    $ 6 ? M > / O O * 6 6 D M M ; : : 2 2    J 7 &    "  !^^--hA
NN44XO () h8 h8 h8V 4_ 4 4u
1 u
1p
 
,  &K
&K4&K ,&K 
	&KRW 1 W"5. 5~+% ~+B@	$@ $ 
	,b** b*J~:!3 ~:B
Q, Qn %'+#++ "+ 	+\ 
 
 
> %??, u,H u,HpYK" K"ra   