
    7hi                    	   % S SK r S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	J
r
  S SKJr  S SKJrJrJrJrJr  S SKrS SKrS SKrS SKJr  S SKJrJr  S SKJr  S SKJrJrJ r   S S	K!J"r"J#r#J$r$  S
SK%J&r&  SSK'J(r(J)r)J*r*J+r+J,r,  SSK-J.r.  SSK/J0r0J1r1J2r2J3r3J4r4J5r5J6r6  SSK7J8r8J9r9J:r:J;r;J<r<J=r=J>r>J?r?J@r@JArAJBrBJCrCJDrD  SSKEJFrFJGrGJHrHJIrI  SSKJJKrKJLrLJMrMJNrNJOrOJPrPJQrQJRrRJSrSJTrTJUrUJVrV  SSKWJXrXJYrYJZrZJ[r[J\r\J]r]J^r^J_r_J`r`JaraJbrbJcrcJdrdJereJfrf  \R                  S:H  rh\R                  S 5       rj\R                  R                  \mS5      rn\" / SQ5      roSSSSSSSS S!S!S".
rp\" / S#Q5      rqS$S%S&S'S(S)S*S+S,S-S..
rrS/S0S1.rs\R                  \R                  /rv\R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  \R                  /r\\GR                     \S2'   \R                  \R                  \R                  \R                  \R                  /r\\GR                     \S3'   S4 rS5 r  SxS6\\GR                     4S7 jjrS8 rS9\RS:\GR                  S;\S<\GR                  S=\GR                  S>\L4S? jrS@\\\N4   SA\SB\SC\GR                  SD\\\4   4
SE jrSF\RSG\SH\4SI jr\GR                   S6\GR                  SJ\GR                  4SK j5       r\GR                   S6\GR                  SJ\GR                  SL\4SM j5       r\GR                    SyS6\GR                  SJ\GR                  SL\\   4SN jj5       r\GR(                   " SO SP5      5       r " SQ SR\45      r " SS ST5      rSU r " SV SW\U5      r\GR5                  SX5         " SY SZ\5      r\GR5                  S[5        \GR9                  5          " S\ S]\5      r " S^ S_\S5      r " S` Sa\5      r " Sb Sc\5      rSd\.S>\\\GR                     \y4   4Se jr " Sf Sg5      r " Sh Si\5      r " Sj Sk\5      r " Sl Sm\5      r " Sn So\15      r " Sp Sq5      r " Sr Ss5      r\GR(                   " St Su5      5       r\GR(                   " Sv Sw5      5       rg)z    N)Sequence)Enum)AnyCallablecastOptionalUnion)dependencies)is_float_dtypeis_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)free_symbol_is_typesymbol_is_typeSymT   )counters   )configcpp_buildercpu_vec_isairmetrics)LoopBody)BaseSchedulerNodeBaseSchedulingExternKernelSchedulerNodeForeachKernelSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfget_bounds_index_exprget_fused_kernel_namehas_free_symbolsis_multi_outputs_templateis_welford_reductionparallel_num_threadsPlaceholder'set_kernel_post_grad_provenance_tracingsympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_subs)NullKernelHandleropsOpsValueV   )BackendFeatureBracesBufferCSECSEVariableDataTypePropagationDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferKernel
KernelArgsOpOverridesOptimizationContext)_get_dtype_from_loopbodies_get_loop_bodycexprcexpr_indexcodegen_randCppCSEVariableDTYPE_TO_CPPget_promote_dtype
INDEX_TYPELocalBufferContextmay_unify_binary_op_mask_typepromote_args(template_fusion_with_epilogues_supportedunify_mask_base_typevalue_to_cppwin32c                       [         (       a  S$ S$ )Nz__declspec(dllexport) _IS_WINDOWS     U/var/www/fran/franai/venv/lib/python3.13/site-packages/torch/_inductor/codegen/cpp.pyget_export_declarationrY   Y   s    &1k"9r9rW   schedule)+*^||minmaxr[   r\   r]   r_   r`   argminargmaxr^   welford)
sumprodxor_sumr_   r`   ra   rb   anywelford_reducewelford_combine)
r`   r_   rd   re   rf   rh   ri   ra   rb   rg   z
at::Tensorlongdoubleboolzstd::stringzc10::ScalarTypezat::MemoryFormatz
at::Layoutz
at::Devicez
at::Scalar)
Tensorintfloatrl   str
ScalarTypeMemoryFormatLayoutDevicenumberzstd::vectorzstd::optional)Listr   VECTORIZABLE_DTYPESMASKED_VECTORIZABLE_DTYPESc                    U[         ;   a  [        R                  nU S;   a  gU S:X  a  gU S;   a  [        U   nU[        R                  :X  a  U S;   a  [        [        R
                     n[        U5      (       a  SU S3OS	U S
3n[        U5      (       a  S	U S3OS	U S3nU S;   a  UOUnU S;   a  U$ SU SU S3$ [        U 5      (       a  S[        U    S3$ [        U 5      e)N)rf   rd   rg   r   re   r5   )r`   rb   r_   ra   ra   rb   -std::numeric_limits<>::infinity()std::numeric_limits<>::min()>::max())r`   rb   )r`   r_   IndexValue<z>{0, }Welford<>())	DTYPE_LOWP_FPtorchfloat32rH   rl   ro   r   r)   AssertionError)reduction_typedtypecdtypemin_varmax_varinit_vars         rX   reduction_initr      s$    22;;e$EJJ>5I#I!%++.F e$$ $F8=9'xx8 	 e$$ #6(-8'xx8 	
 -0AA7w / 	
 vhfXJb9	

 N++,u-.c22

((rW   c                     [         [        U      n[        U 5      (       a  SU S3$ U S;   a1  U[        R                  :X  a  [         [        R
                     nSU S3$ U$ )Nr   >rz   r   )rH   r<   r)   r   rl   ro   )r   r   scalar_types      rX   reduction_acc_typer      sd    9%@AKN+++a((--EJJ&u{{3K[M++rW   indexc           	      z   U[         R                  :H  nU S:X  a  U(       a  SOSnU SU SU 3$ U S:X  a  U SU 3$ U S:X  a  U SU 3$ U S	:X  a  U S
U 3$ U S;   a  U  SU SU S3$ U S:X  a	  SU SU S3$ U S:X  a8  [        U[        5      (       a  Uu  pxn	O[	        X5      u  pxn	SU SU SU SU	 S3	$ U S;   ax  [        US5      (       aK  UR                  [         R                  :X  a-  UR                  (       d  Ub  U  SU SU SU S3$ U  SU SU S3$ Ub  U  SU SU SU S3$ U  SU SU S3$ [        U 5      e)Nrd   |r[    re    * rf    ^ rg    || )r_   r`   z_propagate_nan(, )rh   welford_combine(ri   , {})rz   r   z	_combine(z, static_cast<float>(), )))	r   rl   
isinstancetuplereduction_projecthasattrr   is_vecr   )
r   var
next_valuer   	src_dtypeis_boolconjunctionmeanm2weights
             rX   reduction_combiner      s    5::%G$c#a}Aj\22c*&&"c*&&d:,''' !R
|1EE))!#bA66**j%(()Df0LDf!#d4&2$bDD--J((  EJJ.%% ()3%7LZLX[\a[bbcdd &&iu4I*UWX $%Yse2j\E7!LL$%Yse2j\CC

((rW   c                 X    [        U 5      (       a  U S3U S3U S34$ U S;   a  U S3$ U$ )Nz.meanz.m2z.weightrz   z.index)r)   )r   accs     rX   r   r     sF    N++e}SkcU'?::	/	/f~JrW   codeiter_varnew_iter_var
loop_startloop_endreturnc                    [        5       n[        R                  " 5        nUR                  S[         SU S[        U5       S3U S[        U5       SU S3-   5        UR                  UR                  5       5        [        U R                  5       H  u  px[        U[        [        45      (       d   eSn	[        U[        5      (       a  UR                  n	UR                  n[        R                   " S	U -   S	-   U U5      n
U	(       a  [        X5      n
UR                  U
5        M     SSS5        U$ ! , (       d  f       U$ = f)
a  
f(iter_var) is transformed to f(new_iter_var) under the inner loop
  \/
for (new_iter_var = loop_start; new_iter_var < loop_end; new_iter_var++) {
    f(new_iter_var)
}
Please be careful while using this function,
as the variable defined in f(iter_var) will be invalid outside the for loop.
For example:
auto tmp0 = in_ptr[x0]; ->
for (new_x0 = start; new_x0 < end; new_x0++){
    auto tmp0 = in_ptr[new_x0];
}
The tmp0 is invalid outside the loop.
zfor (r    = ; < ; ++)N\b)r7   
contextlib	ExitStack	writelinerJ   rE   enter_contextindent	enumerate_linesr   rp   r;   namelineresub)r   r   r   r   r   transformed_codestack_r   deferred_namenew_lines              rX   move_code_under_inner_loopr     s<   , $~				5""J<qc+j2I1J!Lc+h"7!8<.LM	
 	,3356 -GA     !M$-- $		yyvve
3e;~PTUH'@&&x0 . 
 , - 
 	, s   D
D44
Eacc_varacc_typer   r   lenc                     [        5       n[        R                  " 5       (       a  SU  SU SU S3O
U SU  SU S3nUR                  U 5        UR	                  SU S	3S
SU  SU" X#5       S3S/5        U$ )a  
MSVC don't support dynamic array(VLA). So we use std::unique_ptr here.
Ref: https://stackoverflow.com/questions/56555406/creating-dynamic-sized-array-using-msvc-c-compiler
MSVC is the only one compiler without VLA. support. Since MSVC can't get good performance here.
We just use unique_ptr make it works on MSVC.
For other compilers, we continue to use VLA to get best performance.
auto z_arr = std::make_unique<z[]>();r   _arr[];for (int i = 0; i < ; i++){    z
_arr[i] = r   r   )r=   r   
is_msvc_clr   
writelines)r   r   r   r   r   init_fncode_bufferacc_decls           rX   reduction_prefix_arrayr   =  s     !"K !!## y0
$se2Fz7)5R0 
 XJ("3%v.7):gn&D%EQG		
 rW   bufferr   new_namec                 `   [        U R                  5       H  u  p4[        U[        [        45      (       d   e[        U[        5      (       a1  [
        R                  " SU -   S-   U UR                  5      Ul        Mh  [
        R                  " SU -   S-   U U5      U R                  U'   M     g )Nr   )r   r   r   rp   r;   r   r   r   )r   r   r   ir   s        rX   replace_acc_namer   ^  s    V]]+
 
 	
 
 dL))u$058XJSDI!vve&7%&?H:PTUFMM! ,rW   r   c                     U R                  U5      (       d  [        R                  R                  $ XS-   0n[	        X5      n[        R
                  " X0-
  5      $ Nr5   )hassympySZeror0   simplify)r   r   replacement	new_indexs       rX   	stride_atr   m  sF    99S>> ww||a.K5.I>>)+,,rW   
vec_lengthc                   ^^^^	 SmSm	UUU4S jnU	UU4S jnU n[         R                  " SSS9nU R                  [        5      (       a  U R	                  [        TU5      U5      n [         R                  " SSS9nU R                  [
        5      (       a  U R	                  [        TXg5      U5      n [         R                  " U 5      n X:w  a  [        U TT5      $ U $ )ai  
Simplifies the index expression within the range of a vectorized loop.
Given a vectorized loop variable `var` in the range of a loop with `vec_length`,
this function transforms the `index` into an equivalent form. It handles
simplifications for cases where `var` can be expressed as `vec_length * a + b`,
where `b` ranges from 0 to `vec_length - 1`. The function reduces occurrences
of `FloorDiv` and `ModularIndexing` in the `index` with best-effort optimizations.

NOTE:
The simplified index expression is intended for analysis purposes only, not
for code generation. It replaces `FloorDiv` and `ModularIndexing` with free variables
which are not dependent on the loop variable `var` in the vectorized range. Check
https://github.com/pytorch/pytorch/pull/117221#discussion_r1449746217 for more details.

Examples:
1. If `var` is `x3` and `vec_length` is 16, and `x3 = 16*a + b`, then
   `FloorDiv(x3, div)` or `ModularIndexing(x3, div, mod)` becomes a free variable
   when `div` is divisible by 16.
2. `ModularIndexing(x3, 1, mod)` can be simplified to `x3 + c` where `c` is a free
   variable when `mod` is divisible by 16.
r   c                    > [        TU 5      n[        R                  " U T5      T:X  a   [        R                  " T ST 35      nTS-  mU$ )N_div_cr5   )r   r   gcdSymbol)divisorresultdiv_freevar_idr   r   s     rX   visit_indexing_div7simplify_index_in_vec_range.<locals>.visit_indexing_div  sK    #w'99Wj)Z7\\SE/?"@AFaNrW   c                 "  > [        TX5      n[        R                  " U T5      T:X  a"  [        R                  " T ST 35      nTS-  mU$ U S:X  a>  [        R                  " UT5      T:X  a#  T[        R                  " T ST 35      -   nTS-  mU$ )N_mod_cr5   )r   r   r   r   )r   modulusr   mod_freevar_idr   r   s      rX   visit_modular_indexing;simplify_index_in_vec_range.<locals>.visit_modular_indexing  s     g799Wj)Z7\\SE/?"@AFaN  \eii<
J5<<3%vn5E(FGGFaNrW   r   T)integerr   )r   Wildr   r   replacer   r   simplify_index_in_vec_range)
r   r   r   r   r   original_indexdivmodr   r   s
    ``     @@rX   r  r  y  s    0 NN	 N
**Y
-CyyhsC02DE
**Y
-Cyy!!oc3<>TUNN5!E*5#zBBLrW   c                 >    U(       a  [        XU5      n [        X5      $ N)r  r   )r   r   r   s      rX   stride_at_vec_ranger
    s     +E
CU  rW   c                   .    \ rS rSr% Sr\\S'   \\S'   Srg)ParallelDepthi  zo
A class representing parallel depth.
Includes the starting depth of parallelism and the depth of parallelism.
parallel_depthstart_depthrV   N)__name__
__module____qualname____firstlineno____doc__rn   __annotations____static_attributes__rV   rW   rX   r  r    s    
 rW   r  c                   r   ^  \ rS rSr\S\S\4S j5       rSSS\\\	\
4      4U 4S jjrS	 rS
 rS rSrU =r$ )OuterLoopFusedSchedulerNodei  node1node2c                    UR                   UR                   L d   e[        S X4 5       5      (       d   e[        S X4 5       5      (       av  U " UR                   [        U5      [        L a  [        UR                  5       5      OU/[        U5      [        L a   [        UR                  5       5      -   U5      $ U/-   U5      $ U " UR                   X/U5      $ )Nc              3   \   #    U  H"  n[        U5      [        [        [        4;   v   M$     g 7fr	  )typer  r#   r!   .0nodes     rX   	<genexpr>3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>  s2      
 ' J+" '   *,c              3   D   #    U  H  n[        U5      [        L v   M     g 7fr	  r  r  r  s     rX   r   r!         T^TtDz88^    )	schedulerallrg   r  r  listget_outer_nodes)clsr  r  outer_loop_fusion_depths       rX   fuse OuterLoopFusedSchedulerNode.fuse  s     %//111 
 
 
 
 	
 
 Te^TTT E{&AA ..01  E{&AA ..01 (!   (! & u8OPPrW   r'  r"   outer_fused_nodesc                    > UU l         X0l        / nU R                    HH  n[        U[        [        45      (       d   eUR                  [        UR                  5       5      5        MJ     [        TU ]%  X5        g r	  )
r/  r,  r   r#   r!   extendr)  	get_nodessuper__init__)selfr'  r/  r,  flatten_snodes_node	__class__s         rX   r4  $OuterLoopFusedSchedulerNode.__init__  so      	 (?$++Eem5G%HIIII!!$u'8"9: , 	3rW   c                     U R                   $ r	  )r/  r5  s    rX   r*  +OuterLoopFusedSchedulerNode.get_outer_nodes  s    %%%rW   c           
      j  ^ S[         S[         S[        S[        S[        4
U4S jjm[        [	        U5      S-
  5       H4  nX   R
                  nXS-      R
                  nT" UUUS5      (       a  M4    g	   U H  n[        R                  " [        R                  UR                  S U 5      n[	        UR                  5      U:  d  MP  [        U[        R                  5      (       d  Mq  [        UR                  U   [        R                  5      (       d  M  US
-  UR                  U   :  d  M    g	   g)Nleft_loop_nestright_loop_nestloop_fusion_depthcurrent_checking_depthr   c                   >^^ U R                   (       d   eUR                   (       d   eU R                   U   mUR                   U   m/ SQn[        UU4S jU 5       5      (       d  gUS:  d   eUS-
  =nS:  aL  US-   nU[        U R                   5      :  d   eU[        UR                   5      :  d   eT" U UUU5      (       d  gg)N)r   sizeoffsetstepsc              3   V   >#    U  H  n[        TU5      [        TU5      :H  v   M      g 7fr	  )getattr)r  attr_compareleft_loop_levelright_loop_levels     rX   r   aOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner.<locals>.<genexpr>  s2       )F O\:/>?(Es   &)Fr5   r   T)loopsr(  r   )r>  r?  r@  rA  outer_loops_attr_compare_listrI  rJ  _inners        @@rX   rN  NOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner  s     "''''"((((,223IJO.445KL-)   )F   $)))%6%::!a?)?!)C&-N4H4H0IIII-O4I4I0JJJJ"#%*	  !rW   r5   r   F,  T)LoopNestrn   rl   ranger   	loop_nest	functoolsreduceoperatormulrangesr   r   Integer)	r5  cpp_kernel_proxy_listr,  idxr>  r?  cpp_kernel_proxyouter_rangesrN  s	           @rX   "check_outer_fusion_loop_level_attr>OuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr  s;   (	$(	%(	  #(	 %(	(	
 (	T 23a78C27AAN3!G<FFO'	   9 !6$++ ''(@)@AL $++,/FF|U]];;$++,CDMM  !3&"))*ABC ) !6, rW   c                 H   US   R                   n[        U5      nU Vs/ s H(  nUR                  R                  U R                  5      PM*     snUl        US   nX5R                  l        UR                  R                  S U R                   UR                  l        U$ s  snf Nr   )kernel_groupOuterLoopFusedKernelrS  from_loop_levelr,  innerkernelrL  )r5  rZ  rb  outer_loop_fused_kernelproxyouter_fused_proxys         rX   merge_outer_fusion_kernels6OuterLoopFusedSchedulerNode.merge_outer_fusion_kernelsZ  s     -Q/<<"6|"D /)
. OO++D,H,HI.)
% 2!4-D##*,=,G,G,M,M*d**-
##) ! )
s   /B)r/  r,  )r  r  r  r  classmethodr   r-  r)  r	   r!   r#   r4  r*  r^  rj  r  __classcell__r8  s   @rX   r  r    sg    !Q%!Q.?!Q !QF44  &8-&G HI4 &Tl! !rW   r  c                   >    \ rS rSrS
S\4S jjrS rS rS rS r	Sr
g	)RecordOptimizationContextil  	func_namec                 ,    Xl         S U l        S U l        g r	  )rq  current_nodeopt_ctx)r5  rq  s     rX   r4  "RecordOptimizationContext.__init__m  s    "596:rW   c                    [         R                  (       d   e[         R                  R                  (       d   e[         R                  R                  U l        U R                  c   e[        R                  U R                  R
                  ;   a-  U R                  R
                  [        R                     U l        O[        5       U l        U R                  c   eU R                  U R                  l        U $ r	  )	r4   interpreterrs  rA   keymetart  rq  ops_namer;  s    rX   	__enter__#RecordOptimizationContext.__enter__r  s    }}}}}))))MM66  ,,,""d&7&7&<&<<,,112E2I2IJDL.0DL||''' $rW   c                     U R                   (       d   eU R                  (       d   eU R                  U R                   R                  [        R                  '   g r	  )rs  rt  ry  rA   rx  r5  exc_typeexc_valexc_tbs       rX   __exit__"RecordOptimizationContext.__exit__  s>        |||:>,,2667rW   c                     U R                   $ r	  )rt  r;  s    rX   get_opt_ctx%RecordOptimizationContext.get_opt_ctx  s    ||rW   c                 @    U R                   (       d   eU R                   $ r	  )rs  r;  s    rX   get_fx_node%RecordOptimizationContext.get_fx_node  s           rW   )rs  rq  rt  N)rS   )r  r  r  r  rp   r4  r{  r  r  r  r  rV   rW   rX   rp  rp  l  s#    ;# ;
G
!rW   rp  c                  ~    [        S U  5       5      (       a   S5       e[        U 5      =nb	  [        U   $ SU S    S3$ )Nc              3   h   #    U  H(  n[        U[        5      =(       a    UR                  v   M*     g 7fr	  )r   rG   r   )r  args     rX   r   $decltype_promoted.<locals>.<genexpr>  s#     RTc:c>2AszzATs   02z*Promotion of vector types is not supported	decltype(r   r   )rg   rI   rH   )argsdts     rX   decltype_promotedr    sS    RTRRR 4R  %%2B47)1%%rW   c                   (   \ rS rSrSr\S 5       r\S 5       r\S 5       r\SNS j5       r	\S 5       r
\S	 5       r\S
 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r \S 5       r!\S  5       r"\S! 5       r#\S" 5       r$\S# 5       r%\S$ 5       r&\S% 5       r'\S& 5       r(\S' 5       r)\S( 5       r*\S) 5       r+\S* 5       r,\S+ 5       r-\S, 5       r.\S- 5       r/\S. 5       r0\S/ 5       r1\S0 5       r2\S1 5       r3\S2 5       r4\S3 5       r5\S4 5       r6\S5 5       r7\S6 5       r8\S7 5       r9\S8 5       r:\S9 5       r;\S: 5       r<\S; 5       r=\S< 5       r>\S= 5       r?\S> 5       r@\S? 5       rA\S@ 5       rB\SA 5       rC\SB 5       rD\SC 5       rE\SD 5       rF\SE 5       rG\SF\HR                  SG\HR                  4SH j5       rJ\SF\HR                  SG\HR                  4SI j5       rK\SF\HR                  SG\HR                  4SJ j5       rL\SK 5       rM\SL 5       rNSMrOg)OCppOverridesi  zMap element-wise ops to C++c                 *    [        X5       SU  SU S3$ )N( + r   r  abs     rX   addCppOverrides.add  !    #A)*!A3c!A66rW   c                 *    [        X5       SU  SU S3$ )Nr   - r   r  r  s     rX   r   CppOverrides.sub  r  rW   c                 *    [        X5       SU  SU S3$ )Nr  r   r   r  r  s     rX   rW  CppOverrides.mul  r  rW   Nc                    [        U [        5      (       d   eUc  U R                  n[        R                  R                  XU5      n[        R                  R                  R                  [        R                  R                  U5      nUR                  SX4SU05        U[        ;   a5  U[        R                  :X  a!   [        R                  R                  XXQ5        U$ )Nto_dtyper   )r   rG   r   r4   rf  get_to_dtype_exprcsegeneratecomputeupdate_on_argsr   r   ro   cache_dtype_convert)xr   r   use_compute_typesexprcsevars         rX   r  CppOverrides.to_dtype  s    !^,,,,Ixx))!I>&&qxx'7'7>j1*{I6NOM!i5;;&>> HH((vErW   c                 X    U[         ;   d   U S[         S35       eS[         U    SU  S3$ )Nz missing from z.DTYPE_TO_CPPzc10::bit_cast<>(r   )rH   r  )r  r   r   s      rX   to_dtype_bitcastCppOverrides.to_dtype_bitcast  s=    $U~hZ}&UU$U 34Bqc;;rW   c                     SU  S3$ )Nz	std::abs(r   rV   r  s    rX   absCppOverrides.abs      1#QrW   c                     SU  S3$ )Nz	std::sin(r   rV   r  s    rX   sinCppOverrides.sin  r  rW   c                     SU  S3$ )Nz	std::cos(r   rV   r  s    rX   cosCppOverrides.cos  r  rW   c                     SU  SU  S3$ )Nr  z)(-r   rV   r  s    rX   negCppOverrides.neg      1#S1%%rW   c                     SU  S3$ )Nz	std::exp(r   rV   r  s    rX   expCppOverrides.exp  s     1#QrW   c                     SU  S3$ )Nz
std::exp2(r   rV   r  s    rX   exp2CppOverrides.exp2      A3a  rW   c                     SU  S3$ )Nzstd::expm1(r   rV   r  s    rX   expm1CppOverrides.expm1      QCq!!rW   c                     SU  S3$ )Nz	std::erf(r   rV   r  s    rX   erfCppOverrides.erf  r  rW   c                     SU  S3$ )Nz
std::erfc(r   rV   r  s    rX   erfcCppOverrides.erfc  r  rW   c                     SU  S3$ )Nzcalc_erfinv(r   rV   r  s    rX   erfinvCppOverrides.erfinv      aS""rW   c                     SU  S3$ )Nz
std::sqrt(r   rV   r  s    rX   sqrtCppOverrides.sqrt   r  rW   c                     SU  S3$ )Nz1 / std::sqrt(r   rV   r  s    rX   rsqrtCppOverrides.rsqrt  s    s!$$rW   c                     [         R                  R                  nUS:X  a  U  SU  S3$ Uc  SU  S3$ [        SU< 35      e)Naccuracy + decltype()(1)zstd::log1p(r   8unrecognized config cpp.inject_log1p_bug_TESTING_ONLY = r   cppinject_log1p_bug_TESTING_ONLYr   r  bugs     rX   log1pCppOverrides.log1p  sW    jj66*SQCt,,[ 1%% J3'R rW   c                     SU  S3$ )Nz	std::tan(r   rV   r  s    rX   tanCppOverrides.tan  r  rW   c                     SU  S3$ )Nz
std::tanh(r   rV   r  s    rX   tanhCppOverrides.tanh  r  rW   c                 0    [         (       a  SU  S3$ SU  S3$ )z
On windows std::signbit only support float type.
Ref: https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/signbit?view=msvc-170
z std::signbit(static_cast<float>(r   zstd::signbit(r   rT   r  s    rX   signbitCppOverrides.signbit  s/     { /qc4	
 !1%	
rW   c                     SU  SU S3$ )Nz	std::pow(r   r   rV   r  s     rX   powCppOverrides.pow(  s    1#Rs!$$rW   c                     SU  S3$ )Nz	std::log(r   rV   r  s    rX   logCppOverrides.log,  r  rW   c                     SU  S3$ )Nzstd::nearbyint(r   rV   r  s    rX   roundCppOverrides.round0  s     1%%rW   c                     SU  S3$ )Nzstd::floor(r   rV   r  s    rX   floorCppOverrides.floor4  r  rW   c                 H    U  SU 3nU  SU 3nSU  SU SU SU SU SU S	3$ )
N /  % ((z
 < 0) != (z	 < 0) ? (z != 0 ? z - 1 : z) : r   rV   )r  r  quotrems       rX   floordivCppOverrides.floordiv8  sR     Cs|3qclA3j9SE$wtfDQUPVVWXXrW   c                     SU  S3$ )Nz
std::ceil(r   rV   r  s    rX   ceilCppOverrides.ceil?  r  rW   c                     SU  S3$ )Nzstd::trunc(r   rV   r  s    rX   truncCppOverrides.truncC  r  rW   c                     U  SU 3$ Nr   rV   r  s     rX   truncdivCppOverrides.truncdivG  s     Cs|rW   c                     SU  SU S3$ )Nz
std::fmod(r   r   rV   r  s     rX   fmodCppOverrides.fmodL  s    A3b1%%rW   c                     SU  S3$ )Nzstd::isinf(r   rV   r  s    rX   isinfCppOverrides.isinfP  r  rW   c                     SU  S3$ )Nzstd::isnan(r   rV   r  s    rX   isnanCppOverrides.isnanT  r  rW   c                     SU  S3$ )Nzstd::lgamma(r   rV   r  s    rX   lgammaCppOverrides.lgammaX  r  rW   c                     SU  S3$ )Nz
std::acos(r   rV   r  s    rX   acosCppOverrides.acos\  r  rW   c                     SU  S3$ )Nzstd::acosh(r   rV   r  s    rX   acoshCppOverrides.acosh`  r  rW   c                     SU  S3$ )Nz
std::cosh(r   rV   r  s    rX   coshCppOverrides.coshd  r  rW   c                     SU  S3$ )Nz
std::sinh(r   rV   r  s    rX   sinhCppOverrides.sinhh  r  rW   c                     SU  S3$ )Nz
std::asin(r   rV   r  s    rX   asinCppOverrides.asinl  r  rW   c                     SU  S3$ )Nzstd::asinh(r   rV   r  s    rX   asinhCppOverrides.asinhp  r  rW   c                     SU  SU S3$ )Nzstd::atan2(r   r   rV   r  ys     rX   atan2CppOverrides.atan2t      QCr!A&&rW   c                     SU  S3$ )Nz
std::atan(r   rV   r  s    rX   atanCppOverrides.atanx  r  rW   c                     SU  S3$ )Nzstd::atanh(r   rV   r  s    rX   atanhCppOverrides.atanh|  r  rW   c                     SU  SU S3$ )Nzstd::copysign(r   r   rV   r0  s     rX   copysignCppOverrides.copysign  s    s"QCq))rW   c           	         SU  S3SU  S34n[        S U 5       5      (       a  [        S U 5       5      $ [        5       n[        R                  R
                  R                  [        R                  S9n[        R                  R
                  R                  U R                  S9nUR                  SU S35        UR                  S	U S
U  SU S35        [        R                  R                  R                  U5        XC4n[        X5       H.  u  pg[        R                  R
                  R                  Xg5        M0     XC4$ )Nfrexp()[0])[1]c              3   x   #    U  H0  n[         R                  R                  R                  U5      S Lv   M2     g 7fr	  r4   rf  r  try_getr  	cache_keys     rX   r   %CppOverrides.frexp.<locals>.<genexpr>  (     WJyqxx||##I.d:J   8:c              3   t   #    U  H.  n[         R                  R                  R                  U5      v   M0     g 7fr	  rC  rE  s     rX   r   rG    &     U*Y--i88*   68r   zint32_t r   r   z = std::frexp(, &r   )r(  r   r7   r4   rf  r  newvarr   int32r   r   r  splicezipput)r  
cache_keysr   exponentmantissacse_varsrF  cse_vars           rX   frexpCppOverrides.frexp  s   aS%s$'77
WJWWWU*UUU~88<<&&U[[&988<<&&QWW&5(1-.xjqcXJbIJ	%'"%j";IHHLLY0 #<!!rW   c                     SU  SU S3$ )Nzstd::hypot(r   r   rV   r0  s     rX   hypotCppOverrides.hypot  r4  rW   c                     SU  S3$ )Nzstd::log10(r   rV   r  s    rX   log10CppOverrides.log10  r  rW   c                     SU  S3$ )Nz
std::log2(r   rV   r  s    rX   log2CppOverrides.log2  r  rW   c                     SU  SU S3$ )Nzstd::nextafter(r   r   rV   r0  s     rX   	nextafterCppOverrides.nextafter  s     2aS**rW   c                     [         R                  R                  nUS:X  a  gUS:X  a  U  S3$ US:X  a  U  SU  S3$ Uc	  SU  S	U  S
3$ [        SU< 35      e)Ncompile_errorcompile error!runtime_error	; throw 1r  r  r  z	std::max(, decltype()(0))7unrecognized config cpp.inject_relu_bug_TESTING_ONLY = r   r  inject_relu_bug_TESTING_ONLYr   r  s     rX   reluCppOverrides.relu  s|    jj55/!#O#S	?"JSQCt,,[qcQCu55 I#Q rW   c                     SU  SU S3$ )Nzmin_propagate_nan(r   r   rV   r  s     rX   minimumCppOverrides.minimum      #A3b1--rW   c                     SU  SU S3$ )Nzmax_propagate_nan(r   r   rV   r  s     rX   maximumCppOverrides.maximum  rv  rW   c                     U  SU SU 3$ )N ?  : rV   )r  r  cs      rX   whereCppOverrides.where  s    Cs#aS!!rW   c                     SU  SU S3$ )Nzmod(r   r   rV   r  s     rX   r  CppOverrides.mod  s    aS1#QrW   c                 (    [        U [        U   5      $ r	  )rP   rH   )valr   s     rX   constantCppOverrides.constant  s    Ce!455rW   c                    [        [        R                  R                  U 5      5      n[        R                  R                  R                  [        R                  R                  U[        U 5      S9n[        R                  " X15      $ )Nbounds)
rD   r4   rf  rename_indexingr  r  r  r%   r2   r  )r  r   idx_strr   s       rX   
index_exprCppOverrides.index_expr  s`    0067hhll##HHg.CD.I $ 
 ||C''rW   c                 6   [        5       n[        R                  R                  R	                  5       nUR                  SU S35        [        R                  R                  U5         UR                  5          U" 5       nUR                  SU S35        S S S 5        S S S 5        UR                  S5        [        R                  R                  R                  U5        [        USU S35      nU  SU SU 3$ ! , (       d  f       Nj= f! , (       d  f       Ns= f)	Nr    = [&]return r   r  z())r{  z() : )r7   r4   rf  r  rO  r   swap_buffersr   r  rQ  rP   )maskbodyotherr   body_varr   
other_codes          rX   maskedCppOverrides.masked  s    ~ 88<<&&(xj/0XX""4($++-VFNNWVHA./ +8( 	s	% "%9XJc)BC
s8*E*66 +8-((s$   'D
8C9D
9
D	D


Dc                     U  SU 3$ )N && rV   r  s     rX   logical_andCppOverrides.logical_and      D}rW   c                     SU  3$ )N!rV   r  s    rX   logical_notCppOverrides.logical_not      1#wrW   c                     U  SU 3$ )Nr   rV   r  s     rX   
logical_orCppOverrides.logical_or  r  rW   c                     U  SU 3$ )N != rV   r  s     rX   logical_xorCppOverrides.logical_xor  r  rW   c                     SU  SU  SU S3$ )Nr  )( & r   rV   r  s     rX   bitwise_andCppOverrides.bitwise_and      1#Rs#aS**rW   c                     SU  SU  S3$ )Nr  z)(~r   rV   r  s    rX   bitwise_notCppOverrides.bitwise_not  r  rW   c                     SU  SU  SU S3$ )Nr  r   | r   rV   r  s     rX   
bitwise_orCppOverrides.bitwise_or  r  rW   c                     SU  SU  SU S3$ )Nr  r  r   r   rV   r  s     rX   bitwise_xorCppOverrides.bitwise_xor  r  rW   c                    [        5       nUR                  S5        UR                  5          [        U R                     nUR                  SU SU S35        UR                  SU SU SU S35        UR                  5          UR                  S	U  S
35        S S S 5        UR                  S	U  SU SU  SU S3	5        S S S 5        UR                  S5        U$ ! , (       d  f       NG= f! , (       d  f       N2= f)N[&]()constexpr decltype() max_shift = sizeof(z) * CHAR_BIT;$if ((static_cast<std::make_signed_t<>>() < 0) || ( >= max_shift))return decltype(z)(0);z#)(static_cast<std::make_unsigned_t<z) << r   ()r7   r   r   rH   r   r  r  r   scalar_ts       rX   bitwise_left_shiftCppOverrides.bitwise_left_shift  s    ~w[[]#AGG,HNN%aS(=hZ}U NN6xjA3kRSQTTcd !1!E:; NN"1#%H
RUVWUXX]^_]``bc  	t  ]s$   AC+C&C+
C(	$C++
C9c           
         [        5       nUR                  S5        UR                  5          [        U R                     nUR                  SU SU SU S35        UR                  SU SU SU S	35        UR                  5          UR                  S
U  SU  S35        S S S 5        UR                  S
U  SU  SU S35        S S S 5        UR                  S5        U$ ! , (       d  f       ND= f! , (       d  f       N2= f)Nr  r  r  z ) * CHAR_BIT - std::is_signed_v<z>;r  r  r  r  r  r  z >> max_shift); >> r   r  r  r  s       rX   bitwise_right_shift CppOverrides.bitwise_right_shift  s    ~w[[]#AGG,HNN%aS(=hZGghpgqqst NN6xjA3kRSQTTcd !1!BqcIJ NN-aS1#T!B?@  	t	  ]s$   AC.C#C.
C+	'C..
C<seedrD  c                     SU  SU S3$ )Nznormalized_rand_cpu(r   r   rV   r  rD  s     rX   randCppOverrides.rand(  s    %dV2fXQ77rW   c                     SU  SU S3$ )Nz
randn_cpu(r   r   rV   r  s     rX   randnCppOverrides.randn,  s    D6F81--rW   c           	           SU  SU SU SU S3	$ )Nzrandint64_cpu(r   r   rV   )r  rD  lowhighs       rX   	randint64CppOverrides.randint640  s#    vRxr#ba@@rW   c                     SU  SU  SU  S3$ )Nr  z)(1) / (decltype(z)(1) + std::exp(-r   rV   r  s    rX   sigmoidCppOverrides.sigmoid4  s    1#.qc1B1#RHHrW   c           
      X   [        5       nSU  S3nSU  S3nUR                  S5        UR                  5          UR                  SU  SU SU S35        UR                  S	U  S
U SU S35        UR                  S5        S S S 5        UR                  S5        U$ ! , (       d  f       N!= f)Nr  )(0)r  r  auto left = z > 0 ? r|  r   auto right = z < 0 ? return left - right;r  r7   r   r   )r  r   scalar_zero
scalar_ones       rX   signCppOverrides.sign8  s    ~!!D) 4(
w[[]NN\!GJ<s;-qQRNN]1#WZLK=PQRSNN12  	t ]s   AB
B)rV   NT)Pr  r  r  r  r  staticmethodr  r   rW  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r!  r$  r'  r*  r-  r2  r6  r9  r<  rY  r\  r_  rb  re  rq  rt  rx  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   Exprr  r  r  r  r  r  rV   rW   rX   r  r    s   %7 7 7 7 7 7 ( (T < <             & &     ! ! " "     ! ! # # ! ! % % 	 	     ! ! 	
 	
 % %     & & " " Y Y ! ! " "   & & " " " " # # ! ! " " ! ! ! ! ! ! " " ' ' ! ! " " * * " "  ' ' " " ! ! + +   . . . . " "     6 6 ( ( 7 7          + + & & + + + +  &  " 85:: 8uzz 8 8 .EJJ .

 . . A

 AEJJ A A I I 
 
rW   r  r  c                      ^  \ rS rSrSrU 4S jr\S 5       r\S 5       r\S 5       r	\S 5       r
\S 5       r\S	 5       r\S
 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r\S 5       r \S 5       r!\S 5       r"\S  5       r#\S! 5       r$\S" 5       r%\S# 5       r&\S$ 5       r'\S% 5       r(\S& 5       r)\S' 5       r*\S( 5       r+\S) 5       r,\S* 5       r-\S+ 5       r.\S, 5       r/\S- 5       r0\S. 5       r1\S/ 5       r2\S0 5       r3\S1 5       r4\S2 5       r5\S3 5       r6\S4 5       r7\S5 5       r8\S6 5       r9\S7 5       r:\S8 5       r;\S9 5       r<\S: 5       r=\S; 5       r>\S< 5       r?\S= 5       r@\S> 5       rA\S? 5       rB\S@ 5       rC\SA 5       rD\SB 5       rE\SC 5       rF\SD 5       rG\SE 5       rH\SF 5       rI\SG 5       rJ\SH 5       rK\SI 5       rL\SJ 5       rM\SK 5       rN\STSL j5       rO\SM 5       rP\SN 5       rQ\SO 5       rR\SP 5       rS\TSQ 5       rU\TSR 5       rVSSrWU =rX$ )UCppVecOverridesiI  z.Map element-wise ops to aten vectorization C++c                    >^ [         TU ]  U 5      mUU4S jn[        [        5      R	                  5        H@  u  pE[        USS 5      [        :X  d  M  US;  d  M$  [        TXC" UR                  5      5        MB     T$ )Nc                    >^  UU U4S jnU$ )Nc                    > U  Vs/ s HT  n[        U[        [        R                  45      (       d*  [        U[        5      (       d  M?  UR
                  (       a  MR  UPMV     nnU  Vs/ s H/  n[        U[        5      (       d  M  UR
                  (       d  M-  UPM1     nn[        U 5      nU(       a  U(       a  / nU  H  n[        U[        [        R                  45      (       a  [        U[        R                  5      (       a7  UR                  (       d&  [        R                  " U[        R                  5      nO%[        R                  " U[        R                  5      n[        U[        5      (       a  UR                  OUnUR                  U5        M     U(       a@  [!        U5      S:X  a  [#        U5      nO%T
[$        R&                  :X  a  [#        USS  5      USS & U(       a  U(       a  [        [(        R*                  [,        5      (       d   eU Vs/ s H}  n[        U[        5      (       ac  UR
                  (       dR  T
[$        R.                  [$        R0                  [$        R2                  4;  a  [(        R*                  R5                  U5      OUPM     nnU(       a  T
" U0 UD6$ [7        [$        T5      n[9        UT
R:                  5      nUc   eU" U 0 UD6$ s  snf s  snf s  snf )Nr   r5   )r   rn   r   r  rG   r   r)  	is_numberr2   r  r   int64r  r3   valueappendr   rM   r  r~  r4   rf  CppVecKernelr  r  r  	broadcastr3  rG  r  )r  kwargsr  scalarsvectorsnew_argsnew_arg
scalar_opsscalar_funcr8  funcr5  s            rX   wrapper6CppVecOverrides.__new__.<locals>.wrap.<locals>.wrapper\  s`     $#!#UZZ'899"37  AD

 #    $#!#~6 ;>:: #  
  :w!H#%cC+<==)#uzz::3==&)nnS%++&F&)ll3&D/9#x/H/H#))cC ,  $  8})#/#9!6!66'3HQRL'A w%ahh====  (0  (0G !+7N C C(/$($3$8$8$3$9$9$3$=$=("%"	 HH..w7 ")) (0   $ 4V44 "'!=J")*dmm"DK&222&777@ s*   >KKK'K"K"K"BK'rV   )r  r  r8  r5  s   ` rX   wrap%CppVecOverrides.__new__.<locals>.wrapO  s    @8D NrW   r8  )r  r  )	r3  __new__varsr  itemsrG  r  setattr__func__)r+  r  kargsr  r   methodr5  r8  s         @rX   r  CppVecOverrides.__new__L  sq    ws#O	b !1779LDv{D1\Ad S G dD$9: : rW   c                     U  SU 3$ )Nr  rV   r  s     rX   r  CppVecOverrides.add      Cs|rW   c                     U  SU 3$ )Nr  rV   r  s     rX   r   CppVecOverrides.sub  r  rW   c                     U  SU 3$ Nr   rV   r  s     rX   rW  CppVecOverrides.mul  r  rW   c                     U  SU 3$ r  rV   r  s     rX   truedivCppVecOverrides.truediv  r  rW   c                     U  S3$ )Nz.abs()rV   r  s    rX   r  CppVecOverrides.abs      F|rW   c                     U  S3$ )Nz.sin()rV   r  s    rX   r  CppVecOverrides.sin  r  rW   c                     U  S3$ )Nz.cos()rV   r  s    rX   r  CppVecOverrides.cos  r  rW   c                     U  S3$ )Nz.exp()rV   r  s    rX   r  CppVecOverrides.exp  r  rW   c                     U  S3$ )Nz.exp2()rV   r  s    rX   r  CppVecOverrides.exp2      G}rW   c                     SU  S3nU  SU 3$ )Nr  r  z	.exp() - rV   )r  vec_ones     rX   r  CppVecOverrides.expm1  s#     aS%IgY''rW   c                     U  S3$ )Nz.erf()rV   r  s    rX   r  CppVecOverrides.erf  r  rW   c                     U  S3$ )Nz.erfc()rV   r  s    rX   r  CppVecOverrides.erfc  r  rW   c                     U  S3$ )Nz	.erfinv()rV   r  s    rX   r  CppVecOverrides.erfinv      IrW   c                     U  S3$ )Nz.sqrt()rV   r  s    rX   r  CppVecOverrides.sqrt  r  rW   c                     [        [        R                  [        5      (       d   e[        U [        5      (       d   eU R
                  c   e[        R                  R                  U R
                  5       SU  SU S3$ )Nr   == r   r   r4   rf  r  rG   r   _get_mask_typer0  s     rX   eqCppVecOverrides.eq  i    !((L1111!^,,,,ww"""(())!''231QCtA3a@@rW   c                    [        [        R                  [        5      (       d   e[        U [        5      (       d   eU R
                  [        R                  :X  aN  UR
                  [        R                  :X  d   e[        [        R                  R                  X45      u  p#U SU 3$ U R
                  c   e[        R                  R                  U R
                  5       SU  SU S3$ )Nr  r  r   )r   r4   rf  r  rG   r   r   rl   rO   r  r/  )r  r1  x_casty_casts       rX   neCppVecOverrides.ne  s    !((L1111!^,,,,77ejj 77ejj(((1!((2B2BQFKNFXT&**77&&&hh--agg67q4s!DDrW   c                     [        [        R                  [        5      (       d   e[        U [        5      (       d   eU R
                  c   e[        R                  R                  U R
                  5       SU  SU S3$ )Nr  r   r   r.  r0  s     rX   ltCppVecOverrides.lt  i    !((L1111!^,,,,ww"""(())!''231QCs1#Q??rW   c                     [        [        R                  [        5      (       d   e[        U [        5      (       d   eU R
                  c   e[        R                  R                  U R
                  5       SU  SU S3$ )Nr  z > r   r.  r0  s     rX   gtCppVecOverrides.gt  r;  rW   c                     [        [        R                  [        5      (       d   e[        U [        5      (       d   eU R
                  c   e[        R                  R                  U R
                  5       SU  SU S3$ )Nr   <= r   r.  r0  s     rX   leCppVecOverrides.le  r2  rW   c                     [        [        R                  [        5      (       d   e[        U [        5      (       d   eU R
                  c   e[        R                  R                  U R
                  5       SU  SU S3$ )Nr   >= r   r.  r0  s     rX   geCppVecOverrides.ge  r2  rW   c                     U  SU 3$ Nr  rV   r0  s     rX   and_CppVecOverrides.and_  r  rW   c                     U  S3$ )Nz.rsqrt()rV   r  s    rX   r  CppVecOverrides.rsqrt      H~rW   c                     U  SU S3$ )Nz.pow(r   rV   r  s     rX   r  CppVecOverrides.pow  s    E!ArW   c                     U  S3$ )Nz.log()rV   r  s    rX   r  CppVecOverrides.log  r  rW   c                     U  S3$ )Nz.round()rV   r  s    rX   r  CppVecOverrides.round"  rM  rW   c                     U  S3$ )Nz.floor()rV   r  s    rX   r  CppVecOverrides.floor&  rM  rW   c                     U  S3$ )Nz.ceil()rV   r  s    rX   r  CppVecOverrides.ceil*  r  rW   c                     U  S3$ )Nz.trunc()rV   r  s    rX   r  CppVecOverrides.trunc.  rM  rW   c                     U  SU S3$ )Nz.fmod(r   rV   r  s     rX   r  CppVecOverrides.fmod2  s    F1#QrW   c                     U  S3$ )Nz	.lgamma()rV   r  s    rX   r  CppVecOverrides.lgamma6  r)  rW   c                 *    [        X5      u  pU  SU 3$ rH  rL   r  s     rX   r  CppVecOverrides.logical_and:      ,Q2Cs|rW   c                     SU  3$ N~rV   r  s    rX   r  CppVecOverrides.logical_not?  r  rW   c                 *    [        X5      u  pU  SU 3$ Nr  r_  r  s     rX   r  CppVecOverrides.logical_orC  ra  rW   c                 *    [        X5      u  pU  SU 3$ Nr   r_  r  s     rX   r  CppVecOverrides.logical_xorH  ra  rW   c                 *    [        X5      u  pU  SU 3$ rH  r_  r  s     rX   r  CppVecOverrides.bitwise_andM  ra  rW   c                     SU  3$ rc  rV   r  s    rX   r  CppVecOverrides.bitwise_notR  r  rW   c                 *    [        X5      u  pU  SU 3$ rg  r_  r  s     rX   r  CppVecOverrides.bitwise_orV  ra  rW   c                 *    [        X5      u  pU  SU 3$ rj  r_  r  s     rX   r  CppVecOverrides.bitwise_xor[  ra  rW   c                     U  SU 3$ )Nz << rV   r  s     rX   r  "CppVecOverrides.bitwise_left_shift`  r  rW   c                     U  SU 3$ )Nr  rV   r  s     rX   r  #CppVecOverrides.bitwise_right_shiftd  r  rW   c                     [        [        R                  [        5      (       d   e[        R                  R	                  X5       $ r	  )r   r4   rf  r  load)r   rD  s     rX   	load_seedCppVecOverrides.load_seedh  s/    !((L1111((---./rW   c                     [        [        R                  [        5      (       d   e[	        5       nSU  S3n[        XU5      $ )Nz)result[offset_idx] = normalized_rand_cpu(, offset[offset_idx]);r   r4   rf  r  r7   rF   r  rD  r   rand_functions       rX   r  CppVecOverrides.randm  sA    !((L1111~7v=ST 	 F-88rW   c                     [        [        R                  [        5      (       d   e[	        5       nSU  S3n[        XU5      $ )Nzresult[offset_idx] = randn_cpu(r}  r~  r  s       rX   r  CppVecOverrides.randnv  s<    !((L1111~9$?UVF-88rW   c                     [        [        R                  [        5      (       d   e[	        5       nSU  SU SU S3n[        XU[        R                  5      $ )Nz#result[offset_idx] = randint64_cpu(z, offset[offset_idx], r   r   )r   r4   rf  r  r7   rF   r   r  )r  rD  r  r  r   r  s         rX   r  CppVecOverrides.randint64}  sT    !((L1111~=dVCYZ]Y^^`ae`ffhiF-EErW   c                 ~    U R                   UR                   :X  d   S5       eU  S[        R                  X5       SU 3$ )Nz;remainder vec implementation expect the same inputs' dtype.z - (z) * )r   r  r  r  s     rX   	remainderCppVecOverrides.remainder  sG    ww!''! 	
I	
! D11!78QC@@rW   c                     U  S3$ )Nz.tan()rV   r  s    rX   r  CppVecOverrides.tan  r  rW   c           	          [         R                  R                  (       a"  SU  S3nSU  S3nSU  S3nU SU SU SU  SU 3	$ U  S	3$ )
Nr  r  z)(2)z)(-2)z / (z + (r   z).exp()) - z.tanh())r   r  use_decompose_tanh)r  r!  vec_twovec_minus_twos       rX   r  CppVecOverrides.tanh  sl    ::((!!D)G!!D)G's%0M)4y]O3qcWIV S= rW   c                     U  S3$ )Nz.reciprocal()rV   r  s    rX   
reciprocalCppVecOverrides.reciprocal  s    M""rW   c                     U  S3$ )Nz.atan()rV   r  s    rX   r6  CppVecOverrides.atan  r  rW   c                     U  S3$ )Nz.acos()rV   r  s    rX   r  CppVecOverrides.acos  r  rW   c                     U  S3$ )Nz.asin()rV   r  s    rX   r*  CppVecOverrides.asin  r  rW   c                     U  S3$ )Nz.cosh()rV   r  s    rX   r$  CppVecOverrides.cosh  r  rW   c                     U  S3$ )Nz.sinh()rV   r  s    rX   r'  CppVecOverrides.sinh  r  rW   c                     U  S3$ )Nz.log10()rV   r  s    rX   r_  CppVecOverrides.log10  rM  rW   c                     U  S3$ )Nz.log2()rV   r  s    rX   rb  CppVecOverrides.log2  r  rW   c                     U  SU S3$ )Nz.nextafter(r   rV   r0  s     rX   re  CppVecOverrides.nextafter  s    Ks!$$rW   c                     U  SU S3$ )Nz
.copysign(r   rV   r  s     rX   r<  CppVecOverrides.copysign  s    Jqc##rW   c                     U  SU S3$ )Nz.atan2(r   rV   r  s     rX   r2  CppVecOverrides.atan2      GA3a  rW   c                     U  SU S3$ )Nz.hypot(r   rV   r  s     rX   r\  CppVecOverrides.hypot  r  rW   c           
      <    SU  S3nSU  S3nU SU SU  SU SU  S3
$ )	Nr  r  z)(0.5)z * ((r  z)/(r  z)).log()rV   )r  r!  vec_one_halfs      rX   r9  CppVecOverrides.atanh  sE     aS%"1#V,uWIS3wis1#XNNrW   c                     U  S3$ )Nz.asinh()rV   r  s    rX   r-  CppVecOverrides.asinh  rM  rW   c                     U  S3$ )Nz.acosh()rV   r  s    rX   r!  CppVecOverrides.acosh  rM  rW   c                     [         R                  R                  nUS:X  a  gUS:X  a  U  S3$ US:X  a  U  SU  S3$ Uc	  SU  S	U  S
3$ [        SU< 35      e)Nrh  ri  rj  rk  r  r  r  zat::vec::clamp_min(rl  rm  rn  ro  r  s     rX   rq  CppVecOverrides.relu  s|    jj55/!#O#S	?"JSQCt,,[(;qc?? I#Q rW   c                     SU  SU  SU  S3$ )Nr  z)(1)/(decltype(z)(1) + z.neg().exp())rV   r  s    rX   r  CppVecOverrides.sigmoid  s    1#_QCwqcGGrW   c                     U  S3$ )Nz.neg()rV   r  s    rX   r  CppVecOverrides.neg  r  rW   c                    [        U R                  5      (       a*  U R                  UR                  :X  d   S5       eSU  SU S3$ [        S X4 5       5      (       d   eSU  S3n[        R                  R                  UR                  5      S:  a,  U SS[        R                  R                  -  S-
   S	U S
U S3nU  SU 3nSU  SU SU S3nSU  SU SU SU S3	nU SU SU SU S
U SU S3$ )NzDdiv_floor_floating_vec implementation expect the same inputs' dtype.zdiv_floor_floating_vec(r   r   c              3   L   #    U  H  n[        UR                  5      v   M     g 7fr	  )r   r   )r  items     rX   r   +CppVecOverrides.floordiv.<locals>.<genexpr>  s     G'

33   "$r  r5   ::blend<r  (1), r   r  r  r  z(0))r  r   z	(0)) != (z(0)))z	::blendv(r  r  )r   r   r(  r4   rf  _get_raw_num_vectorstiling_factor)r  r  _tr  has_remis_negs         rX   r  CppVecOverrides.floordiv  s5   !''""77agg% V% -QCr!A66GGGGGGQCq!Bxx,,QWW59d(A)?)?$?1#D"ERt5QRPSSTUSA3<D!Cs$rd$/G!Ct9QCs2$e<FT4&4&B4uWISPQRRrW   c                     [         R                  R                  UR                  5      S:  a2  SU S3nU SS[         R                  R                  -  S-
   SU SU S3nU  SU 3$ )Nr5   r  r   r  r  r  r   )r4   rf  r  r   r  )r  r  r  s      rX   r  CppVecOverrides.truncdiv  sp     88((1A5QCq!B$hQXX%;%; ;q@AB4uQCqQACs|rW   c                     U R                   [        R                  :X  aN  UR                   [        R                  :X  d   e[        [        R
                  R                  X45      u  p#U SU 3$ SU  SU S3$ )Nr  at::vec::minimum(r   r   r   r   rl   rO   r4   rf  r  r  r  a_castb_casts       rX   rt  CppVecOverrides.minimum  h    77ejj 77ejj(((1!((2B2BQFKNFXS))&qcA3a00rW   c                     U R                   [        R                  :X  aN  UR                   [        R                  :X  d   e[        [        R
                  R                  X45      u  p#U SU 3$ SU  SU S3$ )Nr  at::vec::maximum(r   r   r  r  s       rX   rx  CppVecOverrides.maximum  r  rW   c                     U  SU  3$ r  rV   r  s    rX   squareCppVecOverrides.square!  r  rW   c                    [        [        R                  [        5      (       d   eUR                  [
        R                  :X  aX  UR                  [
        R                  :X  d   e[        [        R                  R                  XU45      u  p4nSU SU SU SU S3	$ SU SU SU S[        R                  R                  XR                  5       S3	$ )Nr  
)::blendv(r   r   )
r   r4   rf  r  r   r   rl   rO   r  _get_mask_cast)r  r  r}  blendv_ablendv_bblendv_cs         rX   r~  CppVecOverrides.where%  s    !((L111177ejj 77ejj(((+?  1),(H xj
8*Bxj8*TUVVqcA3b2ahh6M6MaQXQX6Y5ZZ[\\rW   c                    [        5       nSU  S3nSU  S3nSU  SU SU SU SU  S3nSU  SU SU SU  SU S3nUR                  S5        UR                  5          UR                  S	U S
35        UR                  SU S
35        UR                  S5        S S S 5        UR                  S5        U$ ! , (       d  f       N!= f)Nr  r  r  r  r   r   r   r  r  r   r  r  r  r  )r  r   vec_zeror!  blendv_lblendv_rs         rX   r  CppVecOverrides.sign1  s    ~qc&aS%qcH:Ry8*CPQsRSTqcH:Ry1#S
RSTw[[]NN\(156NN]8*A67NN12  	t ]s   <B33
Cc                 .   U[         R                  [         R                  [         R                  [         R                  [         R
                  [         R                  [         R                  [         R                  [         R                  [         R                  [         R                  4;   d   [         SU 35       e[        U [        5      (       d   eU R                  n[         R"                  R%                  XU5      n[         R"                  R&                  R)                  [         R"                  R*                  U5      nUR-                  SX4SU05        U[.        ;   a4  U[         R                  :X  a   [         R"                  R1                  XXQ5        U$ )Nz does not support r  r   )r   rl   float64ro   bfloat16float16uint8int8rP  r  float8_e4m3fnfloat8_e5m2r  r   rG   r   r4   rf  r  r  r  r  r  r   r  )r  r   r   use_compute_dtypesr  r  s         rX   r  CppVecOverrides.to_dtype@  s   JJMMKKNNMMKKJJKKKK
 
 	2 Z)%1	2 
 !^,,,,GG	xx))!I>&&qxx'7'7>j1*{I6NOM!i5;;&>HH((vErW   c                     [         R                  R                  nUS:X  a  U  SU  S3$ Uc  U  S3$ [        SU< 35      e)Nr  r  r  z.log1p()r  r  r  s     rX   r  CppVecOverrides.log1pX  sT    jj66*SQCt,,[S>! J3'R rW   c                 
  ^ [        [        R                  [        5      (       d   e[	        5       n[        R                  R
                  R                  5       n[        R                  R                  U 5       nUR                  SU S35        [        R                  R                  U5         UR                  5          U" 5       nUR                  SU S35        S S S 5        S S S 5        S S S 5        UR                  S5        [        R                  R                  R                  U5        WR                  mU S3nU4S jnUR                  (       a  Un	OU" U5      n	[        U[         T   5      n
U" U
5      n[        W["        5      (       d   U5       eUR                  (       Ga&  [	        5       nUR                  S5        [        R                  R                  U5         UR                  5          UR                  SU S	35        UR                  5          UR                  SU S35        S S S 5        UR                  S
5        UR                  5          [        R                  R
                  R%                  [        R                  R                  U	5      n[        R                  R
                  R%                  [        R                  R                  U5      n[        U["        5      (       d   U5       e[        U["        5      (       d   U5       eTUl        TUl        [        R                  R&                  nUR                  SUR)                  X\U5       S35        S S S 5        S S S 5        S S S 5        UR                  S5        [        R                  R
                  R%                  [        R                  R                  U5      nOUR                  (       aK  [        R                  R
                  R%                  [        R                  R                  U  SU	 SU 35      nOJ[        R                  R
                  R%                  [        R                  R                  U  SU SU
 35      nUR+                  SXX&40 5        U$ ! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GN= f! , (       d  f       GNx= f! , (       d  f       GN= f! , (       d  f       GN= f)Nr   r  r  r   r  c                    > T[         R                  :X  a$  [        R                  R	                  5        SU  S3$ [        R                  R                  T5       SU  S3$ )N::from(r   r  )r   rl   r4   rf  r/  _get_vec_type)r   r   s    rX   maskify_or_vecify1CppVecOverrides.masked.<locals>.maskify_or_vecifyt  s]     EJJ& 88**,-WTF!< ..u56avQ?rW   [&]if (z.all_zero())elser{  r|  r  )r   r4   rf  r  r7   r  rO  r  r   r  r   r  rQ  r   r   rP   rH   rG   r  	overridesr~  r  )r  r  r  r   r   new_maskr   	body_coder  body_code_vecr  other_code_vecbody_vec_varother_vec_varr  r  r   s                   @rX   r  CppVecOverrides.maskedd  s   !((L1111~hhll!!#XX__T"hNNU3%v./&&t,dkkm23 /<, #
 	s	%e2J		 ==%M-i8M!%e)<=
*:6(N33=X=3???>DNN5!&&t,dkkmhZ|<=[[]NNW^,<A#>? #v&[[]#$88<<#8#8((%$L %&HHLL$9$9((&%M &lNCCQ\QC%m^DDSmSD).L&*/M'**  NN!)//(-"X!YYZ[# # /<,2 NN4 XX\\**  F ]]XX\\**  TF#m_C?O"PF XX\\**  TF#i[J<"HF
 	hU(CRHI /<m,, #"> #] #] /<m,,s   85R0-R>RR#R0#S84&S&S0)S&D
S#S&+S8
RR
R-	(R00
R?
SS&
S#S&&
S5	0S88
Tc                 `   [        [        R                  [        5      (       d   e[        R                  R	                  U 5      n[        R                  R
                  [        R                  R                     n[        R                  R                  X#5      nUS:X  a  [        R                  X5      $ Ub  [        R                  R                  R                  [        R                  R                  [        U5      [        U 5      S9n[        R                   " XQ5      n[        U["        5      (       a  UR$                  n[        R                  R'                  Xd5      nO9[        R                  R)                  S X![        R                  R                  5      nUR+                  SX40 5        U$ )Nr   r  r  )r   r4   rf  r  r  itervars
tiling_idx_try_get_const_strider  r  r  r  r  rD   r%   r2   r  r3   r  arange_load_or_store_non_contiguousr  )r  r   r   
tiling_varstrider[  r  r  s           rX   r  CppVecOverrides.index_expr  s-   !((L1111((.XX&&qxx':':;
//BQ;**477((,,''  %,7LT7R ( C LL,E%**XX__U3FXX;;eAHH$4$4F 	lTM2>rW   c           
         SU  S3SU  S34n[        S U 5       5      (       a  [        S U 5       5      $ [        U R                     n[        R
                  R                  (       a  [        R
                  R                  O[        R
                  R                  n[        5       n[        R
                  R                  R                  [        R                  S9n[        R
                  R                  R                  U R                  S9nUR                  SU 40 S9  UR                  SU 40 S9  [        R
                  R                  U R                  5      nUS	:X  a  S
U S3OSU SU S3nUR                  US	:X  a  SU S3OSU SU S35        UR                  U SU S35        UR                  S5        UR!                  5          UR                  SU S[        R
                  R                   S35        UR                  U  S[#        U5       S35        UR                  S[        R
                  R                   S35        UR                  SU S[        R
                  R                   S35        UR                  S[#        U5       S35        UR!                  5          UR                  S5        S S S 5        UR                  US	:X  a  U S[#        U5       S3OU SU S [#        U5       S35        UR                  U S!U S"[#        U5       S35        S S S 5        UR                  S#5        [        R
                  R$                  R'                  U5        Xe4n	[)        X5       H.  u  p[        R
                  R                  R+                  X5        M0     Xe4$ ! , (       d  f       N= f! , (       d  f       N= f)$Nr?  r@  rA  c              3   x   #    U  H0  n[         R                  R                  R                  U5      S Lv   M2     g 7fr	  rC  rE  s     rX   r   (CppVecOverrides.frexp.<locals>.<genexpr>  rH  rI  c              3   t   #    U  H.  n[         R                  R                  R                  U5      v   M0     g 7fr	  rC  rE  s     rX   r   r    rK  rL  rM  rY  )r  r5   at::vec::Vectorized<r   at::vec::VectorizedN<r   zat::vec::Vectorized<int32_t> r   zat::vec::VectorizedN<int32_t, > r   r  __at_align__ std::array<	> tmpbuf;.store(tmpbuf.data(), r   z!__at_align__ std::array<int32_t, z> tmpbuf_exponent;z> tmpbuf_mantissa;r   r   z@tmpbuf_mantissa[i] = std::frexp(tmpbuf[i], &tmpbuf_exponent[i]);z? = at::vec::Vectorized<int32_t>::loadu(tmpbuf_exponent.data(), z! = at::vec::VectorizedN<int32_t, z!>::loadu(tmpbuf_exponent.data(), r   z ::loadu(tmpbuf_mantissa.data(), z();)r(  r   rH   r   r4   rf  	tail_sizer  r7   r  rO  r   rP  r  _get_num_vectorsr   r   rE   r  rQ  rR  rS  )r  rT  r   rC  r   rU  rV  n_vec
mantissa_trW  rF  rX  s               rX   rY  CppVecOverrides.frexp  s_   aS%s$'77
WJWWWU*UUUagg&%&XX%7%7qxx!!QXX=S=S~88<<&&U[[&988<<&&QWW&5!b9!b9))!''2 z #6(!,(5'; 	
 	z ,H:Q71%8*AF	

 	*Qxj23w[[]NN*6("QXX5K5K4LIV NNaS 6{47H6ILMNN3AHH4J4J3KK]^ NN*6("QXX5K5K4LL^_ NN1+d2C1DFKLV  NNA: *[\ghl\m[nnpq z!B5'Ijkvw{k|j}}  A
 NN*C
|+KKX\L]K^^`a+ 0 	u	%'"%j";IHHLLY0 #<!!#  ]s&   
CN3&N"8A%N3"
N0	,N33
Oc                    ^ U4S jnU$ )Nc                  t  > U(       a   e[         R                  n[        U[        5      (       d   e[	        5       nUR                  S5        U S   R                  nUR                  U5      nUR                  (       a  UR                  OUR                  n/ n[        U   nTR                  S;   n	U	(       a  SOUn
TR                  S:X  a  [        U S      OU
n
UR                  5          [        U 5       H  u  p[        U[        5      (       a  UR                  (       d   eUR                  U:X  d   eUR                  SU SUR                   S	U S
35        UR                  U SU S[!        U5       S35        UR#                  SU S35        M  UR#                  U5        M     UR                  SU
 SUR                   S35        T" U6 nUR                  S[!        U5       S35        UR                  5          UR                  SU S
35        S S S 5        U	(       a  UR                  (       a   eSnSU SU S3nO$S[!        U5       3nUS:X  a  SU
 S3nO	SU
 SU S3nUR                  SU SU S35        S S S 5        UR                  S5        U$ ! , (       d  f       N= f! , (       d  f       N2= f) Nr  r   )r  r  r  rl   r  r  r   z> tmpbufr   z.store(tmpbufz	.data(), r   tmpbufz[i]z> tmpbuf_out;r   r   ztmpbuf_out[i] = ztmpbuf_out.data()at::vec::VecMask<,z>::fromztmpbuf_out.data(), r5   r
  z>::loaduz at::vec::VectorizedN<r  r  r  )r4   rf  r   r  r7   r   r   r  r  r  rH   r  r   r   rG   r   rE   r  )r  r  rf  r   	vec_dtyper  rC  scalar_argsr   output_maskoctypeargidxr  res	load_argsload_fnr  s                   rX   re  )CppVecOverrides._scalarize.<locals>.inner  s   :XXFfl3333>DNN7#QI++I6E'-'7'76##V=Q=QDK!),F%.. 3 K
  +VF  ((,>> T"X& 
 #,T?KF!#~66"zz)z"yyI5556vhbAUAU@VV^_e^ffgh "e=	+dBSATTVW $**VF83+?@#**3/ $3 .vhb9M9M8Nm\ ";/!5k$6G5HOP[[]NN%5cU!#<= #%//// 3I 1&5'IG"5k$6G5H IIz$8"I$:6("UG8"T	9+R@A? @ NN4 K #]' s&   )DJ)=JA*J)
J&	"J))
J7rV   )r+  r  re  s    ` rX   
_scalarizeCppVecOverrides._scalarize  s    7	r rW   c                    [        [        5      n[        [        5      R                  5        HY  u  p#[	        U[
        5      (       d  M  X!;  d  M#  U R                  UR                  5      nX$l        [        X[        U5      5        M[     g r	  )
r  r  r  r  r   r  r$  r  r  r  )r+  vec_varsr   r  r  s        rX   _initialize_scalarize%CppVecOverrides._initialize_scalarize>  s^    ( .446LD&,//D4H~~foo6 $<#56	 7rW   rV   r  )Yr  r  r  r  r  r  r  r  r   rW  r  r  r  r  r  r  r  r  r  r  r  r0  r6  r9  r=  rA  rE  rI  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rz  r  r  r  r  r  r  r  r6  r  r*  r$  r'  r_  rb  re  r<  r2  r\  r9  r-  r!  rq  r  r  r  r  rt  rx  r  r~  r  r  r  r  r  rY  rl  r$  r(  r  rm  rn  s   @rX   r  r  I  sV   8[z                   ( (
         A A 	E 	E @ @ @ @ A A A A                                           0 0 9 9 9 9 F F A A   	! 	! # #               % % $ $ ! ! ! ! O O        H H   S S$   1 1 1 1   	] 	]    . 	 	 J JX  , 6" 6"p : :x 7 7rW   r  cppvecc                   $    \ rS rSr\S 5       rSrg)CppTile2DOverridesiL  c                     [        [        R                  [        5      (       d   e[        R                  R	                  U 5      n [
        R                  X5      $ r	  )r   r4   rf  CppTile2DKerneltransform_indexingr  r  )r  r   s     rX   r  CppTile2DOverrides.index_exprM  s>    !((O4444xx**40))$66rW   rV   N)r  r  r  r  r  r  r  rV   rW   rX   r,  r,  L  s    7 7rW   r,  c                     ^  \ rS rSr\r\rSrSr	U 4S jr
\\4S jrS rS0S\\   4S jjr\R&                  S	 5       r S1S
\R,                  4S jjrS
\R,                  S\4S jrS
\R,                  S\R4                  4S jrS
\R,                  S\R4                  4S jrS rS\R,                  S\R,                  S\S\4S jrS\S
\R,                  4S jr S0S jr!S\"\#\4   S\S\S\$RJ                  4S jr&S0S\\'   4S jjr(S  r)S! r*S" r+S# r,S$ r-S% r.\/S\4S& j5       r0S' r1\R&                  S( 5       r2S) r3S* r4S+ r5  S2S\S,\\   S-\\R4                     4S. jjr6S/r7U =r8$ )3	CppKerneliT  r   r   c                 x  > [         TU ]  U5        0 U l        / U l        S U l        / U l        / U l        S U l        [        5       U l	        / U l
        [        5       U l        [        5       U l        [        5       U l        [        5       U l        [        5       U l        SU l        [        5       U l        [        5       U l        ['        U R(                  U R*                  SS9U l        ['        U R(                  U R*                  SS9U l        [        5       U l        [        5       U l        X l        0 U l        / U l        g )NFtmp_acc)name_prefixwelford_helper)r3  r4  active_rangesinner_itervarscall_rangesrX  r  reduction_depthr=   reduction_prefixreduction_prefix_generatorsreduction_suffixparallel_reduction_prefixparallel_reduction_suffixlocal_reduction_initlocal_reduction_storesis_reductionnon_parallel_reduction_prefixnon_parallel_reduction_suffixr8   newvar_prefixsuffixreduction_csewelford_helper_csepreloads
poststoresnum_threadsreduction_omp_decreduction_var_names)r5  r  rK  r8  s      rX   r4  CppKernel.__init__Z  s    HJ 35=A(*,.# . 0 <>( . 0)7)9&)7)9&$2$4!&4&6#!-;-=*-;-=* !3!3T[[iX"%9I#
 '((*&=?.0 rW   c                 J   [         R                  R                  (       a,  U R                  (       d  U R                  R	                  S5        U S3n[         R                  R                  (       a  SO	[        5       nU S3n	U R                  R	                  U SU SU" X45       S35        U R                  R                  [        UUUUUU5      5        U R                  R	                  U	 SU S35        U R                  R                  SU S	3S
SU SU" X1XS9 S3S/5        g )Nz(int max_threads = omp_get_max_threads();_localmax_threadsz	_arr[tid]r   r   r   zfor (int tid = 0; tid < z; tid++)r   r   r   r   )r   r  dynamic_threadsr>  r   r*   r@  rQ  r   rA  r?  r   )
r5  r   r   r   r   reduction_combine_fnreduction_init_fn	acc_localrK  acc_local_in_arrays
             rX   _gen_parallel_reduction_buffers)CppKernel._gen_parallel_reduction_buffers  s6    ::%%d.L.L**44: e6N	#ZZ77M=Q=S 	 !$uI.!!++j)C(9.(P'QQRS	
 	&&--"!		
 	##--1C0DC	{RS.TU&&11*;-x@se33NI[mnnop		
rW   c                 \    U R                    H  n[        U R                  X S35        M     g )NrP  )rM  r   stores)r5  var_names     rX   %update_stores_with_parallel_reduction/CppKernel.update_stores_with_parallel_reduction  s'    00HT[[(j4GH 1rW   r   c                    Ub   e[        5       n[        R                  " 5        n[        U S5      (       aK  UR	                  U R
                  5        U R                  U5        UR                  UR                  5       5        UR	                  U R                  5        UR	                  U R                  5        UR	                  U R                  5        S S S 5        [        U S5      (       a  UR	                  U R                  5        U R                  (       a4  U R                   H$  nU R                  U   u  pE[        XU S3XE5      nM&     U$ ! , (       d  f       N= f)Ncodegen_inner_loops_tail)r7   r   r   r   rQ  rI  r`  r   r   loadsr  r[  rJ  r8  r7  r   )r5  r   r   r[  startends         rX   gen_bodyCppKernel.gen_body  s   ||~!!#ut233DMM*((.##DKKM2KK

#KK%KK$ $ 4.//KK(**!//4
1$uE]EW +  $#s   B.E
Ec              #   0  #    U R                   nU(       ac  [        R                  " X5      n[        U[        5      (       a8  UR
                  n[        U[        5      (       d   e[        R                  Ul	        Xl          Uv   X l         g! X l         f = f7f)z>Context manager to add an additional mask to loads and stores.N)

_load_maskr2   rI  r   r3   r  rG   r   rl   r   )r5  r  priors      rX   r  CppKernel.masked  sp      88D(D$))zz!$7777 #ZZ
	$J#OeOs   A=B B BBBr   c                 L    U R                   U   nXUU-  U-   0n[        X5      nU$ r	  )r  r0   )r5  r   scaleitervar_idxrD  r   r   r   s           rX   scale_index_with_offset!CppKernel.scale_index_with_offset  s3     mmK(%K&01u2	rW   r   c                 6    [        U R                  U5      5      $ )z
Convert an index expr to a string that can be used in cpp code.
e.g. a sympy expression "s2" may actually appear as "ks1" in the cpp kernel.
)rD   r  r5  r   s     rX   index_to_strCppKernel.index_to_str  s    
 T))%011rW   itervarc                 F   ^ ^ [        UU 4S jUR                   5       5      $ )zM
Check if an index has free symbol CppCSEVariable that depends on `itervar`.
c              3   D  >#    U  H  nUR                   TR                  R                  ;   d  M)  [        TR                  R                  UR                      [        5      (       d  Ma  TR                  R                  UR                      R                  T5      v   M     g 7fr	  )r   r  varname_mapr   rG   
depends_on)r  srt  r5  s     rX   r   6CppKernel.index_indirect_depends_on.<locals>.<genexpr>  su      
'vv--- = 488//7H =DHH  (33G<<'s   (B 4B '9B )rg   free_symbolsr5  r   rt  s   ` `rX   index_indirect_depends_on#CppKernel.index_indirect_depends_on  s%      
''
 
 	
rW   c                 N    X!R                   ;   =(       d    U R                  X5      $ r	  )r{  r}  r|  s      rX   index_depends_onCppKernel.index_depends_on  s(    ,,, 
0N0N1
 	
rW   c                 T    [        [        U R                  U R                  5      5      $ r	  )dictrR  r  rX  r;  s    rX   
var_rangesCppKernel.var_ranges  s    Ct{{344rW   r  rC  lowerupperc                 ,   U(       d  U(       d  g [        U[        R                  5      nU(       aJ  [        R                  " U[
        R                  5      R                  n[        R                  R                  nO[        R                  R                  n U R                  [        R                  l
        [        R                  " U[
        R                  5      R                  nU[        R                  l
        U R                  nU(       a.  [        R                  R                  U R                  U5      5      OS n	U R                  Xc(       a  SOS XR                  5      n
U R                   R#                  XzSS9  g ! U[        R                  l
        f = f)N0F)
assignment)r   r   TMPr2   r  r   r  r  r4   rf  r  rb  sexprr  indirect_assertrh  r  r  )r5  r  rC  r  r  indirectr  r   prior_computesize_strr   s              rX   check_boundsCppKernel.check_bounds  s    &tTXX6^^D%++6<<FXX%%F HH,,M1#':: ekk:@@#0 ZZFAF188>>$"6"6t"<=D##5CdHoo
 	&59 $1 s   AE< <Fr   c                 .   U R                   R                  U5      nU R                  U5      nU S[        U5       S3nU R                  R                  U R                  U[        R                  R                  U5      S9nUR                  SXU40 5        U$ )N[]rM  ry  )r  inputr  rE   r  r  rb  r4   graph	get_dtyper  )r5  r   r   r   r   r  s         rX   ry  CppKernel.load  s    iiood#$$U+aE*+1-""4::t177;L;LT;R"Sft5&92>rW   c                    SU;   d   eU R                   R                  U5      nU R                  U5      nUc  U S[        U5       SU S3nOUS:X  a  [        R
                  R                  (       d%  U R                  S:X  a  U S[        U5       SU S3nOS[        R                  R                  U5      nS[        U    S	U S
3nSU S[        U5       SU S3nO[        SU 35      eU R                  R                  [        X5      5        g )Nbufr  ] = r   
atomic_addr5   z] += zstatic_cast<r  r   zatomic_add(&z], r   store mode=)r  outputr  rE   r   r  rS  rK  r4   r  r  rH   NotImplementedErrorr[  r   r;   )r5  r   r   r  moder   r   r   s           rX   storeCppKernel.store  s   }}iit$$$U+<U!K./tE7!<D\!::--$2B2Ba2GaE 235qA))$/&|E':&;2eWAF%cU!K,>+?s5'L%D6&:;;l467rW   r   r   rtyper   c                 F   ^^^^^ SS[         [           4UUUUU4S jjjnU$ )NrC  c                 N   > U c  T ST ST" TT5       S3$ [        TTTTU T5      $ )Nr   r   r   )r   )rC  r   r   r   r   r  s    rX   re  .CppKernel._gen_reduction_prefix.<locals>.inner>  sH    |"1SEWUE-B,C1EE- rW   r	  )r   rn   )r5  r   r   r  r   r   re  s    ````` rX   _gen_reduction_prefixCppKernel._gen_reduction_prefix0  s    	 	 	 rW   c                 l    U R                    H$  nU R                  R                  U" U5      5        M&     g r	  )r<  r;  rQ  )r5  rC  gen_fns      rX   finalize_reduction_prefix#CppKernel.finalize_reduction_prefixM  s*    66F!!((6 7rW   c           
      V   US;   nX#U4nX`R                   R                  ;   a  U R                   R                  U   $ U R                   R                  U R                  SU 3SS9nU R                  R                  U 5        SU l        U(       a  UOUn[        X85      n	U R                  R                  U R                  XyX8[        5      5        U R                  c   eU R                  U R                     n
[        U R                  S-   [        U R                  5      5       H$  nXR                  U   -  U R                  U   -   n
M&     U R                   R#                  U S[%        X7XJ5       S35        U R'                  XyX85        [)        X75      nXR                   R                  U'   U$ )	Nrb   ra   
reduction FwriteTr5   r   r   )rG  reduction_cacher  rb  rM  r  rB  r   r<  r  r   r:  r  rR  r   rX  r[  r   r   rX  r   )r5  r   r   r   r  argmax_or_argminreduction_keyr   
init_dtyper   r   r   r   s                rX   	reductionCppKernel.reductionQ  s   )-AA!58..>>>%%55mDD  ))JJ*]O4E * 
 	  ''3%1 "2Y
%nA((//&&~>	

 ##///d223t++a/T]]1CDAKKN*T]]1-==E Ee3(eKLAN	
 	,,SNW">7<B**=9rW   c                     U R                  U5      nU R                  R                  U5      nU R                  R	                  [        X S[        U5       SU S35      5        g )Nr  r  r   )r  r  r  r=  r   r;   rE   )r5  r   r   r  r   s        rX   store_reductionCppKernel.store_reductionp  s[    $$U+iit$''aE(:';4waHI	
rW   c                    U R                   (       al  U R                   [        U5      [        U5      -   :X  d+   U R                    S[        U5       S[        U5       35       eU R                  [        U5      :X  d   eO[        U5      [        U5      -   U l         U R                    Vs/ s H  o0R	                  U5      PM     snU l        [        [        U R
                  5      5       Vs/ s H  n[        [        R                  U5      PM     snU l
        [        U5      U l        U R                  S U R                   U R                  U R                  S  4$ s  snf s  snf )Nr-  r  )r9  r   r:  r   r  rX  rR  r.   r   XBLOCKr  )r5  lengthsreduction_lengthsr  ns        rX   
set_rangesCppKernel.set_rangesw  s;   ##uW~>O8P'PP ##$Dw(8EBS<T;UVP ''3w<777$W~6G0HHD<@<L<LM<Lq//2<LMDK s4;;/00A /t{{A>0DM $'w<D MM0D001MM$..01
 	
 Ns   (E*$Ec                     U R                   c   e[        R                  R                  R	                  [        U R                   5      SS9$ )N    fallback)r9  r4   r  sizevars	size_hintr/   r;  s    rX   r  CppKernel.size_hint  sF    +++ww))$**+d * 
 	
rW   c                   ^^^^^^^^^^ [        U [        5      (       d   e[        5       mU R                  c   e[        UR                  [
        5      (       a+  UR                  R                  UR                  5       T5      mO U R                  UR                  5       T5      mUR                  S L=(       a#    UR                  TR                     R                  m[        R                  " 5        nTR                  (       a;  T(       a  TR                  5         OTR                  T5        UR!                  T5        O:TS:  a4  TR#                  5       (       a  UR%                  TR'                  5       5        S[(        4UUU4S jjmSS jm SS[(        S[*        4UUUUUU4S jjjmSS[(        S[*        4UU4S jjjm  SS[(        S[*        S[,        4UU4S	 jjjmUR%                  TR'                  5       5        [        UR                  [
        5      (       Ga/  [        [.        R0                  [2        5      (       Ga  [.        R0                  R4                  (       a  [.        R0                  R4                  nUR7                  5        H  n[9        UR;                  5       R<                   Vs/ s H  nU R?                  U5      PM     sn5      n[@        UR;                  5       RB                     n	S
U	 S[E        U5       S3n
URG                  5       nTRI                  SU	 SU SU
 S35        TRI                  U	 SU SU S35        M     T" U5        S S S 5        g s  snf ! , (       d  f       g = f)Nr5   
_loop_nestc                   >^  U U4S jnT R                  5       n[        U[        5      (       a  UR                   H  m T" T 5        M     g [        U[        5      (       d   eT R
                  b  U" 5       (       a  UR                  5         [        R                  " 5        nUR                  TR                  5       5        UR                  T5        S S S 5        g ! , (       d  f       g = f)Nc                     > TR                   (       d   eTR                   TR                     n U R                  =(       a    U R                  $ r	  )rL  r  rB  parallel)rootr  	par_depths    rX   is_parallel_reductionOCppKernel.codegen_loops_impl.<locals>.gen_kernel.<locals>.is_parallel_reduction  s=    %++++%++I,A,ABD,,>>rW   )
get_kernelr   rc  re  CppKernelProxyrL  r]  r   r   r   r   re  )r  r  rf  r   r   gen_loop_nestr  s   `   rX   
gen_kernel0CppKernel.codegen_loops_impl.<locals>.gen_kernel  s    ?
 $..0f&:;;&,ll
%j1 '3 &fn====!''38M8O8ODDF#--/5++DKKM:- 0//s    1C
C(c                     U(       a5  U R                   nU(       a  U R                  U-   nU$ U R                  U-   nU$ U R                  nU(       a  X@R                  -   nU$ X@R
                  -   nU$ r	  )r=  r?  rD  r;  r>  rC  )rf  r  	is_suffixrF  prefixs        rX   get_reduction_prefix_suffixACppKernel.codegen_loops_impl.<locals>.get_reduction_prefix_suffix  sy    #44F!'!A!AF!J "M "(!E!E!N!M#44F!'*J*J!J "M "(*N*N!N!MrW   depthc           	        > U R                  5       nU R                  (       d   eU R                  U   n[        R                  " 5        nUR                  (       aO  U(       dH  T	" X4R
                  SS9nU(       a  UR                  TR                  5       5        TR                  U5        T
(       aa  UR
                  (       aP  TR                  T5        UR                  (       a.  UR                  (       d   eTR                  UR                  5        T" X5        T
(       aM  UR
                  (       a<  UR                  (       a  TR                  UR                  5        TR                  5         UR                  (       a'  U(       d   TR                  T	" X4R
                  SS95        S S S 5        g ! , (       d  f       g = f)NF)r  T)r  rL  r   r   rB  r  r   r   rQ  r@  rA  close)r  r  in_reductionrf  loopstack_outerr;  r   gen_loop_atr  is_reduction_loopthreadsworksharings          rX   gen_loop_with_reduction=CppKernel.codegen_loops_impl.<locals>.gen_loop_with_reduction  s2    $..0!''''!''.))+{((+F"MMU,( ,'55dkkmD$45(T]]#,,W5!66#)#@#@@#@ KK(C(CD
2(T]]!88 KK(E(EF#))+((7 &+ ,++s   	EF//
F=c                 n  > [         R                  " 5        nU R                  (       d   eU R                  U   nUR                  5       nUc
   S S S 5        g TR	                  U5        UR                  TR                  5       5        T" XS-   UR                  5        S S S 5        g ! , (       d  f       g = fr   )r   r   rL  linesr   r   r   rB  )r  r  r   r  
loop_linesr   r  s        rX   r  1CppKernel.codegen_loops_impl.<locals>.gen_loop_at  s    ))+u%++++%++E2D!%J!) ,+ OOJ/''6!*ai9J9JK ,++s   7B&AB&&
B4r  c                 v   > U R                   b  U[        U R                   5      :X  a	  T" U 5        g T" XU5        g r	  )rL  r   )r  r  r  r  r  s      rX   r  3CppKernel.codegen_loops_impl.<locals>.gen_loop_nest  s5    
 ##+uJ<L<L8M/Mz*+J|LrW   zstd::make_unique<z []>(r   zstd::unique_ptr<z	 []> buf_r   r   z* z = buf_z.get();)FF)r   F)r   )%r   r  r*   r9  rf  rc  decide_parallel_depthmax_parallel_depthrL  r  rB  r   r   r  r  r  mark_parallelsingler   r   rQ  rn   rl   r4   local_buffer_contextrK   local_buffersvaluesr/   
get_layoutrC  r  rH   r   rD   get_namerQ  )r5  rS  r   r  r   r  local_buffersize_vallocal_buf_sizelocal_buf_dtypeallocatelocal_buffer_namer  r  r  r  r  r  r  r  s     ``        @@@@@@@@rX   codegen_loops_implCppKernel.codegen_loops_impl  s!   $////&(+++i&&(<==!((>>,,.I 22,,.I
 OO4' D	 5 56CC 	 !!#u''$%%'((1''	21%%''''6.x . .$"" DI$-0 B	L 	L 	L 	L %*M$MM #M M . 9++-ABBq557IJJ**88 !" 6 6 D D$1$8$8$:L%2 -9,C,C,E,J,J,J !00:,J&N '3<3J3J3L3R3R&SO!2?2C5~I^H__`aH(4(=(=(?%KK*?*;9EVDWWZ[cZddef KK*+2.?-@HYGZZab %;" )$} $#`a $#s    2F>M$0MBM$M$$
M2c                 R    [         R                  U 5      nU R                  X1U5        g r	  )rQ  buildr  )r5  r   r  rS  s       rX   codegen_loopsCppKernel.codegen_loops!	  s     NN4(		=rW   c                 D    [         R                  R                  (       a  gg)NAOTI_TORCH_CHECKTORCH_CHECK)r4   r  aot_moder;  s    rX   assert_functionCppKernel.assert_function%	  s    77% rW   c                    U R                   c   eU R                   UR                  UR                  UR                  -    nU R                  5       nSnSnU Hj  n[        R
                  R                  R                  USS9nUSU-  :  d  XR:X  a    O3XB-  [        R                  R                  :  a    OUS-  nXX-  nXH-  nMl     [        R                  R                  (       a  US:X  a  [        U5      S:  a  Sn[        XaR                  S9$ )Nr5   r   r  r  r   r  r  )r9  r  r  r  r4   r  r  r   r  min_chunk_sizerS  r   r  )	r5  r  r  rX  seqparr  r  hints	            rX   r  CppKernel.decide_parallel_depth,	  s    +++!!**"..1C1R1RR

 nnD77##--dT-BDa'k!S^~

 9 99QJEKCKC  ::%%%1*VqE .L.L
 	
rW   c              #     #    U R                   U R                  U R                  U R                  4n[	        5       U l         [	        5       U l        [	        5       U l        U R                  R                  5       U l        S v   U R                  R                  U R                   5        U R                  R                  U R                  5        U R                  R                  U R                  5        Uu  U l         U l        U l        U l        g 7fr	  )rb  r  r[  r  r=   cloner=  rQ  )r5  ri  s     rX   write_to_suffixCppKernel.write_to_suffixI	  s     T\\4;;A#%
%'$&88>>#$$TZZ0$$T\\2$$T[[1<A9T\4;s   D
Dc                     [        U0 UD6$ r	  )rG   )r5  r  r  s      rX   create_cse_varCppKernel.create_cse_varV	  s    t.v..rW   c                 "    S[         U    SU S3$ )Nzc10::convert<r  r   )rH   )r5  srcr   r   s       rX   r  CppKernel.get_to_dtype_exprY	  s    |E232cU!<<rW   c                 ^    U R                  X2U5      nU R                  R                  XQ5        g r	  )r  r  rS  )r5  dst	dst_dtyper  r   r  s         rX   r  CppKernel.cache_dtype_convert\	  s$    %%ci@TrW   r  r   c                   ^ ^
 Uc  SnT R                   (       d  g/ m
U
U 4S jnUb3  UT R                   ;   d   eT R                   U   u  pVU" XVU5      (       d  gO7T R                   R                  5        H  u  pxUu  pVU" XVU5      (       a  M    g   SR                  T
5      n	U	(       a  UR                  SU SU	 S35        gg)	NrS   Tc                 J  > X:X  a  gS n[        TR                  5       H  u  pEX%:X  d  M  Un  O   [        T5      [        :X  a"  U(       a  U S:X  a  UTR                  U   :X  a  SnTR                  U S[        U 5       35        TR                  U S[        U5       35        g)NFr   r5   rD  r   T)r   r  r  r2  rX  r  rE   )rc  rd  r   var_idr   _var
conditionsr5  s         rX   gen)CppKernel.codegen_conditions.<locals>.genl	  s    |F$T]]3;F 4
 T
i'QJ4;;v..T+e*<)=>?SS)9(:;<rW   Fr  zif(r  r   )r7  r  joinr   )r5  r   r  r   r$  rc  rd  r"  _rangejoined_conditionsr#  s   `         @rX   codegen_conditionsCppKernel.codegen_conditions`	  s     >F!!
	& ?$,,,,,++C0JEu3'' ( !% 2 2 8 8 :#
5t,,  !; #KK
3NNS*;)<B?@rW   )rh  r7  r9  r  r  r8  rB  r  rb  r@  rA  rC  rD  rK  r>  r?  rJ  rI  rX  rG  r:  rL  r;  r<  r=  rM  r[  rH  r	  )r5   r   NN)9r  r  r  r  r  r  rD   r  rE  rF  r4  r   r   rX  r]  r   r7   re  r   contextmanagerr  r   r  rn  rp   rr  r   r}  r  r  rl   r  ry  r  r	   r9   r   r   r  rn   r  r  r  r  r  r  r  propertyr  r  r  r  r  r  r)  r  rm  rn  s   @rX   r2  r2  T  s   IEMF$1X /('
RIXl3 ( $ $& BCZZ2%** 2 2	
uzz 	
ELL 	

ejj 
5<< 

5:jj: jj: 	:
 :@ UZZ 8$;#$  	
 {{:7hsm 7>

&
O%b> ! ! !
: 
B 
B/=  !%&*	.. . ell#	. .rW   r2  c                     ^  \ rS rSr\r S+U 4S jjrS\R                  S\R                  4S jr
S\R                  S\4S	 jrS\R                  S\4S
 jrS\R                  S\4S jr\R                  4S\R                  S\4S jjrS\S\R                  S\4S jr S+S\S\R                  S\R                  S\\   4S jjr   S,S\\   S\R                  S\R                  S\\   S\\\\4      S\S\\   4S jjrS\S\R                  4U 4S jjr S-S\\\4   S\S\R                  S\R                  S\4
S jjrS+S jrS rS r S\S\4S jr!S\S\R                  S\4S  jr"S! r#S" r$ S+S# jr%S$ r&SSS\RN                  4S\\R                     S%\\   S&\\R                     4S' jjr(S+U 4S( jjr)U 4S) jr*S*r+U =r,$ ).r  i	  Nc                    > [         TU ]  X5        [        R                  " 5       U l        U R                  (       d   eUS:  d   S5       eX0l        X@l        XPl        U(       a  XPl        g UU l        g )Nr   z0Expect pass in Non-Zero tiling_factor explicitly)	r3  r4  r   pick_vec_isavec_isar  r  r  	num_elems)r5  r  rK  r  r  r  r8  s         rX   r4  CppVecKernel.__init__	  s_     	+"//1|||q T"TT *$"&/]rW   r   rt  c                   ^  T R                  X5      (       a  g U 4S jUR                   5        H-  n[        U[        5      (       d   eUR                  (       d  M-    g    [        XT R                  5      nUR                  (       a  U$ S $ )Nc              3      >#    U  HI  n[        U[        R                  5      (       d  M$  TR                  R                  UR
                     v   MK     g 7fr	  r   r   r  r  rw  r   r  ry  r5  s     rX   r   5CppVecKernel._try_get_const_stride.<locals>.<genexpr>	  s;      
'a* )DHH  ('
   #A*A)r}  r{  r   rG   r   r
  r  r  )r5  r   rt  indirect_varr  s   `    rX   r   "CppVecKernel._try_get_const_stride	  s|    ))%99
''
L
 lN;;;;"""
 %UT5G5GH))v3t3rW   r   r   c                     [         R                  " U R                  UR                  -  S-  U R                  R                  5       -  5      nUS:  d   eU$ )N   r5   )mathr  r  itemsizer2  	bit_widthr5  r   num_vectorss      rX   r  CppVecKernel._get_num_vectors	  sO    ii/!3dll6L6L6NN
 arW   c                 p    U R                   UR                  -  S-  U R                  R                  5       -  $ )Nr>  )r  r@  r2  rA  )r5  r   s     rX   r  !CppVecKernel._get_raw_num_vectors	  s0     !!ENN2Q69O9O9QQQrW   c                 j    U R                  U5      nUS:X  a  S[        U    S3$ S[        U    SU S3$ )Nr5   r
  r   r  r  )r  rH   rB  s      rX   r  CppVecKernel._get_vec_type	  sJ    ++E2!),u*=)>a@@*<+>*?qQOOrW   c                 n    U[         R                  :X  a  gU R                  U5      nS[        U    SU S3$ )NrS   r  r  r   )r   rl   r  rH   rB  s      rX   r/  CppVecKernel._get_mask_type	  s<    EJJ++E2"<#6"7qQGGrW   r  c                     UR                   [        R                  :X  d   [        U5      5       eU R	                  U5      nU S[
        U    SU S3$ )Nz.template cast<r  r   )r   r   rl   reprr  rH   )r5  r  r   rC  s       rX   r  CppVecKernel._get_mask_cast	  sP    zzUZZ'3d3'++E2|E':&;1[MMMrW   r   	load_maskc                    [         U   nU R                  U5      nSnU(       aX  UR                  (       d&  U R                  [        R
                  5       SU S3nO!U R                  U[        R
                  5       nUS:w  a  U S[        U5       3OUnU[        R                  :X  a  U R                  5        SU S3n	U	$ U(       a  U SU SU SU S3O,U R                  U5       S	U S
[        U R                  5       S3n	U	$ )a  
Get a load line str that loads a vector from `var` at `index` of type `dtype`.
If `load_mask` is not None, we do a masked load accordingly.
Notes on the `dtype`:
1. We always load `self.tiling_factor` number of elements regardless of the `dtype`.
   It means we load half of the vector lanes for 16-bit data types and quarter of the
   vector lanes for 8-bit data types.
2. `torch.bool` and `torch.uint8` could mean masks and we load them as float mask vectors.
Nr  r   r   r  z.template loadu<r  r  ::loadu(r   )rH   r  r   r/  r   ro   r  rE   rl   r  r3  )
r5  r   r   r   rN  cpp_typerC  load_mask_strloadbufr   s
             rX   _get_vec_load_lineCppVecKernel._get_vec_load_line	  s      &++E2###'#6#6u{{#C"DGI;VW X#'#6#6y%++#N"O5:aZSE[/01SEJJ))+,GG9A>D  ! !/!1(1[MG9TUV**512(7)2kRVR`R`FaEbbcd 
 rW   r   store_value
accu_storec                 0	  ^ ^^^ U(       a
  Uc   S5       eU(       a	  U(       d   eTc  T R                   mS[        R                  S[        4U 4S jjmS[        R                  S[        4U 4S jjmS[        S[        4UUUU 4S jjn[        5       nUR                  S	5        UR                  5          T" U5      n	T" U5      n
S
[        U    SU
 S3nUR                  U5        U(       a   UR                  U S[        U	5       S35        [        T R                  T R                      S35      n0 nU 4S jUR                   5        H?  n[        U[        5      (       d   eUR                  (       d  M-  U" U5      nU SU S3X'   MA     T R!                  UT R                  US9nSnT R"                  b{  U(       a   S5       e[        T R"                  [        5      (       d   T R"                  5       eT R"                  R                  (       a  T R"                   SU S3nOT R"                   S3n[$        R&                  " 5       (       a  UR                  ST R(                   35        OUR                  ST R(                   35        UR                  SU S3U S[        T R*                  5       S3-   U S3-   5        UR                  5          [,        R.                  " 5        n[        U5      nU H$  n[0        R2                  " SU -   S-   X   U5      nM&     Ub  U SU S3OU nU(       a4  UR                  S U S35        UR5                  UR                  5       5        U(       a&  U(       a  S!OS"nUR                  U S#U S$U S%35        OUR                  S&U S'U S(35        SSS5        SSS5        U(       d(  T R7                  S)S*U5      nUR                  S+U S(35        SSS5        UR                  S,5        U(       a#  UR                  S(5        TR9                  U5        gT R:                  R=                  TXS-9n[        U[        5      (       d   eS.Ul        U$ ! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       N= f)/a  
Load or store a vector in a non-contiguous way. The vector is initialized from an array that is
filled in an inner loop over the tiling factor.
:param var: buffer to load from or store to, i.e. `var[transformed(index)]`. If None, we load the index
            as index expression, i.e. `transformed(index)`.
:param index: index into the `var` or the index expression by its own if `var` is None.
              The `index` could contain indirect indexing or the tiling itervar. When used in
              the inner loop, the index is transformed as follows:
              1. the index is linearized along the tiling dim.
              2. the indirect indexing vector variables are transformed into arrays over the tiling dim.
:param dtype: data type of `var` or `index` if `var` is None.
:param buffer: the code buffer to write the generated code to. If None, we write to `self.loads`.
:param store_value: the value to store. If None, we load the vector.
:param accu_store: whether accumulate the store_value to store_ptr. If True, a store_value should be provided
:return: a CppCSEVariable that represents the loaded vector or None if it is a store.
Nzstore var must be providedr   r   c                 t   > U R                   S:  a  TR                  SU R                   -  -  $ TR                  $ N   )r@  r3  r   r5  s    rX   get_result_sizeCCppVecKernel._load_or_store_non_contiguous.<locals>.get_result_size
  s1    ~~!~~enn)<==~~%rW   c                 t   > U R                   S:  a  TR                  SU R                   -  -  $ TR                  $ rZ  )r@  r  r\  s    rX   get_tiling_sizeCCppVecKernel._load_or_store_non_contiguous.<locals>.get_tiling_size
  s5    ~~!))Q%..-@AA)))rW   vec_varc                 n  > U R                   (       d   e[        5       nUR                  S5        UR                  5          U R                  nUc   eU[
        R                  :X  a  [
        R                  nT" U5      nT	" U5      nUR                  S[        U    SU S35        U  S[        U5       S3nUR                  U5        UR                  S5        S S S 5        UR                  S5        T
R                  R                  TU5      n[        U[        5      (       d   eU$ ! , (       d  f       NT= f)	Nr  r  r   r  r  r   zreturn tmpbuf;r  )r   r7   r   r   r   r   rl   ro   rH   rE   r  r  r   rG   )rb  r   r  result_sizetiling_sizer   r  r   r]  r`  r5  s          rX   vec_to_array@CppVecKernel._load_or_store_non_contiguous.<locals>.vec_to_array!
  s   >>!>>DNN5!#MM	 ,,,

* %I-i8-i8.|I/F.Gr+V_` ""8[9Q8RRTUt$/0  NN4 XX&&vt4Ffn5555M! s    BD&&
D4r  r  r   r  r  r   rN  c              3      >#    U  HI  n[        U[        R                  5      (       d  M$  TR                  R                  UR
                     v   MK     g 7fr	  r7  r8  s     rX   r   =CppVecKernel._load_or_store_non_contiguous.<locals>.<genexpr>H
  s;      !+A!!TXX. -$$QVV,+r:  r  r  rm  rD  zunexpected store with load maskz.is_masked(r   z != 0z#pragma GCC unroll z#pragma unroll 
for (long  = 0; r   r   r   r   r  +==r   z tmpbuf[r   ztmpbuf[r  r   ztmpbuf.data()r   r  r  rM  T)rb  r   r   rn   rG   r7   r   r   rH   rE   r-   r  r  r{  r   r   rn  rh  r   is_gccr  r3  r   r   r   r   r   rT  rQ  r  r  )r5  r   r   r   r   rV  rW  rf  r   rd  re  result_declareitervar_innerreplacementsr;  	array_varrN  r   index_crhsr   	load_liner  r]  r`  s   `   `                  @@rX   r  *CppVecKernel._load_or_store_non_contiguous	  s8   2 #/O3OO1;>ZZF	&5;; 	&3 	&	*5;; 	*3 	*	. 	^ 	 	, ~u[[])%0K)%0K*<+>*?r+iX  NN>*"m#9+k:R9SSUV /==12&9M L!++!
 ",????&&& ,\ :I4=;aa1PL.! 004??= 1 E I*&I(II!$//>BBSDOOSB??))#'??"3;}oQ OI#'??"35 9I!!##!4T5G5G4HIJ1C1C0DEFNN]O62"O3{4>>'B&C2FG"O3'(
 
 4 4 6%%e,$0L ff<.1E9$2G %1 .1_Qwiq)WINNT)A#67''6*4$#KNNcU!K=r#RSNNW]O4uA#FG! !7"  33OQN	156 @ 	tNN3MM$XX&&vt&AFfn5555 FMM; !7 6Y ]sE   3B?R6ERQ6%CQ%)Q617R%
Q3/Q66
R	 R
Rr   c                 4  > U R                   R                  U5      nU R                  U5      n[        R                  R                  U5      nU R                  U R                     nU R                  X%5      nUS:X  a  [        T	U ])  X5      $ US:X  aA  U R                  X2X@R                  5      nU R                  R                  U R                  XtS9nOU R!                  X2U5      n[#        U[$        5      (       d   eUR'                  SXU40 5        SUl        U$ )Nr   r5   rM  ry  T)r  r  r  r4   r  r  r  r  r   r3  ry  rT  rh  r  r  rb  r  r   rG   r  r   )
r5  r   r   r   r   r  r  r   r  r8  s
            rX   ry  CppVecKernel.load
  s    iiood#$$U+!!$']]4??3
++E>Q;7<,,q[**3uooNDXX&&tzz4&EF77EJF&.1111ft5&92>rW   r  c           	         [        U[        5      (       d-  [        U[        5      (       a  UR                  (       d   U5       eU R                  U R
                     nU S[        U5       3nU R                  X65      n[        5       n	US:X  a  U(       an  U[        R                  :X  a$  U R                  c  U R                  U5       SU S3O,U R                  U5       SU S[        U R                  5       S3n
SU SU
 S3nU[        R                  :X  a&  U R                  c  U	R                  U SU S35        U	$ U	R                  U SU S[        U R                  5       S35         U	$ U R                  X#XIXS	9  U	$ )
a  
Get a store line buffer that stores `value` into `var` at `index` of `dtype`. It handles
both contiguous and non-contiguous store cases.
:param value: Vectorized type templaterized on `dtype`.
:param var: buffer to store into.
:index: index into the `var`.
r  r5   rP  r   r   r  .store(r   )r   rV  rW  )r   rp   rG   r   r  r  rE   r   r=   r   ro   r  r  r3  r   r  )r5  r  r   r   r   rW  r  var_exprr  r   ry  s              rX   _get_store_lineCppVecKernel._get_store_line
  s   " %%%un--%,,		 
 ]]4??3
U#k%012++E>Q; +0F ))%01(1E ..u56hxj;W[WeWeKfJgghi 
 E7#dV1-#(>%z<=  gWXJbT^^1L0MRP  ..EE /  rW   c                 R  ^ ST;   d   e[        U[        5      (       d   U5       eUR                  (       d  U R                  U5      nU R                  R                  T5      nU R                  U5      n[        R                  R                  T5      nUcA  U R                  X5X&5      nU R                  R                  UR                  U4S j5      5        g US:X  Ga=  [        R                  R                   (       dS  U R"                  S:X  aC  U R                  U UUUSS9nU R                  R                  UR                  U4S j5      5        g U R%                  U5      nU R%                  [&        R(                  5      n	[*        U   n
[,        R.                  " U[&        R(                  5      R0                  n[        U[        5      (       a  UR                  (       d   eSU
 S	U	 S	U S
U S	U S	U S3nU R                  R3                  [5        TU5      5        g [7        SU 35      e)Nr  c                    > [        TU 5      $ r	  r;   r  r   s    rX   <lambda>$CppVecKernel.store.<locals>.<lambda>
  s    ,tQ2GrW   r  r5   T)rW  c                    > [        TU 5      $ r	  r  r  s    rX   r  r  
  s    l46KrW   zatomic_add_vec<r   r  r   r  )r   rG   r   r  r  r  r  r4   r  r  r}  r[  rQ  mapr   r  rS  rK  r  r   r  rH   r2   r  r  r   r;   r  )r5  r   r   r  r  r   r   r   n_srcn_idxr   r   s    `          rX   r  CppVecKernel.store
  s   }}%007%70||NN5)Eiit$$$U+!!$'<''EADKKtxx(GHI\!::--$2B2Ba2G++g# ,  ""488,K#LM--e4--ekk:%e,uekk:@@!%88U\\II(5'E7"SEE7RTUZT[[]^%%l4&>?%D6&:;;rW   c                    U[         ;   d   eUS;   nU R                  U R                  :  nU(       a  UOUn[        U[        5      (       d   U5       eUR
                  (       d  U R                  U5      nX#U4nXR                  R                  ;   a  U R                  R                  U   $ Sn	U	 S[        U    S3n
[        X75      nU R                  X75      nU R                  R                  U R                  SU 3SS9n[        U[        5      (       d   eU S3nS	U 3nU =R                  U X/-  sl        S
U l        U R                   R#                  U R%                  XX7[&        5      5        U R                   R#                  U R%                  UUUUU R(                  5      5        US:X  Gak  U R                   R#                  U R%                  UUUUU R(                  5      5        U R                  c   e[*        R,                  " [.        R0                  U R2                  U R                  S  5      nU R4                  R                  U R6                  SU 3SS9nS	U 3n[9        U R2                  U R                     U R:                  5      (       am  U R                  U R                  :  aQ  [9        UU R2                  U R                     5      [9        U R2                  U R                     U R:                  5      -  OUO[<        R>                  " S5      nU R2                  U R                     U R:                  -  (       a?  U R                  U R                  :  a#  [9        UU R2                  U R                     5      OUO[<        R>                  " S5      nU RA                  UUUU5        U RA                  UUUU5        U RB                  (       a  UOUnU RB                  (       a  UOUnU RD                  RG                  U SU RI                  UUUU5       S35        OU R                  c   eU RJ                  U R                     n[M        U R                  S-   [O        U RJ                  5      5       H%  nUU R2                  U   -  U RJ                  U   -   nM'     UUUUS.nU RD                  RG                  U SU RH                  " X>40 UD6 S35        U RQ                  UUUUU RH                  U R(                  S9  U RQ                  UUUU[R        [&        S9  US:X  a(  U RQ                  UUUUU RH                  U R(                  S9  U[T        RV                  :H  nU(       GaE  [Y        U5      (       aT  U R[                  U5      S;   d   S5       eSU S3nSU S3nU R\                  RG                  U S[S        X=U5       S35        OU(       a	  U SU S3nOU(       a  US;   a  SU S3nOUS:X  d   eU S3nOSU RI                  USS5      -   S-   nU[T        RV                  :H  nU(       a  [T        R^                  OUnS [        U    S3n
S![        U    S"U R[                  U5       S3nU S#U
 S$U
 S%U S"U S3
nU R\                  RG                  U S[S        X=UUS&9 S35        Un OCUn [Y        U5      (       a1  S	U  3n!U R\                  RG                  U  S[S        UU U!5       S35        [a        UU 5      n"U"U R                  R                  U'   U"$ )'Nr  zat::vecz::Vectorized<r   r  Fr  _vecmasked_Trh   r   r   r   r5   )r   r   horizontal_reductionr   )rT  rU  )r5   r   z4Welford reduction does not support VectorizedN (N>2)zwelford_vec_reduce_all(r   z_vec_reduce_all()rg   rd   r`   r  z.all_zero()r_   z.all_masked()z	{ return r  r1  z; }r
  zat::vec::vec_reduce_all<r   z([](z& x, z& y) rR  )1VECTORIZABLE_RTYPESr  r:  r   rG   r   r  rG  r  rH   r   reduction_acc_type_vecr  rb  rM  rB  r<  r  r  r   reduction_init_vecrT  rU  rV  rW  rX  rH  r  r   r  r   rY  _use_welford_helperr  r[  r   reduction_combine_vecr  rR  r   rX  r   r   rl   r)   r  r=  ro   r   )#r5  r   r   r   r  r  r  r  r  vec_nsvecr   acc_type_vecr   acc_vecmasked_acc_vecreduction_sizewelford_helper_valmasked_welford_helper_valwelford_helper_vec_rangemasked_welford_helper_vec_rangeacc_vec_welford_helper_val_r   r   r  r   r   masked_next_valuereduce_all_bodyr  vec_reduce_all_functmpvarmasked_tmpvarr   s#                                      rX   r  CppVecKernel.reduction
  s    !4444)-AA#$2F2FF"2Y
%007%70||NN5)E!58..>>>%%55mDDl5&9%:!<%nA22>N  ))JJ*]O4E * 
 #~....E,"7),  uw$GG  ((//&&~>	

 	((//&&''	
 --,,33**" "++ ''333&--dkk$*>*>*@AN "&!8!8!A!A
=/:% "B " +22D1E(F% DKK8$:L:LMM $*>*>> ^T[[-IJt{{4??;T=O=OPQ ( ]]1% %  ;;t/$2D2DD $*>*>> ^T[[-IJ' ]]1% , $$+-Eu $$)/	 *.~WH-1^^)AS   KK!!*C : :>8UZ\o pqqrs ''333MM$"6"67E4//!3S5GHA.q1AA I $(<&	F KK!!)3t99.\U[\]]^_ 	,,!%!;!;"55 	- 	
 	,,!2, 	- 	
 --00%)%?%?"&"9"9 1  5::%#N33,,U3 8  J JJ   7wiqA
&=n=MQ$O!%%//e30FWXYYZ[ " .//?yJ
! & 
 $%WI[!9J)U222$+9M!:J  00cJK  
  5::-+2EKK	,\)-D,EQG(@iAX@YY[\`\q\qr{\|[}}~&# 34DU3%u_L]]_`g_hhij
!!++%s,^*Xabccde FF#N33")& 2%%//hc"3NFM"Z![[\] #>6:<B**=9rW   c                 V  ^ U R                  U5      nU R                  R                  T5      n[        R                  R                  T5      nUR                  (       a&  U[        R                  :X  a  UO[        R                  O[        R                  n[        R                  R                  U5      n[        R                  R                  U5      n[        5       n	U R                  U R                  :  a.  U	R!                  U S[#        U5       S[$        U    SU S35        OXV:w  a  [$        U   R'                  SS5       SU 3n
U[        R(                  :X  a&  U SU R                  [        R(                  5       S3nO@Xs=:X  a  S	:X  a  O  OS
[$        U    SU S3nO S
[$        U    SU S[$        U    SU SU S3nU	R!                  SU
 SU S35        U
nU	R+                  U R-                  X4X%5      5        U R.                  R+                  U	R1                  U4S j5      5        g )Nr  z] = static_cast<r  r   z::r   z.template cast<bool,r   r5   at::vec::convert<r   r  r   r   r   c                    > [        TU 5      $ r	  r  r  s    rX   r  .CppVecKernel.store_reduction.<locals>.<lambda>  s    T18MrW   )r  r  r  r4   r  r  is_floating_pointr   rk   ro   r  rf  r  r=   r  r:  r   rE   rH   r  rl   rQ  r}  r=  r  )r5  r   r   r  r   	out_dtyper   out_num_vectorssrc_num_vectorsr   converted_valueconverts    `          rX   r  CppVecKernel.store_reduction  s   $$U+iit$GG%%d+	 ** $u||3Y 	
 ((33I>((33E:??d222NN%qU+,,<\)=T<UUWX]W^^`a
 !#I.66tSAB!E7K   

*!&';D<Q<QRWR\R\<];^^abG&>Q>/Y0G/H5'QRS  
 0Y0G/H./qe1D0EQFWWYZ_Y``ac   &7s7)1EF'KK,,UJK$$TXX.M%NOrW   
scalar_varc                 .   UR                   (       a   eUR                  [        R                  :X  aE  U R                  R                  U R                  U R                  5        SUR                   S35      nO^UR                  c   eU R                  R                  U R                  U R                  UR                  5       SUR                   S35      n[        U[        5      (       d   eUR                  Ul        UR                  Ul        SUl         U$ )Nr  r   r  T)r   r   r   rl   r  r  r  r/  r   r  r   rG   dependent_itervars)r5  r  rb  s      rX   r  CppVecKernel.broadcast  s    $$$$uzz)hh''!4!4!6 7wz>OqQG ##///hh''%%j&6&678*//9J!LG '>2222"((%/%B%B"rW   r  c           	      8   UR                   (       a   eUR                  c   eU R                  R                  U R                  U R                  UR                  5       SU SU S35      n[        U[        5      (       d   eUR                  Ul        SUl         U$ )Nz	::arange(r   r   T)r   r   r  r  r  r  r   rG   )r5  r   r  r  s       rX   r  CppVecKernel.arange  s    <<{{&&&""LL!!%++./yr&K
 &.1111{{rW   c                    [         U   nU R                  U5      n[        U5      (       a  SU S3$ US;   aa  [        U   nU R	                  X5      nUS:X  a  [        U5      (       a  SU S3OSU S3nO[        U5      (       a  SU S3OSU S	3nU S
U S3$ US:X  a  U R                  5        S3$ [        X5      nU S
U S3n	U[        R                  :X  a  US;   d   eU R                  5        SU S3$ U	$ )Nr   r   rz   ra   r}   r|   r   r{   r~   r  r   rg   z	::from(0))r_   r`   rd   r  )
r<   r  r)   rH   r  r   r/  r   r   rl   )
r5  r   r   r   vec_typer   r   r  scalar_initvec_inits
             rX   r  CppVecKernel.reduction_init_vec  sF   07%%k2//hZs++11!+.F22>IH) &e,, +6(-@/xx@  &e,, ,F8=A/xx@ 
 ZqQ''U"))+,I66$^;ZqQ/EJJ!%::::))+,GK=BBrW   c                    [         U   nU R                  U5      n[        U5      (       a  SU S3$ US;   ax  U R                  U5      nU R                  [        R
                  5      nU[        R                  :X  a!  S[        [        R                      SU SU S3$ S[        U    SU SU S3$ U[        R                  :X  a  US;   d   eU R                  5        $ U$ )Nr   r   rz   zIndexValueVec<r   )r_   r`   rg   rd   )
r<   r  r)   r  r   r  rl   rH   ro   r/  )r5  r   r   r   r  r  r  s          rX   r  #CppVecKernel.reduction_acc_type_vec  s    07%%k2//hZq))11))+6E))%++6E

"'U[[(A'B"UG2eWTUVV#L$=#>br%PQRREJJ!%AAAA))+,-rW   c           	          U(       a  [        X$5      OUn[        U5      nSn[        XW5      nSU R                  U5       SU SU SU S3	n	[        U[        R
                  5      (       a  US::  a  SU	 3$ U	$ )	Ni   zWelfordHelper<r   r  r  r   r5   zstatic )r   rE   r  r   r   rY  )
r5  r  r  r   rK  vec_num_range_threadvec_num_range_thread_expr
chunk_size
num_chunkswelford_helper_init_lines
             rX   _welford_helper_init!CppVecKernel._welford_helper_init!  s    
  ,:) 	
 %00D$E!
1>
T//67r*RHZG[() 	! j%--00Z1_ 5677++rW   c                    [         R                  R                  (       a  SO	[        5       nU R                  R                  U R                  X#U5      5        U R                  R                  U R                  X#XE5      5        U R                  R                  U SU SU S35        U R                  R                  U SU SU S35        g )NrQ  z = welford_combine(rN  r   z_local = welford_combine(z	_local, &)
r   r  rS  r*   rC  r   r  r@  rD  rA  )r5  r  r  r  r   rK  s         rX   r   CppVecKernel._use_welford_helper9  s     $ZZ77M=Q=S 	 	**44%%"e	

 	!!++%%"e	

 	**44i*7)37I6J"M	
 	##--i0	CUBVVXY	
rW   r  r   c                    U[         R                  :H  nUS:X  aG  U R                  (       a  SU SU S[        U R                  5       S3$ U(       a  U SU 3$ SU SU S3$ US:X  aG  U R                  (       a  SU SU S[        U R                  5       S3$ U(       a  U S	U 3$ S
U SU S3$ US:X  aE  U R                  (       a  SU SU S[        U R                  5       S3$ U(       a  SOSn	U SU	 SU 3$ US:X  a7  U R                  (       a  SU SU S[        U R                  5       S3$ U SU 3$ US:X  a7  U R                  (       a  SU SU S[        U R                  5       S3$ U SU 3$ US:X  a  U(       a?  U R                  (       a"  SU SU S[        U R                  5       SU S3	$ SU SU SU S3$ U R                  (       a  SU SU S[        U R                  5       S3$ SU SU S3$ US:X  an  [	        U[
        5      (       a  Uu  pnO[        X5      u  pnU R                  (       a%  SU SU
 SU SU S[        U R                  5       S3$ SU SU
 SU SU S3	$ US;   a  Uc   e[        U   nU[         R                  :X  a  [        [         R                     nU R                  U5      nU R                  [         R                  5      nSnSnUb&  Uc   eS[        U5      R                  5        3nSU 3nU R                  (       a.  U SU SU SU U S U SU U S[        U R                  5       S3$ U SU SU SU U S U SU U S3$ US!:X  ac  [	        U[        5      (       aG  UR                  [         R                  :X  d   e[        [         R"                  R$                  U45      u  nU SU 3$ [&        e)"Nr`   zmax_masked_reduce(r   r   r  r  r_   zmin_masked_reduce(r  r  rd   zsum_masked_reduce(r   r[   r   re   zprod_masked_reduce(r   rf   zxor_sum_masked_reduce(r   rh   r   rN  ri   r   z}, r   rz   rS   z_combine_vec<r  rg   )r   rl   r  rE   r   r   r   rH   ro   r  r  rp   r  rG   r   rO   r4   rf  r  r  )r5  r   r   r   r  r   r  r   r   r   r   r   r   r   r  r  t_extra	arg_extras                     rX   r  "CppVecKernel.reduction_combine_vecP  s    uzz)U"~~+C5:,bT^^A\@]]^__  e3zl+ -SEJ<qA
 u$~~+C5:,bT^^A\@]]^__  e3zl+ -SEJ<qA
 u$~~+C5:,bT^^A\@]]^__%,c#a}Aj\::v%~~,SEJ<r+dnnB]A^^_``c*..y(~~/uBzl"[QUQ_Q_E`Daabccc*..//!>>-cU"ZL;t~~C^B__bcubvvwxx +3%r*SAS@TTUV >>-cU"ZL;t~~C^B__`aa-cU"ZLBB00*e,,#- & $5^#P &~~)#d4&2$b[Y]YgYgMhLiijkk)#d4&2$bLL33(((!),FEJJ&%ekk2)))4E))%++6EGI +777s#78>>@AB L	~~%&mF82eWBuggY WuBzl9+RDNN8S7TTUW
 ))vhbr%QXPYY[\_[``bcmbnoxnyyz{{u$*n55!''5::555 4QXX5E5E
} UU#j\**%%rW   c           	         > [        U[        5      (       d   eUR                  c   eUR                  (       d<  [        U[        5      (       a  UR                  (       a  SU S3n[        T	U ]  XX45      $ UnUnU(       a!  U R                  UR                  5       SU S3nU(       a!  U R                  UR                  5       SU S3nU(       a!  U(       a  SU SU SU SU S3	nU SU SU 3nO-U(       a  U SU 3nU SU 3nOU(       d   eU SU 3nU SU 3nU R                  UR                  5       SU S3nU(       a;  UR                  (       d!  U R                  UR                  5       SU S3nSU SU S3nU R                  (       aS  U R                  UR                  5       SU R                  UR                  5       S	U S
[        U R                  5       S3nSU S3nU R                   SU SU S3$ )Nr  z).all_masked()r   r@  z) & (r   z) | ~(z::set(z::from(1), (r   z, "index out of bounds: z"))r   rG   r   r   r3  r  r  r/  r  rE   r  )
r5  r   r  r  r  lower_scalarupper_scalarcond
cond_printr8  s
            rX   r  CppVecKernel.indirect_assert  s&   #~....yy$$$zz$//DKK4&/7*3uCC))#))45QugQ?E))#))45QugQ?EUugT#eC5E7!<D(>cU#l^DJWD&D(>cU3JL5U#eW%D5L>2J%%cii014&:;;--cii894&BtfF4&*D>>&&syy12&9L9LSYY9W8X YV3{4>>:;1>  4&'&&'q.FzlRTUUrW   c                 F  > [        U[        5      (       d   eUR                  (       d  [        T	U ]  XU5      $ [
        U   nU R                  U5      n[
        U   nU R                  U5      nSU S3nU[        R                  :w  a3  U[        R                  :X  a  U R                  U5       SU SU SU S3nU$ U[        R                  :X  a!  U[        R                  :w  a  U SU SU S3nU$ X2:w  a,  XWs=:X  a  S:X  a  O  OS	U SU S3nU$ S	U SU SU SU SU S3nU$ )
Nr  r   z::from<r  r  z.to<r   r5   r  )
r   rG   r   r3  r  rH   r  r   rl   r/  )
r5  r  r   r   src_cpp_typer  dst_cpp_typedst_num_vectorsr  r8  s
            rX   r  CppVecKernel.get_to_dtype_expr  sY   #~....zz7,SCC#I.//	:#E*//63%qz

"u

':)))45W\N!OK\\^_b^ccdeD  %**$%**)<U$|nAo->cBD  6Q6*<.3%qA  +<./9J!L>YZ[jZkkmnqmrrstrW   )rB  r3  r  r  r  r2  r	  )NNF)F)-r  r  r  r  r  r  r4  r   r  r   r   r   r   rn   r  ro   r  rp   r  r/  rG   r  r   rT  r=   r	   rl   r  ry  r}  r  r  r  r  r  r  r  r  r  r   r  r  r  r  rm  rn  s   @rX   r  r  	  s   I C"45:: 4 4ekk c R%++ R% R
P5;; P3 P 38++ HEKK H# HN> N%++ N# N /3## zz# {{	#
 N+#T ,0<@ Lc]L zzL {{	L
 (L eC$789L L 
.	!L\ UZZ 4 !*S.()* * zz	*
 {{* *X<BBH&PPN ~ $
N 
ELL 
^ 
 D" PT,0
8  (,/3+0==_& %_& 'tn_& EKK(_&B#VJ rW   r  c                      ^  \ rS rSrSr\r  SU 4S jjrS rS r	 SS jr
S\S\R                  4U 4S	 jjrSU 4S
 jjrS rU 4S jrS\R                  S\R                  4S jrSrU =r$ )r.  i  a  
A vector kernel that handles the 2d tiles with the tile size defined in `tiling_factor` on
the inner-most loop level and one of the outer loop level (`outer_tiling_idx`). When the data
tile is accessed in a contiguous way from the outer loop axis, a transposition is applied on the
tile to make the access contiguous from the inner-most loop axis. Then, the same vectorization
logic from its parent `CppVecKernel` is leveraged for load/store/compute. The transposed tile load
and store are generated into kernel.preloads and kernel.poststores buffers.

The loop structure looks like below:
for ...
  for i_outer ...
    for ...
      for inner_most ...
        // generated by CppTile2DKernel
        float tmp0[16*16]; at::vec::transpose_mxn<...>(tmp0, in_ptr0 + ..., ...); // into kernel.preloads
        float tmp1[16*16]; // into kernel.preloads
        for i_inner ... { // the kernel inner loop
          vectorized loads/compute/stores (e.g., load tmp0, store tmp1) // into kernel.loads/compute/stores
        }
        at::vec::transpose_mxn(out_ptr0 + ..., tmp1, ...) // into kernel.poststores
      for inner_most ... (tail)
        // generated by CppVecKernel
        ...
  for i_outer ... (tail)
    for ...
      for ...
        // generated by CppKernel
        ...
c                    > [         TU ]  UUUUS   U5        X@l        XPl        X`l        U(       a  UOUU l        U(       a  UOUU l        SU l        g )Nr5   T)r3  r4  tiling_indicesinner_tail_sizeouter_tail_sizeinner_num_elemsouter_num_elemsinner_is_tiling_idx)r5  r  rK  r  r  r  r  r8  s          rX   r4  CppTile2DKernel.__init__  sZ     	1	
 -..2A}2A}#' rW   c                 L    [        U R                  U R                      S35      $ )NrN  )r-   r  	outer_idxr;  s    rX   inner_itervarCppTile2DKernel.inner_itervar#  s"    !T]]4>>%B$C6"JKKrW   c                    U R                   U R                     nU R                   U R                     n[        XU R                  5      n[        XU R                  5      nU R
                  S L =(       aW    US:H  =(       aK    UR                  U5      =(       a3    UR                  U5      (       + =(       a    UR                  U5      (       + $ r   )r  r  r  r
  r  rh  r   )r5  r   	outer_var	inner_varouter_strideinner_strides         rX   need_vec_transpose"CppTile2DKernel.need_vec_transpose&  s    MM$..1	MM$//2	*5T=O=OP*5T=O=OPOOt# 0!0		)$0 !$$Y//0 !$$Y//	
rW   c                    [         R                  R                  U5      nU R                  nU S[	        U5       3nSn	[	        [        X0R                  U R                     U R                  5      5       n
[	        U R                  5       nU(       a  XpXpSnU R                  U-  (       a  U R                  U R                  pOU R                  U R                  pU(       a  US:X  a  SOSn[        U[        R                  5      (       a  UR                  (       a0  [        U[        R                  5      (       aF  UR                  (       d5  S[         U    SU S	U S
U
 S
U	 S
U S
[	        U5       S
[	        U5       S3nO4S[         U    S[	        U5       S[	        U5       SU S	U S
U
 S
U	 S
U S3nU(       a  U R"                  R%                  5       nOcU R"                  R'                  U5      (       d&  U R"                  R)                  U R*                  USS9nOSnU R"                  R-                  U5      nU(       a>  [         U   nSU SU S3nU SU SU SU SU S3
nU R*                  R/                  U5        UR1                  S[3        U5      5      nU(       a'  U R4                  R/                  [7        UU5      5        U$ U R*                  R/                  U5        U$ )Nr  __place_holder__Tr  truefalseztranspose_mxn<r  r  r   r   Fr  zalignas(std::max(std::size_t(z), alignof(z)))r   r  r\   r   )r4   r  r  r  rE   r
  r  r  r3  r  r  r  r   r   r  r  rH   r  rO  containsr  rI  getr   r  rp   rJ  r;   )r5  r   r   r   is_store
store_moder   factorr  r  ld_srcld_dstneed_defineMNr  load_or_storetile_var	cpp_dtypealignasdefine_lines                        rX   gen_transposed_tile_load_store.CppTile2DKernel.gen_transposed_tile_load_store3  s    !!$'##SU+,-  3E==;Y[_[m[m nop/0#F##h.'')=)=q $$$$  !)jL.HVPW
q%**%%akkq%**%%akk !e!4 5Qzl C56("SEF82k!n5ERTUGWWY[  !e!4 5Q{1~6FaTUGWWXYcXd e56("SEF827  xx(H""=11xx((U(SHKxx||M2H$U+I 6fX[SVWG$IQyk8*AfXQvhbQKMM##K0%--.@#h-POO%%l4&GH  MM##M2rW   r   r   c                 X  > U R                   R                  U5      nU R                  U5      nU R                  5       nU R	                  U5      (       a  U R                  XUSS9nU S[        X@R                  -  5       3n[        R                  R                  U5      nU R                  USU5      nU R                  R                  U R                  XS9n	U	R                  SXU40 5        [!        U	["        5      (       d   eSU	l        U	$ U R'                  U5      n
[(        TU ]U  X5      $ )NF)r  r  r   rM  ry  T)r  r  r  r  r  r  rE   r3  r4   r  r  rT  r  r  rb  r  r   rG   r   r/  r3  ry  )r5  r   r   r   re  r  rS  r   r   r  r   r8  s              rX   ry  CppTile2DKernel.loado  s   iiood#$$U+""$""5))::55 ; H "
#k%..2H&I%JKGGG%%d+E**7Au=DXX&&tzz4&EF!!&4u*=rBfn5555 FMM//6I7<00rW   c                   > SU;   d   e[        U[        5      (       d   U5       eUR                  (       d  U R                  U5      nU R                  R                  U5      nU R                  5       nU R                  U5      nU R                  U5      (       a  U R                  XUSUS9nU S[        X`R                  -  5       3nU R                  (       dH  [        R                  R                  U5      [         ["        R$                  ["        R&                  /-   ;   a  U SU S[        U R                  5       S3n	OU SU S3n	U R(                  R+                  [-        X5      5        g U R/                  U5      n
[0        TU ]e  XX45        g )Nr  T)r  r  r  r{  r   r   )r   rG   r   r  r  r  r  r  r  r  rE   r3  r  r4   r  r  r   r   r  r  r[  r   r;   r/  r3  r  )r5  r   r   r  r  r   re  r  storebufr   r   r8  s              rX   r  CppTile2DKernel.store  s`   }}%007%70||NN5)Eiit$""$$$U+""5))::54D ; H #3{5>>3I'J&KLH~~!2!24!8M

M = "  zK4O3PPRSz4KK!!,t":;//6IGM$57rW   c                    U R                  5       nU R                  (       a2  UR                  SU SU S[        U R                  5       SU S3	5        g UR                  SU SU S[        U R
                  5       SU S3	5        g )Nrk  rl  r   r   r   )r  r  r   rE   r  r  )r5  r   re  s      rX   r`  #CppTile2DKernel.codegen_inner_loops  s    ""$##NNUG6%K@T@T4U3VVXY^X__bc NNUG6%K@T@T4U3VVXY^X__bcrW   c                   > [         TU ]  X5      nU R                  S   U R                  :  a  U R                  O[	        U R                  5      u  U l        U l        U R                  U R                  S   :X  a+  U R                  U l        U R                  U l
        SU l        U$ U R                  U l        U R                  U l
        SU l        U$ )Nr5   r   FT)r3  r  r  r:  reversedr  r  r  r  r  r3  r  r  r  )r5  groupreduction_groupr  r8  s       rX   r  CppTile2DKernel.set_ranges  s    w!%9 ""1%(<(<< $--. 	(
 ??d11!44!11DN!11DN',D$
  "11DN!11DN'+D$rW   r   c                 T    U R                  UU R                  U R                  5       S9$ )Nrj  )rn  r  r  rq  s     rX   r/  "CppTile2DKernel.transform_indexing  s0    ++%%' , 
 	
rW   )
r  r  r  r3  r  r  r  r  r  r  r,  r	  )r  r  r  r  r  r,  r  r4  r  r  r  rp   r   r  ry  r  r`  r  r/  r  rm  rn  s   @rX   r.  r.    sx    < #I (.L
 6::x1 1UZZ 1,8:	$


 
uzz 
 
rW   r.  _bodyc                    U R                   /[        U R                  R                  5       5      -   nSnSnU GH  nUR                  R
                   H  nUR                  S:X  d  UR                  S;   a  M%  UR                  S;  a  Sn[        US5      (       a  UR                  (       a  [        R                  UR                  ;   d   eUR                  [        R                     nUR                  (       a  UR                  [        ;  a  SnM  Ub)  X&R                  :w  a  [        R                  " S5        M  M  UR                  nM  SnM     GM     X#4$ )	z
Returns the low precision data type (torch.float16/torch.bfloat16) contained in the nodes
and if all the nodes can codegen with this data type without converting to float.
Otherwise returns None and True.
NFplaceholder)	get_indexr  )ry  r  r  r  r  Try  z.bf16 and fp16 are mixed in the scheduler node.)
root_blockr)  	subblocksr  r  nodesoptargetr   ry  rA   rx  r   r   warningswarn)r  
sub_blocks_lowp_fp_type	_use_fp32	sub_blockr7  rt  s          rX   get_loop_body_lowp_fpr    s     ""#d5??+A+A+C&DDJ+/MI	__**Exx=(ELL = -  || $  !	uf%%%***..%**<<</4zz:M:Q:Q/R}}](J $I".$5 &VW 6 %,MMM 	9 +  > ##rW   c                   V   ^  \ rS rSrSrU 4S jrS\\\   \\   4   4S jr	S r
SrU =r$ )TilingSelecti  z
Implement the heuristic to select the tiling factors and tiling indices.
In the future, we can implement advanced heuristic in a subclass.
c                 "   > [         TU ]  5         g r	  )r3  r4  r5  r8  s    rX   r4  TilingSelect.__init__  s    rW   r   c           	      L	  ^# [        U5      n[        U5      nU(       d   e[        S U 5       5      (       a  / / 4$ [        R                  n[        US   5      S   m#T#(       a  [        U#4S jUSS   5       5      (       a  T#n[        R                  " 5       R                  US9nU R                  XU5      nU(       Ga  [        US S9u  p[        U5      [        U	5      -   n
[        R                  R                  (       Ga  S nS	 nS
 n[!        [#        U
5      5       Vs/ s H  n[%        [&        R(                  U5      PM     nn[#        U5      nUS U UUS  nn0 n0 nU GH  nUR*                  /[-        UR.                  R1                  5       5      -   nU GHS  nUR2                  R4                   GH4  nUR6                  S;   a  UR6                  S:X  a  SOSnUR8                  R;                  UU45      UR<                  U   R<                  S      nU" X5      (       a7  U" UXU5      nUR6                  S:X  a  Uc  OUS;  a  U" UR6                  U5        [?        UR6                  [@        5      (       d  M  UR6                  RC                  S5      (       a  M  UR6                  S;   a  M  UR6                  U;  a  SUUR6                  '   GM  UUR6                  ==   S-  ss'   GM7     GMV     GM     [E        UR1                  5       5      n[E        UR1                  5       5      nSnSnUU:  d  US:  a  UU-  U:  a  / / 4$ U	(       dD  U(       a=  [#        U5      S:X  a.  [G        XS      /5      (       d  XS      US-  :  a
  US:  a  / / 4$ U[H        ;   a  [        R                  " 5       R                  US9n U H  n!U!S:  a  U![#        U
5      -   n!U!S:  d  U![#        U
5      :  a  M.  [G        U
5      (       ad  [J        R2                  RL                  RO                  U
U!   SS9n"U"U :  a1  [J        R2                  RL                  RQ                  U"U 5        U S-  n  OM  U
U!   U :  d  M  U S-  n  O   [#        U5      S:X  a  U/U4$ [#        U5      S:X  a  Xf/U4$ / / 4$ s  snf )Nc              3   2   #    U  H  o[         ;  v   M     g 7fr	  )rw   r  r   s     rX   r   -TilingSelect.select_tiling.<locals>.<genexpr>  s     HZE//Z   r   c              3   F   >#    U  H  n[        U5      S    T:H  v   M     g7f)r   N)r  )r  	loop_body_lowp_fp_dtypes     rX   r   r%  	  s'      "
,	 #9-a0NB,   !r5   rM  c                     [        U S   5      $ r   r   sizess    rX   r  ,TilingSelect.select_tiling.<locals>.<lambda>  s    #eAh-rW   rx  c                 R    XS      n[        XU5      nUR                  (       a  U$ S $ ra  )r
  r  )r   r  r  r  rt  r  s         rX   _try_get_stride3TilingSelect.select_tiling.<locals>._try_get_stride  s0     'a'89G0OF%+%5%56?4?rW   c                 0    X;  a  SX'   g X==   S-  ss'   g r   rV   )	node_namenon_contig_indexing_op_counters     rX   _update_negative_op_count=TilingSelect.select_tiling.<locals>._update_negative_op_count&  s!     !FDE6A6AQFArW   c                     [        U5      S:H  =(       a@    [        U 5      S:  =(       a+    US   S:  a  US   OUS   [        U 5      -   [        U 5      :  $ Nr5   r   r,  )r  r  s     rX   _is_valid_indices5TilingSelect.select_tiling.<locals>._is_valid_indices.  sb    
 N+q0 (MA-(  .a0A5 +1-!/!2S]!Bh-(	rW   )r  ry  r  r  r   r   r5   masked_subblock)r2   r  r  r  gQ?#   r[  
   r  ))rC   rB   rg   r   ro   r  r(  r   r1  	nelements_select_tiling_indicesr`   r   r   r  enable_tiling_heuristicsrR  r   r.   r   r  r  r)  r  r  r  r  r  r  indexing_from_argsr  r   rp   
startswithrd   r'   r   r4   r  r  guard_lt)$r5  fn_listvar_sizes_listloop_bodies
all_dtypesr   r  r  r  r	  r9  r2  r7  r;  r  r  r:  r  reduction_vars
op_counterr6  r  r  r  r7  arg_idxr   r  op_numnon_contig_indexing_op_numratio_thresholdquantity_thresholdfactor_lowptiling_indice
call_ranger)  s$                                      @rX   select_tilingTilingSelect.select_tiling  s    %W-/<
zHZHHHr6M.{1~>qAc "
(_"
 
 
 #E#002<<5<I44]
 %($?&"E  ,)??Kzz222@G" #3{#344 34;;B4   #&e*-o._-. % .0
 BD.(E"'"2"2!3d5??;Q;Q;S6T!TJ%/	%.__%:%:E$||/NN/4|||/K!QR(1(I(I%)>$:)""'**W"5":":1"=)? $5X#N#N-<(-x.&F
 ,1<<<+G )/-36-A(A,1LL:X)*  *%,,<< % 7 78I J J#(<<#M$N $)<<z#A?@Ju||$<$.u||$<$A$<7 &; &0 )@ Z..01-0299;.* #'%'"-1CCQJ2V;N
 r6M (N+q0,!"34 
 Q/0=13DD r6M% *668BBBO%3M$q((5K8H(H$q(MS=M,M '44%&WW%5%5%?%?'6 &@ &
 &3GG,,55j+N,71,<M! 4 %]3kA(3q(8 &4" >"a'%66>"a'%5~EE2vQs   "$R!c           	         / n[        X5       Hd  u  pV[        R                  " U/UQ76 nU[        R                  " UR
                  UR                  5       Vs/ s H  oR                  PM     sn-  nMf     [        [           " 5       n	/ n
[        [           " 5       n[        [           " 5       nU GH  nUR                   GH  n[        R                  " SUR                  5      (       d  M,  [        XU5      nUS:X  a  M@  US:X  aP  U	R                  [        UR                  SS  5      5        U
R!                  [        UR                  SS  5      5        M  [#        S UR                   5       5      (       a)  UR                  [        UR                  SS  5      5        M  UR                  [        UR                  SS  5      5        GM
     GM     X-
  U-
  n[%        US S9u  nn['        U5      ['        U5      -   n['        U	5      S:X  a  US-
  /$ U(       a  [)        U5      SS  $ X-  U-
  n[)        U	5      n['        U5      S:X  a  US   U;   a  US   US-
  :X  a  U$ [)        UU
R*                  S9SS  $ s  snf )	Nz^d\d+$r   r5   c              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7fr	  )r   r   SIZEr  ry  s     rX   r   6TilingSelect._select_tiling_indices.<locals>.<genexpr>  s      S?R!49955?R   ')c                     [        U S   5      $ r   r,  r-  s    rX   r  5TilingSelect._select_tiling_indices.<locals>.<lambda>      s5QR8}rW   r0  r+  r   )rR  r
   extract_read_writes	itertoolschainreadswritesr   r   rn   r{  r   searchr   r
  r  r  r(  r`   r   sortedcount)r5  rG  rH  r  	all_indexfn	var_sizesrwdepcontig_varscontig_vars_listnon_contig_stride_constnon_contig_stride_otherr   r   r  contig_onlyr  r	  num_itervarscontig_and_const_stridecontig_vars_sorteds                         rX   rB  #TilingSelect._select_tiling_indices  s?    	 9MB11"AyAByrxx/ST/S))/STTI : !o'",S/"3",S/"3E))yyCHH55,UGQ;q[OOC$56$++C,=>Sv?R?RSSS+//CHHQRL0AB+//CHHQRL0AB *  ";>UU!$^9T!U5zC$88{q  1$%%+&rs++1##$ $K0"#q("2&*AA"2&,*::%%(.>.D.DEbcJJK Us   I6rV   )r  r  r  r  r  r4  r   r)  rn   rU  rB  r  rm  rn  s   @rX   r  r    s=    
i 
tCy$s)#	$	iV.K .KrW   r  c                      ^  \ rS rSr% \r\\   \S'   \r	\\   \S'   \
r\\
   \S'   U 4S jrS rS\4S jrS	\4S
 jrS rS rS rS\\   4S jrS rS rSS\\   4S jjrS\S\S   4S jrSrU =r$ )r  i  
kernel_clsvec_kernel_clstile2d_kernel_clsc                    > [         TU ]  UR                  UR                  R                  5        Xl        S U l        S U l        [        R                  " 5       U l
        / U l        g r	  )r3  r4  r  wsrK  rb  rS  r9  r   r1  picked_vec_isakernelsr5  rb  r8  s     rX   r4  CppKernelProxy.__init__  sO    **LOO,G,GH(2=2J2J2L(*rW   c                 p    U H0  n[        U[        5      (       d   e[        R                  " U5        M2     g r	  )r   r#   r:   propagate_scheduler_node)r5  r  r7  s      rX   data_type_propagation$CppKernelProxy.data_type_propagation  s-    Ee]333388? rW   scheduler_nodec                     [        UR                  [        5      (       d  g[        R                  " U5        [        UR                  5      S   S L=(       a    [        UR                  5      S   (       + $ )NTr   r5   )r   r  r   r:   r  r  )r5  r  s     rX   is_lowp_fp_scheduler#CppKernelProxy.is_lowp_fp_scheduler  s_    ...9944^D!."6"67:$F C).*>*>?BB	
rW   r(  c                     S[         R                  R                  4S jnUR                  /[	        UR
                  R                  5       5      -   nU H  nU" UR                  5        M     g )N	sub_graphc                   ^^^^^^^^ S[         R                  R                  S[        [         R                     4S jmS[         R                  R                  S[        [         R                     4S jmS[         R                  R                  S[         R                  4U4S jjmS[         R                  R                  S[         R                  4U4S jjmS[         R                  R                  S[         R                  4UU4S jjn[        U R                  5      n/ mU GHm  nUR                  S	;   a  T" U5      =m[        ;   a  [        UU4S
 jUR                   5       5      (       a  MM  UR                  S   nU R                  U5         U R                  SXC[         R                  4S9mUR                  TU4S j5        [         =R"                  S-  sl        S S S 5        M  UR                  S:X  a  T" U5      =m[        ;   a  UR                  u  pEpgnU" UT5      (       a  GM
  [$        R&                  R)                  U5      mU R+                  U5         U R                  SXGT4S9mUR-                  UT5        [         =R"                  S-  sl        S S S 5        GM  UR                  S:X  a  UR                  u  nmnn	n
U[        ;   a{  T[         R                  [         R.                  [         R0                  [         R2                  4;   d   eUT[        ;   a  [         R                  OT[         R                  U	U
4Ul        GM*  GM-  UR                  S:X  ah  UR                  S   [        ;   aQ  UR                  u  pJm[        UU4S jUR                   5       5      (       a  GM  XJ[         R                  4Ul        GM  UR                  S:X  ay  UR                  S   [        ;   ab  UR                  u  pKm[        UU4S jUR                   5       5      (       a  GM  TR5                  U5        XK[         R                  4Ul        GM.  UR                  S:X  Ga.  UR                  u  pGmnU[        ;   ad  U" Xx5      (       dW  U R+                  U5         U R                  SXGU4S9mUR-                  UT5        [         =R"                  S-  sl        S S S 5        T[        ;   a  [        UU4S jUR                   5       5      (       d{  UR                  S   nU R                  U5         U R                  SXC[         R                  4S9mUR                  TU4S j5        [         =R"                  S-  sl        S S S 5        GMg  GMj  GMm  GMp     S[         R                  R6                  4U4S jjnU" U 5        g ! , (       d  f       GM  = f! , (       d  f       GM  = f! , (       d  f       GN= f! , (       d  f       GM  = f)Nr  r   c                 b   U R                   S:X  a,  [        R                  R                  U R                  S   5      $ U R                   S:X  a  U R                  S   $ U R                   S:X  aD  [        U R                  5      S:  a  U R                  S   $ U R                  R                  SS5      $ g)	z6Get input dtype for nodes that may consumes lowp fp dtr  r5   r  r+  r  r   r   N)r  r4   r  r  r  r   r  r  r  s    rX   get_input_dtype]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_input_dtype  s    ;;')77,,TYYq\::[[$6699R=([[J.499~)#yy|+#{{{DAArW   c                 .   U R                   S:X  aG  [        U R                  5      S:X  d   e[        R                  R                  U R                  S   5      $ U R                   S;   a  U R                  S   $ U R                   S:X  a  U R                  S   $ g)	z6Get output dtype for nodes that may produce lowp fp dtry  r   r5   )r  r  r  r+  r  r   N)r  r   r  r4   r  r  r  s    rX   get_output_dtype^CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_output_dtype  sz    ;;&(tyy>Q...77,,TYYq\::[[$JJ99R=([[$6699Q<'rW   r  c                 2   > U[         ;   d   eT" U 5      U:H  $ )z]Check if the given node produces output with expected low precision floating point data type.)r   )r  r  r  s     rX   is_lowp_fp_source_CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source  s!    ]***'-33rW   c                 f   > U[         ;   d   eT" U 5      =n(       a  X!:H  $ U R                  S:X  a  gg)zZCheck if the given node accept input with expected low precision floating point data type.r  TF)r   r  )r  r  input_dtyper  s      rX   is_lowp_fp_sink]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_sink  s<    ]***"1$"77;7&,,[[J. rW   c                 f   >^ T" U T5      =(       a     [        UU4S jU R                   5       5      $ )zCheck if the node is a lowp fp sources which are all directly fed to ops that accepts lowp fp input
thus no need to promote to float
c              3   6   >#    U  H  nT" UT5      v   M     g 7fr	  rV   r  userr  r  s     rX   r   }CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote.<locals>.<genexpr>+  s      ;:D$OD"--*   r(  users)r  r  r  r  s    `rX   is_lowp_fp_source_no_promotejCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote'  s1     )r2 s ;:>**; 8 rW   )ry  r  c              3   6   >#    U  H  nT" UT5      v   M     g 7fr	  rV   r  s     rX   r   WCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>7       M?444r  r   r  r  c                    > U TL$ r	  rV   r  to_type_nodes    rX   r  VCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<lambda>?  s
    A\4IrW   r5   r  r  r  r+  c              3   6   >#    U  H  nT" UT5      v   M     g 7fr	  rV   r  s     rX   r   r  n  r  r  c              3   6   >#    U  H  nT" UT5      v   M     g 7fr	  rV   r  s     rX   r   r  t  r  r  r  c              3   6   >#    U  H  nT" UT5      v   M     g 7fr	  rV   )r  r  r   r  s     rX   r   r    s     Ue < <r  c                    > U TL$ r	  rV   r  s    rX   r  r    s
    A\<QrW   r  c                 Z   > S[         R                  R                  4U4S jjnU" U 5        g )Nr  c                   >^ S[         R                  R                  4S jnU R                   Vs/ s H  o"R                  S:X  d  M  UPM     nnU Vs/ s H  o!" U5      (       d  M  X"R
                  0PM      nnU H  nUR                  5        H  u  mnTU R                  ;   d  M  [        U4S jU 5       5      (       d!  TT;   d  M:  [        S U 5       5      (       d  MS  TR                  S   nTR                  U5        U R                  T5        M     M     U R                  c  U R                  5         g g s  snf s  snf )Nto_nodec                 :    [        S U R                   5       5      $ )Nc              3   >   #    U  H  oR                   S :H  v   M     g7f)r  N)r  r  usrs     rX   r   ڮCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to.<locals>.<genexpr>  s     "U}::#;}s   r  )r  s    rX   _used_by_toڛCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to  s    ""Uw}}"UUUrW   r  c              3   `   >#    U  H#  oR                   S    TR                   S    :H  v   M%     g7fr+  Nr  )r  r  r  s     rX   r   ڙCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>  s$     #SUcHHRLDIIbM$AUs   +.c              3   L   #    U  H  oR                   S    [        ;   v   M     g7fr  )r  r   r  s     rX   r   r    s      ,&QV#(EQVr  r+  )r   fxNoder  r  r  r  r(  all_input_nodesreplace_all_uses_with
erase_nodeowning_modulelint)	r  r  r  all_to_nodesall_to_nodes_and_users
node_usersr  val_nodeto_lowp_fp_legalized_nodess	     `     rX   _eliminate_duplicate_to_nodeچCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node  s-   VUXX]] V *3$)8KK:<U ! $ 8D.7Ct{SWGX*zz*| + . '=
+5+;+;+=KD%#y6 ##SU#S S S$(,F$F(+ ,&QV,& )& )&
 ,0+?+?+C $ : :8 D ) 4 4T : ,> '=, !..6!( 79$.s   D:D:D?'D?)r   r  Graph)r  r  r  s     rX   eliminate_to_dtype`CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype  s"    ')EHHNN ')R -Y7rW   )r   r  r  r   r   r)  r  r  r   r(  r  r  inserting_aftercall_methodro   r  r   cpp_to_dtype_countr4   r  r  inserting_beforereplace_input_withr  r  r  r  r  )r  r  sub_graph_nodesr7  r2   r   r   	value_varr   r   r  r  r  r  r   r  r  r  r  r  r  s                @@@@@@@@rX   add_to_dtypeDCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype  s    ehhmm  8M  
 uxx}} 
 %++9N 
 4 45;; 4
	!ehhmm 	! 	!588== ekk   #9??3O)+&(LL$::/66=H MMMM **Q-C"2259'0'<'<&c%++-F (= ( 33(*I  22a72 :9 LLG+.u55-G16.CqQ3IrBB GG--d3E"33E:'0'<'<&ce-D (= ( 00LI22a72 ;: \\[0 

!& M1  %!KK!NN!MM!KK	)       +0M+AEKKu!KK*!&
 2& \\Z/EJJrNm4S',zz$SMMMM "%ekk!:EJ\\Z/EJJrNm4S#(::LSRMMMM  /55e<"%%++!6EJ\\%779>6SUI !M1  <IQQ!*!;!;E!B/8/D/D$.ci5P 0E 0" !& 8 8L Q ' : :a ? : "C -  UUUU"'**Q-C!*!:!:5!A/8/D/D$.c%++5N 0E 0" !& ; ;$02Q!" !( : :a ? : "B!A V . o )r*8ehhnn *8X y)y :9  ;:J "C!B  "B!As2   AW>W>W(1AW:
W	
W%	(
W7	:
X
	)r   r  r  r  r)  r  r  r  )r5  r(  r  r  r  s        rX   legalize_lowp_fp_dtype_loopbody.CppKernelProxy.legalize_lowp_fp_dtype_loopbody  sX    Z	*EHHNN Z	*x  **+d93F3F3M3M3O.PP
#I) $rW   c                   ^  [        U 4S jU 5       5      (       a  U H  nUR                  R                  /[        UR                  R                  R                  5       5      -   nU H  nUR                  R                   H{  nUR                  S;   d  M  UR                  (       d   e[        R                  UR                  ;   d   eUR                  [        R                     nUR                  [        ;   a  M{   e   M     M     g U Ho  n[        U[        5      (       d   e[        UR                  [         5      (       d   eUR                  nUR#                  5       (       a  M^  T R%                  U5        Mq     g )Nc              3   t   >#    U  H-  n[        U[        5      =(       a    TR                  U5      v   M/     g 7fr	  )r   r#   r  )r  r7  r5  s     rX   r   8CppKernelProxy.legalize_lowp_fp_dtype.<locals>.<genexpr>  s2      
 um,Q1J1J51QQs   58)ry  r  )r(  r  r  r)  r  r  r  r  r  ry  rA   rx  r   r   r   r#   r   is_memory_copyr  )r5  r  r7  r  r  fx_nodert  r  s   `       rX   legalize_lowp_fp_dtype%CppKernelProxy.legalize_lowp_fp_dtype  s8    

 
 

 #kk445KK))0029 
 ",I#,??#8#8">>->>#*<</<#6#:#:gll#JJ#J;B<< 3 7 7<G $+==M#AA#A $9 ",	  Ee]3333ekk84444"[[D&&((44T: rW   c           	      
  ^^^ ^!^"^# [        T5      [        T5      :X  d   eU R                  m![        TS S9u  m m"U R                  T T"5        U!U#4S jnUU U"U4S jm#U" U R                  5      n[
        R                  =R                  UR                  -  sl        [
        R                  =R                  UR                  -  sl        [        R                  U5      U l        U R                  (       a  U R                  (       d6  U/U l        U R                  SS 5        U R                  R!                  U 5        g ["        R$                  R&                  R)                  SS9   [+        5       nUR-                  TT5      u  pg[        U5      [        U5      :X  d   eSn[/        [1        T5      5      n	[3        S U	 5       5      (       a  SnSn
S nU(       a  SnUS	   nUS
-   n[        U R                  R4                  5      U:  aV  U R                  R4                  U   R6                  nU R                  R4                  U   R6                  nU=(       a    U(       + n
[        U5      S
:X  Ga  [8        =R:                  S
-  sl        U R                  R=                  US	   US	   S9nU" U R>                  US	   US	   5      nUR@                  URB                  -
  nURD                  S	URB                  40Ul#        [&        RH                  RJ                  (       a#  U(       a  U" U R>                  US	   US	   U5      nOUnURD                  /Ul&        URD                  URB                  UR@                  40Ul#        UU/U l        UnGO[        U5      S:X  Ga  US
   [        U R                  5      S
-
  :X  a  US	   US
   :X  d   e[8        =R:                  S-  sl        U R                  R=                  US	   US	   S9nS	URB                  4URB                  UR@                  4S.nUR@                  URB                  -
  nU R                  R=                  US
   US	   S9nS	URB                  4URB                  UR@                  4S.nUR@                  URB                  -
  nU" U RN                  US	   U5      nURD                  US   URD                  US   0Ul#        / n[&        RH                  RJ                  (       aw  U(       ap  S Hi  u  nnUS:X  a  UOS nUS:X  a  UOS nU" U RN                  US	   UUU5      nURD                  UU   URD                  UU   0Ul#        URQ                  U5        Mk     OU" U R>                  US	   US	   5      nURD                  US   URD                  US   0Ul#        URD                  /Ul&        URQ                  U5        URD                  US   URD                  S	UR@                  40Ul#        URD                  URD                  /Ul&        URQ                  U5        U/U-   U l        UnOU/U l        U R                  X5        U R                  R!                  U 5        S S S 5        g ! , (       d  f       g = f)Nc                     [        U S   5      $ r   r,  r-  s    rX   r  2CppKernelProxy.codegen_functions.<locals>.<lambda>  r_  rW   r0  c                    > TR                   " U /UQ76  n[        =R                  S-  sl        T" U5        UsS S S 5        $ ! , (       d  f       g = fr   )
new_kernelr   generated_kernel_count)r+  r  rf  rb  runs      rX   codegen_kernel8CppKernelProxy.codegen_functions.<locals>.codegen_kernel  sA    ((4t4 ..!3.F 544s   #A
Ac           	      r  > U R                  TT5      u  pSn[        TT	5       H~  u  pEUTT4[        [        R                  " TT5      5      S44;   a  U(       a   eU" X5        MB  SnUTS4:X  d   SU ST ST 35       eU R                  5          U" US5        S S S 5        M     g ! , (       d  f       M  = f)NFrV   Tzunexpected group: r  r   )r  rR  r   ra  rb  r  )
rf  r  rK  	in_suffixri  rj  rG  r  r	  rH  s
         rX   r  -CppKernelProxy.codegen_functions.<locals>.run
  s    #)#4#4UO#L DI!$Wn!=O,9??5/BCRH!   )(=t, $I$)  V ,I;d5'OCTUV 
  //14 21 "> 21s   
B''
B6	Finplace_buffersTc              3   2   #    U  H  o[         ;  v   M     g 7fr	  )rx   r$  s     rX   r   3CppKernelProxy.codegen_functions.<locals>.<genexpr>6  s     S
u ::
r&  r   r5   )r  r   maintailr  )r  )r  r  )r  r  r  ))r   rb  r`   r  rw  r4   r  removed_buffersinplaced_to_removerQ  r  rS  r|  r  r}  aggregate_reduction_buffers
set_kernelr   	_inductorr   patchr  rU  rB   rC   rg   rL  rB  r   generated_cpp_vec_kernel_counttilerx  rC  
tiled_sizer   r7  r  enable_loop_tail_vecr8  ry  r  )$r5  rG  rH  r  scalar_kerneltiling_selecttiling_factorsr  could_masked_vecrJ  _inner_loop_reduction_outer_not_outer_loopinner_loop_reductionouter_loop_levelinner_loop_levelouter_loop_reductionr  
vec_kernelr  tail_kernel
outer_loopr]  r  
inner_loopinner_rangesr  tile2d_kernelouter_rinner_r_inner_tail_size_outer_tail_sizerf  r  rb  r	  r  s$    ``                             @@@@rX   codegen_functions CppKernelProxy.codegen_functions  s*   7|s>2222((!$^9T!U/		% 	%( 't7	=#@#@@	""m&F&FF"!6""$--)?DL,,UD9NN%%d+ __##))%)@(NM-:-H-H.*N ~&#n*====#3N74KLJS
SSS#( .3+K',$#1!#4 #3a#7 t~~++,/??+/>>+?+?(,"l ) ,0>>+?+?(,"l ) -I5I1I 4 >"a'66!;6~~**>!+<^TUEV*W+''):N1<M
 !II7	,0HHq$//6J+K
(::227G"0++&q)&q)!	#K #0K48HH:M0-1XX7S,T) *K8"^$)"1%T]]);a)??&q)^A->>? 66!;6!^^00"1%nQ.? 1 
 
 5 56'22JOOD  #-//J4I4I"I!^^00"1%nQ.? 1 
 
 5 56'22JOOD  #-//J4I4I"I .**"1%"! NNL$8NNL$8/+ !::227G-( 07&/@Od ) 07&/@Od ) "0 22*1-*,," 'NNL,A&NNL,A0, $**62--0 "0++^A->q@Q"J #V(<"V(<0J, 2<0@J-&&z2"V(<"JOO(<3M/ 5?NNJNN3SM0&&}5 -<( -,,/ NN%%d+G A@@s   "T	Y44
Zc                     U H*  nU R                  U5        [        R                  " U5        M,     U R                  X5        g r	  )r  r:   propagate_loopbodyr  )r5  rI  rH  r  s       rX   codegen_loop_bodies"CppKernelProxy.codegen_loop_bodies  s8    D0062248   	{;rW   r  c                    U R                  U5        U R                  U5        [        U5      S:  d   eS nU Vs/ s H  n[        R                  " X#5      PM     nn[        [        R                  [        5      (       a9  [        R                  R                  (       a  S nU Vs/ s H
  o%" U5      PM     nnU Vs/ s H  o3R                  S   PM     nnU R                  XF5        g s  snf s  snf s  snf )Nr5   c                     U R                  5         U R                  5         [        [        R                  [
        5      (       a  U R                  " U6 $ U R                  U5      $ r	  )decide_inplace_updatemark_runr   r4   rf  r1   r  codegen)r  
index_varss     rX   ri  (CppKernelProxy.codegen_nodes.<locals>.fn  sG    &&(MMO!(($566zz:..||J//rW   c                 P    [         R                  R                  U 5      nXl        U$ r	  )r4   r  localize_functionoriginal_fn)ri  
wrapped_fns     rX   wrap_fn-CppKernelProxy.codegen_nodes.<locals>.wrap_fn  s)    33EE
 *,&!!rW   )r  r  r   rT  partialr   r4   r  rK   r  r  r  )r5  r  ri  r  rG  r"  rH  s          rX   codegen_nodesCppKernelProxy.codegen_nodes  s    ##E*""5)5zQ	0 <AA549$$R.5A q--/ABB&&44" .55Wrwr{WG549:ED**Q-E:w7# B 6:s    C)&C.=C3c                 <    U R                  U R                  X5        g r	  )r  rS  )r5  r   r  s      rX   r  CppKernelProxy.codegen_loops  s    BrW   c                 J    U R                    H  nUR                  5         M     g r	  )r}  r]  r5  rf  s     rX   r]  4CppKernelProxy.update_stores_with_parallel_reduction  s    llF88: #rW   r   c                 D   Uc   eSnU R                    Hw  n[        R                  " 5        nUR                  X5      (       a@  SnUR	                  UR                  5       5        UR                  UR                  5       5        S S S 5        My     g ! , (       d  f       M  = f)N
C10_LIKELYC10_UNLIKELY)r}  r   r   r)  r   r   rQ  re  )r5  r   	if_prefixrf  r   s        rX   re  CppKernelProxy.gen_body  s{     	llF%%'5,,T== .I''6KK 12	 (' #''s   AB
B	inner_loop_reduction_outer_notr	  	LoopLevelc                   ^  SU 4S jjnT R                   S   nU(       a  U(       d   eU" U5        OZUR                  5         T R                  R                  UR                  5        T R                  R                  UR                  5        T R
                  R                  UR
                  5        T R                  R                  UR                  5        T R                  R                  UR                  5        T R                  R                  UR                  5        T R                  R                  UR                  5        T R                  R                  UR                  5        g )Nc           
        > [        TR                  5      S:  d   eTR                  S   nTR                  S   n[        UTR                  5      (       d   e[	        U5      TR
                  :X  a^  UR                  UR                  5        UR                  5         TR                  R                  UR                  UR                  -   5        O5UR                  5         TR                  R                  UR                  5        [        5       n[        R                  " 5        nUR                  USU R                  5      (       a:  UR                  UR!                  5       5        UR                  UR"                  5        S S S 5        [        R                  " 5        nUR                  USU R                  5      (       Ga  UR                  UR!                  5       5        [	        U5      TR
                  :X  a  UR$                  nU HW  nU SU R                   S['        U R(                  5       S3n[+        UR,                  Xg5        [+        UR"                  Xg5        MY     UR                  [/        UR"                  U R                  U R                   S	3U R(                  U R0                  5      5        OUR                  UR"                  5        S S S 5        UTl        g ! , (       d  f       GNh= f! , (       d  f       N(= f)
Nr   r   r+  r-  r.  r   z_tail - r  ra  )r   r}  r   rx  r  rw  r  r  r;  rQ  r7   r   r   r)  r   r   r   r=  rM  rE   r  r   r[  r   rC  )	r	  main_loop_kerneltail_loop_kernel
suffix_bufr   rK  r   r   r5  s	           rX   !aggregate_reduction_prefix_suffixUCppKernelProxy.aggregate_reduction_buffers.<locals>.aggregate_reduction_prefix_suffix  s   t||$)))#||A#||B/.0C0CDDDD $%8 !::$22 !::<%%,,$55&778
 !::<%%,,-=-N-NO &J%%'5#66jnn  ''
(9(9(;<%%&6&G&GH ( %%'5#66
  ''
(9(9(;<,-@)9)M)M$2D*.uZ^^4DH[YcYnYnMoLppq'rH,-=-D-DdU, 0 A A4 %3 #))6 0 A A *#->>"2% 8 * 5 5 * #))*:*K*KL/ (0 %/D!= (' ('s   #AK!D4K3!
K03
Lr   )r	  r2  )r}  r  r;  rQ  r=  r>  r?  r@  rA  rC  rD  )r5  r1  r	  r8  main_kernels   `    rX   r  *CppKernelProxy.aggregate_reduction_buffers  s   6	/p ll1o):-j9113!!(()E)EF!!(()E)EF&&--k.S.ST&&--k.S.ST!!(()I)IJ##**;+M+MN**1155	
 	**1155	
rW   )r9  rb  r}  rS  r|  r=  r	  ) r  r  r  r  r2  rw  r  r  r  rx  r.  ry  r4  r  r#   r  r   r  r  r  r  r)  r%  r  r]  r   r7   re  rl   r  r  rm  rn  s   @rX   r  r    s     #,JY+)5ND&5/>tO,>+@
= 
_* _*B;<v,p<84#6 8BC;3Xl3 3O
.2O
@H@UO
 O
rW   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )rc  i<  c                 p   > [         TU ]  UR                  UR                  R                  5        / U l        g r	  )r3  r4  r  r{  rK  re  r~  s     rX   r4  OuterLoopFusedKernel.__init__=  s)    **LOO,G,GH%'
rW   c           
         / nU R                    Vs/ s H  oDR                  5       PM     nnU Hg  nUR                  nUc   eUR                  UR	                  [        [        U5      UR                  -
  UR                  S9U5      R                  5        Mi     [        [        UR                  [        U5      5      UR                  S9$ s  snf )Nr  )re  r  r9  r  r  r  r   r  r  r_   r`   )r5  r  r  kernels_parallel_depthrS  nested_kernelsrf  r9  s           rX   r  *OuterLoopFusedKernel.decide_parallel_depthA  s    !#48JJ+
4>y  "J 	 +
 %F !,,K***")),,!,/A/M/MM$6$B$B	  !.
 %  "1137M3N +66	
 	
)+
s   C
)re  )r  r  r  r  r4  r  r  rm  rn  s   @rX   rc  rc  <  s    (
 
rW   rc  c                        \ rS rSrSrSrSrSrg)ReasonFusedNodesi_  same_vars_reducecompatible_reductioncompatible_ranges_no_reductionrV   N)r  r  r  r  SAME_VARS_REDUCECOMPATIBLE_REDUCTIONCOMPATIBLE_RANGES_NO_REDUCTIONr  rV   rW   rX   rD  rD  _  s    )1%E"rW   rD  c                     ^  \ rS rSr% \r\\   \S'   Sr\	" \
R                  \
R                  /5      r\S\R                   S\	\
   4S j5       rU 4S jrS\4S	 jrS
 rS rS rS\\   4S jrS rS rS rS\S\S\4S jrS r S r!S r"S r#S\$\%   4S jr&S\'4S jr(S\)\'\*\%4   4S jr+S\S\4S jr,S\S\-\   S \-\   4S! jr.S" r/S# r0S$ r1S(S% jr2S& r3S'r4U =r5$ ))CppSchedulingie  kernel_proxy_clsi  devicer   c                     U R                   $ r	  )backend_features)r+  rN  s     rX   get_backend_features"CppScheduling.get_backend_featuresu  s    ###rW   c                 `   > [         TU ]  U5        U(       a  U R                  5         SU l        g NF)r3  r4  reset_kernel_group_ready_to_flush)r5  r'  r8  s     rX   r4  CppScheduling.__init__y  s'    ###%$rW   statusc                     Xl         g r	  rV  )r5  rX  s     rX   _set_flush_statusCppScheduling._set_flush_status  s    %rW   c                 &    [        S U 5       5      $ )Nc              3      #    U  H<  n[        [        [        R                  R                  R
                  U5      5      v   M>     g 7fr	  )r   r  r4   r  r  r   rZ  s     rX   r   )CppScheduling.group_fn.<locals>.<genexpr>  s/     Mu!U3qww//88!<==us   AA)r   )r5  r.  s     rX   group_fnCppScheduling.group_fn  s    MuMMMrW   c                 "    [        5       U l        g r	  )KernelGrouprb  r;  s    rX   rU   CppScheduling.reset_kernel_group  s    'MrW   c                   ^ UR                  5       (       d  UR                  5       (       a  [        R                  " X5      $ UR                  5       (       a-  UR                  5       (       a   e[        R                  " X5      $ U R                  X5      [        R                  :X  Ga  [        U[        [        45      (       d   e[        U[        [        45      (       d   eUR                  u  nu  pEUR                  u  nu  pgUS:X  a  US:X  d   XW45       eU4S jm[        U5      [        U5      :  a  UOUn[        U[        5      (       d   e[        U5      [        U5      :  a  UOUn	T" U	5      n
UR                  U
S9  UR                  u  nu  pCUR                  u  nu  pcXF:X  a  [        R                  " X5      $ T" U5      n[        U	[        5      (       a  U	R                  US9  Op[        U	[        5      (       d   eU	R                   H)  n[        U[        5      (       d   eUR                  US9  M+     [	        U	R                  U	R                  5      n	UR                  u  nu  pCUR                  u  nu  pcXF:X  d   XF45       e[        R                  " X5      $ U R                  X5      (       a%  [         R                  XU R#                  X5      5      $ [        R                  " X5      $ )NrV   c                 f  > [        U [        5      (       a  [        U R                  5      S:  d   U R                  5       eS n[        [
           " 5       nU R                   H;  nT	" U5      u  pEUc  UnX:X  d   XU R                  45       eUR                  U5        M=     U[        U5      4$ [        U [        5      (       d   eU R                  n[        U[        R                  5      (       d   eUR                  5       u  pxnUR                  [        UR                  R                  5       5      4$ ra  )r   r!   r   snodesr   r   updater)  r#   r  r   ComputedBufferget_default_sizes_bodyr  indexing_exprsr  )
r  r  rk  snodevexprscomp_bufferr   r  get_indexing_ranges_exprss
            rX   rp  5CppScheduling.fuse.<locals>.get_indexing_ranges_exprs  s	   !$(:;;"4;;/!3@T[[@3%)
)3C):%)[[E'@'GHA)1-.
#-?PZDKK4PP?*11%8 &1  *4+???)$>>>>&*ii)+r7H7HIIII%0%G%G%I
#T5H5H5O5O5Q0RRRrW   )extra_indexing_constraints)
is_foreachr    r-  is_templater!   _why_fuse_nodesrD  rJ  r   r#   r  r   recompute_size_and_bodyrg  r'  can_fuse_vertical_outer_loopr  _get_outer_loop_fusion_depth)r5  r  r  r   vars1reduce1vars2reduce2node_to_recompref_noderef_indexing_constraints#node_to_recomp_indexing_constraintsrl  rp  s                @rX   r-  CppScheduling.fuse  s   !1!1!3!3-225@@  ((****%**588 $$U2#BBC "%-9K)LMMMM!%-9K)LMMMM&+kk##E&+kk##E"}BJ8JJ6S& +.e*s5z*Au!.-@@@@$'JU$;5+DX+N(66/G 7  !&:E %:E>-225@@ 7P"73 h66443V 5  &h0BCCCC!))%????557Z 6  "1
  2(2D2DhooVH %:E %:E~5~5~)..u<<225@@277$"C"CE"Q  *..u<<rW   c                    UR                   u  nu  pEUR                   u  nu  pgXF:X  a  XW:X  a  [        R                  $ US:X  a  XFU-   :X  a  [        R                  $ U R	                  X5      (       a  [        R
                  $ g )NrV   )r  rD  rH  rI  &_can_fuse_nodes_with_compatible_rangesrJ  )r5  r  r  r   ry  rz  r{  r|  s           rX   ru  CppScheduling._why_fuse_nodes  sv    #kkE#kkE>g0#444b=Ugo5#88866uDD#BBBrW   c                    UR                   u  nu  pEUR                   u  nu  pgUS:H  =(       a    US:H  n[        R                  " U5      [        R                  " U5      :H  n	[        U5      S:H  =(       d    [        U5      S:H  n
U(       a  U	(       a  U
(       d  g[        U5      [        U5      :  a  UOUn[        U5      [        U5      :  a  UOUn[	        U[
        5      (       a  g[	        U[        5      (       d   e[	        UR                  [        R                  5      (       a  g[	        UR                  [        R                  5      (       d   eUR                  R                  R                  5       nS n[	        U[
        5      (       a  [        [        [        S4      " 5       nUR                    H  n[	        UR                  [        R                  5      (       a    Ok[	        UR                  [        R                  5      (       d   eUR#                  [        UR                  R                  R                  5       5      5        M     [        U5      S:w  a  g[%        ['        [)        U5      5      5      nOf[	        U[        5      (       d   e[	        UR                  [        R                  5      (       d   eUR                  R                  R                  5       nX:w  a  gg)NrV   r5   F.T)r  r?  re   r   r   r!   r#   r  r   TemplateBufferri  dataget_sizer   r   r   rg  r  r)  nextiter)r5  r  r  r   ry  rz  r{  r|  c1c2c3r}  r~  ranges2ranges1
ranges_setrl  s                    rX   r  4CppScheduling._can_fuse_nodes_with_compatible_ranges  s+    $kkE#kkE],w"}YYu5!11Z1_/E
arb"%e*s5z"9uJU35 n&899 .-8888n))2+<+<==.--r/@/@AAAA !%%**335h 233#E#s(O46J!ejj"*;*;<<!%**b.?.?@@@@uUZZ__%=%=%?@A	 ) :!#4Z 012Gh6666hmmR->->????mm((113GrW   c                     [        U[        [        45      (       d   e[        U[        [        45      (       d   e[        S X4 5       5      (       a  gU R	                  X5      S L$ )Nc              3   B   #    U  H  n[        U[        5      v   M     g 7fr	  )r   r  r  s     rX   r   :CppScheduling._can_fuse_horizontal_impl.<locals>.<genexpr>"  s      
FTdJt899ns   F)r   r!   r#   rg   ru  r5  r  r  s      rX   _can_fuse_horizontal_impl'CppScheduling._can_fuse_horizontal_impl  sk    %"4m!DEEEE%"4m!DEEEE 
GLn
 
 
 ##E1==rW   c                    UR                  5       (       d  UR                  5       (       a  g[        UR                  5       5      [        UR                  5       5      -   [        R                  R
                  :  a  gU R                  X5      $ rT  )rt  r   r2  r   r  max_horizontal_fusion_sizer  r  s      rX   can_fuse_horizontal!CppScheduling.can_fuse_horizontal(  sj    %"3"3"5"5!"S):%;;jj334 --e;;rW   r  r  c                    UR                  5       =n(       a  [        UR                  [        R                  5      =(       a    [        UR
                  [        R                  5      =(       a]    [        UR
                  R                  5      S:H  =(       a4    UR
                  R                  S   R                  5       UR                  :H  $ g)Nr5   r   F)get_template_noder   layoutr   MultiOutputLayoutr  MultiOutputr   inputsr  r   )r5  r  r  template_bufs       rX   can_fuse_multi_outputs_template-CppScheduling.can_fuse_multi_outputs_template3  s     !2244<4<..0D0DE Iuzz2>>:I

))*a/I JJ%%a(113|7H7HH	 rW   c                 :   Sn[        S X4 5       5      (       d  U$ [        U[        5      (       a  UR                  5       S   OUn[        U[        [
        45      (       d   e[        U[        5      (       a  UR                  5       S   OUn[        U[        [
        45      (       d   eUR                  u  nu  pxUR                  u  nu  pUS:X  a  U	S:X  a  US:w  a  U
S:w  a  U$ [        S X4 5       5      (       a(  UR                  UR                  :X  a  UR                  $ U$ [        [        U5      [        U	5      5      nUS:  aP  US U U	S U :X  aD  [        S X4 5       5      (       a*  [        U5      [        L a  UOUnUR                  U:X  a  U$ U$ U$ U$ )Nr   c              3   \   #    U  H"  n[        U5      [        [        [        4;   v   M$     g 7fr	  )r  r  r!   r#   r  s     rX   r   =CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>A  s.      
 ' J+-?OP&r"  r+  rV   c              3   D   #    U  H  n[        U5      [        L v   M     g 7fr	  r$  r  s     rX   r   r  Z  r%  r&  r5   c              3   D   #    U  H  n[        U5      [        L v   M     g 7fr	  r$  r  s     rX   r   r  e  s      FTdT
99nr&  )r(  r   r  r*  r!   r#   r  r,  r_   r   rg   r  )r5  r  r  DISABLE_OUTER_LOOP_FUSION_node1_node2r   ry  rz  r{  r|  r,  _compare_nodes                rX   rx  *CppScheduling._get_outer_loop_fusion_depth?  s   $%! 
 
 
 

 -, %!<== !!#B' 	
 &#5}"EFFFF %!<== !!#A& 	
 &#5}"EFFFF$llE$llEB;5B;7b=W],,Te^TTT 00E4Q4QQ -- /
 #&c%j#e*"=#q(../59Q:Q3RR GLn   "%[,GGEU  !88<SS2244 /.((rW   c                 T   UR                  5       (       + =(       a    UR                  5       (       + =(       aq    UR                  5       UR                  -  =(       aM    U R                  X5      =(       a    UR	                  5       (       + (       + =(       a    U R                  X5      S:  $ r   )rt  get_operation_names	ancestorsr  rB  rx  r  s      rX   rw  *CppScheduling.can_fuse_vertical_outer_loopu  s    !!## E%%''E))+eoo=E ..u< -**,,E 11%?1D		
rW   c                 2    U R                  X5      (       a  ggr:  )rw  r  s      rX   get_fusion_pair_priority&CppScheduling.get_fusion_pair_priority  s    ,,U::rW   c                 :   UR                  5       (       a  gUR                  5       (       a,  [        X/5      u  p4UR                  5       (       + =(       a    U$ U R                  X5      =(       a    UR                  5       (       + =(       d    U R	                  X5      $ rT  )rt  rN   rB  r  rw  )r5  r  r  template_fusion_supportedr   s        rX   can_fuse_verticalCppScheduling.can_fuse_vertical  s    +Sw,(% ))++I0II**58UASASAU=U=..u<	=rW   r  c                   ^^^^^ [        S U 5       5      (       a  U$ SmSmSnSnSnSnU GH  n[        UR                  [        R                  5      (       d   eUR                  R                  5       u  pxnUR                  R                  5        GHW  u  mn	[        U	[        R                  5      (       d  M(  U	R                  [        5       GH  m[        U4S jUR                   5       5      (       a  TU:w  a  TnUS-  nUS:  a  Us  s  s  $ [        TR                  S   [        R                  R                  R                   5      (       d  M  TR                  S   UR                  ;   d  M  Tc  M  [#        UU4S jUR                  R                  5        5       5      (       d  M  TR                  S   S:  d  M  TR                  S   mTR                  S   mS	nUnGM     GMZ     GM     U(       d  U$ SmUUU4S
 jn
U H  nXe:X  d  M
  UR%                  U
S9  M     U H  nXe:w  d  M
  UR%                  TU
S9  M     U$ )a  
Apply loop split optimization.
When one of the indexing_exprs contains a division, we eliminate the division by splitting the loop
to avoid non-contiguous loads, subject to the following conditions:
    1. No reduction and no mudular index for all nodes.
    2. The indexing_exprs of all nodes contain only one (or more, but all the same) division,
       where the divisor is an integer and not too small (the divisor > 8), the dividend is
       one of the iter_vars, and this var, i.e. the dimension that needs to be split, is
       contiguous in all other indexing_exprs.

For example, if the node's var_ranges: {z0: 2, z1: 9216, z2: 960} and indexing_exprs:
{'index0': 8847360*z0 + 960*z1 + z2, 'index1': 32*z0 + (z2//30), 'index2': z2},
we will split z2 -> 30*z2 + z3, then the node's var_ranges will be changed to
{z0: 2, z1: 9216, z2: 32, z3: 30} and indexing_exprs will be changed to
{'index0': 8847360*z0 + 960*z1 + 30*z2 + z3, 'index1': 32*z0 + z2, 'index2': 30*z2 + z3}.
c              3      #    U  H^  n[        UR                  S    S    5      S:g  =(       d4    [        S UR                  R                  R                  5        5       5      v   M`     g7f)r5   r   c              3   J   #    U  H  oR                  [        5      v   M     g 7fr	  )r   r   )r  r  s     rX   r   9CppScheduling.try_loop_split.<locals>.<genexpr>.<genexpr>  s      6Xd))6Xs   !#N)r   r  rg   r  rk  r  r  s     rX   r   /CppScheduling.try_loop_split.<locals>.<genexpr>  sd      

 	 

1a !Q&  6:jj6O6O6V6V6X  s   A&A(Nr   Fc              3   F   >#    U  H  nTR                  U5      v   M     g 7fr	  )r   )r  r   div_exprs     rX   r   r    s     Q9P#HLL--9Pr*  r5   c              3   p   >#    U  H+  u  pUT:w  d  M  [        UTR                  S    5      S;   v   M-     g7f)r   r=  N)r
  r  )r  name_expr_r  r   s      rX   r   r    s:       0T$} S/x}}Q7GHFR0Ts   6#6r>  Tc                   > U u  p4Uu  pVUR                  T5      nUR                  5       nX7   T-  X'   UR                  US-   T5        [        R                  " XSS9u  u  pnU	R                  5       nUR                  US-   5      nTX   -  U-   X'   [        R                  " XU/XU5      nT(       d/  UR                  [        UR                  R                  5       5      4mX4UX44$ )Nr5   r1  )r  )r   copyinsertr
   index_vars_no_squeezepopr   r   r  r)  rk  r  )r.  r  r  
index_sizereduce_sizer  reduce_vars	split_idxnew_index_sizenew_index_varsr   r  	iter_varsdivisor_varrr  split_number	split_vars                 rX   
loop_split0CppScheduling.try_loop_split.<locals>.loop_split  s   &+#J&*#J"((3I'__.N(2(=(MN%!!)a->.:.P.PC/+^ '++-I#--	A6K#/)2F#F#TI ;;+.
KD .OO,,3356.*
  -- rW   )recompute_sizes_body_func)rr  r  )rg   r   r  r   ri  rj  rk  r  r   r  findr   r  r  corenumbersrY  r(  rv  )r5  r  num_div	div_expr_	match_divmatched_noder  r   original_bodyr  r  r  rr  r   r  r  s              @@@@@rX   try_loop_splitCppScheduling.try_loop_split  s   &  

 
 
 
 L			Ddii):):;;;;"&))"B"B"DAa+::@@B
d!$

33 $		( 3HQ9P9PQQQ$	1$,	1{$"8==#3UZZ5G5G5O5OPP$MM!,0G0GG ,  0=0L0L0R0R0T   
 %MM!,q0$,MM!$4	'/}}Q'7$(	'+/ !4 C @ L%)"	: D#,,z,R  D#,,/I.8 -   rW   r  c                 ~  ^ ^^^	 T R                   m[        R                  n/ m/ m	[        U[        5      (       d   eS[        4UUU	U 4S jjnU" U5      (       d  U[        l        TR                  5         T	R                  5         [        R                  R                  R                  SS9   UR                  5        Hc  n[        U[        [        45      (       d   eUR                  5       nT R                  T5      nUR                  U5        TR!                  Xe5        Me     SSS5        gg! , (       d  f       g= f)z
Generate the code for the outer loop fused scheduler node.
1. Codegen with fused outer loop: depends on the analysis of
    the outer loop fused scheduler node, with or without the local buffer.
2. If failed, fallback to standard codegen.
r  c           	      
	  >^ ^^^^^ [        T [        5      (       d   eTR                  5         TR                  5         S[        4S jm/ n0 m[	        UU 4S jT R                  5        5       5      (       Ga  [        5       mT R                  5        GH  m[        T[        5      (       d   eTR                  TR                  5       5        TR                  5       (       d  [        TR                  5       5      S:w  a  Mn  TR                  5       S   m[	        U 4S jTR                   5       5      (       d  M  TR                  n[        U[         R"                  5      (       d   eUR%                  5       nT R&                  [        T" T5      5      -
  nUU4S jnUR)                  5       (       a  U" 5       (       d  GM-  [         R*                  " UR,                  UR.                  UR0                  US UR2                  US 5      nUU4S	 jnS
nU" Xa5      n	U	(       dC  [         R4                  " U S[        U5       3US9n	UR7                  U	5        / TU	R8                  '   TU	R8                     R7                  U5        GM     [;        TR<                  5       n
[        U5      S:  a7  U H1  nUR8                  c   eU
R?                  UTUR8                     5        M3     T R                  5        H  n[        U[@        [        45      (       d   eTRC                  T5      nURE                  UR                  5       5        TR7                  U5        TR7                  UR                  5       5        M     T RG                  TT R&                  5      (       dF  U
RH                   H,  n[J        RL                  RH                  RO                  U5        M.      SSS5        g[P        RR                  R7                  [P        RT                  " [        T5      [        U
RV                  5      S95        T RY                  T5      nTR[                  U/ [\        R^                  Ra                  T5      Q5        SSS5        g! , (       d  f       g= f)z6
Codegen code with fused outer loop and local Buffer.
r  c                     [        U [        [        45      (       d   eU R                  5       n[	        US S9R
                  u  nu  p4[        U5      [        U5      -   nU$ )Nc                 4    [        U R                  5       5      $ r	  )rn   rB  r  s    rX   r  ~CppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges.<locals>.<lambda>"  s    Q^^-=)>rW   r0  )r   r#   r!   r2  r`   r  r   )r  r  r   r  r	  r9  s         rX   get_call_rangeslCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges  sa    !$8J(KLLLL-1^^-=.1>/% ,+E $ElU?-CC""rW   c              3   f   >#    U  H&  n[        T" U5      5      TR                  S -   :H  v   M(     g7f)r5   N)r   r,  )r  r7  r  r  s     rX   r   fCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>*  s2      3E OE*+t/K/Ka/OO3s   .1r5   r   c              3   \   >#    U  H!  oR                   TR                  5       ;   v   M#     g 7fr	  )r  r2  )r  r  r  s     rX   r   r  >  s#      BX$		T^^%55BXs   ),c                  N  >^^ SmSn [        TR                  R                  R                  5       5       H  u  pTX-  -  mX-  n M     TR                  R	                  TR                  5       5      nU4S jmT" U5      =(       a     [        UU4S jTR                   5       5      $ )Nr   r5   c                    > U T:H  $ r	  rV   )r  contiguous_index_exprs    rX   is_contiguous_indexږCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.is_contiguous_indexT  s    '(,A'A ArW   c              3      >#    U  H_  n[        UR                  [        5      =(       a9    T" UR                  R                  R	                  TR                  5       5      5      v   Ma     g 7fr	  )r   r  r#   r  get_read_exprr  )r  r  r  scheduler_buffers     rX   r   ڌCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.<genexpr>W  s^      Q -CD !+499m D !"$7$(IIOO$A$A(8(A(A(C%&%"!" -Cs   A'A*)r  r  r  r  get_write_exprr  r(  r  )r  r   rR  write_index_exprr  r  r  r  s       @@rX   is_all_write_read_contiguousyCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguousH  s    451%&F.6 . 4 4 ? ? E E G/
 !6 E 5 &	/
 0>/C/C/R/R 0 9 9 ;0,B $77G#H $S Q -=,B,BQ N rW   Nc                    > U H?  nXR                   :X  d  M  [        U4S jTUR                      5       5      (       d  M=  Us  $    g )Nc              3      >#    U  Hb  nUR                   c  M  [        U4S j[        R                  R                  R
                  UR                      R                   5       5      v   Md     g 7f)Nc              3   ^   >#    U  H"  nUR                   R                  5       T;   v   M$     g 7fr	  )r  r  )r  r  visited_scheduler_nodess     rX   r   ڐCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>.<genexpr>q  s/      (&50D )-		(:(:(<@W(W50s   *-)r   r(  r4   r  r'  name_to_bufr  )r  global_bufferr  s     rX   r   چCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>p  si      S":& (5'9'9%&C (&45GG4E4E4Q4Q,9,>,>5**/%50(& %& %&:&s   A-AA-)r  r(  r   )local_buffer_layoutr  	local_buflocal_to_global_buffersr  s      rX   try_share_local_buffersCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffern  sR    -:	#6:J:J#Js S" :Q(1:&S" P" P" ,5$4 .; $(rW   local_buffer_datar   )r   r  F)local_buffer_numberT)1r   r  clearr   r(  r*  r   r2  r#   r  r  rB  r   get_outputsr  r  r   ri  r  r,  is_contiguousFixedLayoutrN  r   rC  r  Bufferr  r   rK   r  add_local_bufferr!   rM  r%  r^  r  r4   r  remover   !cpp_outer_loop_fused_inner_countsCppOuterLoopFusedCountr  rj  finalize_kernelra  rb  from_iterable)r  r  r  global_buffer_layoutsize_offsetr  r  r  local_buf_prefixlocal_buffer_usedscoper  r7  r\  removed_bufferouter_fusion_cpp_kernel_proxyr  r  r  r  r  rZ  rb  
nodes_listr5  s   `               @@@@@rX   $try_outer_loop_fusion_with_local_bufSCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf  s    d$?@@@@!'')#&7 # .0MBD# !113   <F<'&*nn&6N%nmDDDD+//0G0G0IJ&3355~99;<A '5'A'A'CA'F$ BRBXBX   )9(=(=)-9J9JKKKK/</G/G/I,&*&B&BS+N;F '4 1>>@@ < > >$.0nn077066055klC077E	/+(" ,?(,B/-)  102		(8'93};M:N%O':1- *001BCNP34E4J4JK/0A0F0FGNN)s '7z $L$5$56%}%)(5+00<<<..(*A,BSBS*T )6
 "113E%e.@--PQQQQ'+'<'<\'J$$225??3DE)001AB%%eoo&78 4 >>)4+G+G  +0*?*? //66~F +@ !+ 76, 99@@2212,/0C0C,D 150O0O)1- ,,1@ioo33J?@? 7H I 76H s   D9Q4BQ44
RFr  N)rb  r   r  r   r  r  r   r  r   r  r*  r!   r#   r2  rM  r%  r  )
r5  r  r  r  r7  _nodesr\  rZ  rb  r  s
   `      @@@rX   codegen_outer_loop_node%CppScheduling.codegen_outer_loop_node  s    (()0)O)O&=?02
$ ;<<<<^	7R ^	 ^	@ 4D995SG2!'') ''--e-D!113E%e.@--PQQQQ27//2CF'+'<'<\'J$$226: 001AJ 4 ED : EDs   ,A8D..
D<c                 ~   U R                   n[        U[        5      (       a  U R                  U5        OTUR	                  5       nU R                  U5      nU R                  U5      nUR                  U5        UR                  XC5        U R                  5       nU[        R                  :  a  U R                  S5        gg)z3
Turn an set of pre-fused nodes into a C++ kernel.
TN)rb  r   r  r  r2  r  rM  r%  r  _get_scheduled_num_argsrL  MAX_FUSED_KERNEL_ARGS_NUMr[  )r5  r  rb  r  r\  args_nums         rX   codegen_nodeCppScheduling.codegen_node  s     ((d788((.)-)9E''.E#44\B**51(()9A//1m===""4( >rW   c                 x    [        U[        5      =(       a$    [        UR                  [        R                  5      $ r	  )r   r#   r  r   CppTemplateBuffer)r5  r  s     rX   is_cpp_templateCppScheduling.is_cpp_template  s,    $. 
:IIr++4
 	
rW   template_nodeepilogue_nodesprologue_nodesc                 <   U(       a   eU Vs/ s H"  n[        U[        [        45      (       d  M   UPM$     nn[        S   S==   S-  ss'   [        S   S==   [	        U5      -  ss'   U R                  U5      (       d   S5       e[        [        U5      nUR                  u  nu  pVUS:X  d   e[        [        R                  UR                  5      nU Vs/ s H  oR                  PM     n	n[        S U	 5       5      (       d   S5       eS	 n
U
" XqR                  U	5      nUR                  UUU	S
9u  pU   [        UR                  5      (       d  UR                  5         U H  nUR                  5         M     U" 5       nSSS5        [         R"                  " U5         U/UQnU R%                  WUUR&                  5      nSSS5        [        UR                  5      (       a  [	        UR(                  5      S:X  d   S5       eUR(                  S   R*                   H}  n[        UR                  [,        5      (       d   S5       e[        UR                  R                  [        R.                  5      (       d   S5       eUR                  R                  5         M     UR1                  WU5        [         R2                  =R4                  UR4                  -  sl        U R7                  5         gs  snf s  snf ! , (       d  f       GN~= f! , (       d  f       GNO= f)z7
Codegen a CPP template, possibly with fused epilogues
inductorcpp_templated_kernel_counterr5   cpp_epilogue_fusion_counterzlTemplate node passed to CppScheduler.codegen_template must be a SchedulerNode that wraps a CppTemplateBufferrV   c              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7fr	  )r   r   ri  )r  r  s     rX   r   1CppScheduling.codegen_template.<locals>.<genexpr>   s"     O=N:a!2!233=Nr\  z9Epilogue nodes must all be instances of ir.ComputedBufferc                    ^ T(       d  gU R                  5       U;   d   eXR                  5          R                  n[        U4S jU 5       5      (       + $ )NFc              3      >#    U  H?  n[        UR                  [        5      =(       a    UR                  R                  T;   v   MA     g 7fr	  )r   r  r   )r  r  r)  s     rX   r   ZCppScheduling.codegen_template.<locals>.template_buffer_has_other_users.<locals>.<genexpr>  s@       "D 499&78 5IINNn45!s   AA
)r  r  r(  )template_bufferoutputs_by_namer)  r  s     ` rX   template_buffer_has_other_usersGCppScheduling.codegen_template.<locals>.template_buffer_has_other_users  sZ     ""++-@@@#$<$<$>?EEE  "   rW   )$flag_template_buffer_has_other_usersr)  NzSMulti outputs template should be with 1 output template buffer of MultiOutputLayoutr   z?Multi outputs template should be with ExternKernelSchedulerNodez7Multi outputs template has multi users with MultiOutput)r   r#   r!   r   r   r&  r   r  r   r%  r  r(  r5  make_kernel_renderr(   r  r4   set_kernel_handlerdefine_kernelr  outputsr  r   r  call_kernelr  r  free_buffers_in_scheduler)r5  r(  r)  r*  epilogue_noder   rnumelctbr  epilogue_ir_nodesr6  r8  rf  renderr  src_codenode_schedulekernel_namer  s                      rX   codegen_templateCppScheduling.codegen_template  s    "!!
 "0
!/--9K)LM !/ 	 
 	;<A<:;s>?RR;##M22 	
z	
2 ]M:&,,;A||$()=)=}?Q?Q$R*;
*qFFN 	 ;
 O=NOOO 	
G	
O	 0O..0A0
, //1U, 0 

 ,]-?-?@@&&(& 'xH  !!&)*<^<M,,X}fkkRK * %]%7%788 },,-2 e2 &--a066!$))-FGG UG "$)).."..AA MA 		""$ 7 	;,	6#9#99&&(S
 ;
: V *)s)   K0K0K58AK:!#L:
L	
Lc                 6    U R                   R                  5       $ r	  )rb  get_num_argsr;  s    rX   r  %CppScheduling._get_scheduled_num_args8  s      --//rW   c                     U R                   $ r	  rZ  r;  s    rX   ready_to_flushCppScheduling.ready_to_flush;  s    ###rW   c                     g r	  rV   r;  s    rX   codegen_syncCppScheduling.codegen_sync>  s    rW   c                    [         R                  R                  n[        R                  R
                  (       a$  [        U[        R                  R
                  5      OSnSR                  SXTR                  5       /5      n[        R                  R                  (       a  [        X&5        [         R                  R                  (       a  UOSnUR                  [        [        R                   5      U5      nUR                  [        [        R"                  5      U5      nUR                  SS5      nUR%                  S5      nUR'                  SU5      n	[(        (       a  UR'                  SU	S	-   5      n	XU	S	-     S
3n
[+        5       nUc  U R,                  R.                  OUnUR1                  5       u    p[         R                  R                  (       d  UR3                  SU< S35        UR5                  USS9  [         R                  R                  (       d  UR3                  S5        UR7                  UUR9                  5       SU
S9  U$ )NrS   r   r  rf  z#pragma CMTz//z
extern "C"r   r5   z;
zasync_compile.cpp_pybinding(z, '''T)stripz''')F)gpucpp_definition)r4   r  wrapper_coder   r  descriptive_namesr&   r&  next_kernel_suffixtraceenabledr,   cpp_wrapperr  rp   r+   KERNEL_NAMEDESCRIPTIVE_NAMErfindr  rU   r=   rb  r  cpp_argdefsr   rQ  r;  getvalue)r5  rD  r  kernel_argsr  
fused_namerF  kernel_decl_name
first_char	last_charkernel_definitioncompile_wrapperr  r   	arg_typess                  rX   r;  CppScheduling.define_kernelA  s   ''&& zz++ "%)E)EF 	
 hhz3M3M3OPQ<<3EG*+''*=*=;8##C(?(?$@BRS##C(D(D$E{S ##M48 ^^L1
MM#z2	; c9q=9I'Y]CDCH(*)4)<t  %%+**,1ww""%%(DYMQV&WXxt4ww""%%f-$$&,	 	 	
 rW   c                 :   U R                   R                  5       nU(       aY  U R                  XR                   R                  5      nU R                   R	                  [
        R                  R                  U5        U R                  5         U R                  S5        g rT  )
rb  codegen_groupr;  scheduled_nodesr=  r4   r  rV  rU  r[  )r5  rD  rF  s      rX   flushCppScheduling.flushm  st    $$224,,++;;K ))!''*>*>L!u%rW   )rV  rb  r	  )6r  r  r  r  r  rM  r  r  r   r   r6   INPLACE_BUFFERSREDUCE_TO_SINGLE_ELEMENTrP  rl  r   rN  rQ  r4  rl   r[  r`  rU  r-  r   rD  ru  r  r  r  r   r  rx  rw  r  r  r)  r#   r  r  r  r	   r!   r"  r&  r   rG  r  rM  rP  r;  rm  r  rm  rn  s   @rX   rL  rL  e  s    .<d>*; !$!**33	
 $%,, $:n;U $ $%& &N*P=dx8H/I 6p>	<
&
/@
	
4)l

=oD$7 ob~K)~K@)/1C]RS),
$5 
$ 

U)(U) !!23U) !!23	U)n0$*X& &rW   rL  c                   R   ^  \ rS rSrU 4S jrS rS rS rS
S\4S jjr	S r
S	rU =r$ )rc  ix  c                   > [         TU ]  5         [        5       U l        [	        5       U l        [        U R
                  5      U l        [        R                  " 5       U l
        U R                  R                  U R                  5        / U l        g r	  )r3  r4  r?   r  r7   
loops_codeWorkSharingr{  r   r   r   r   rl  r   s    rX   r4  KernelGroup.__init__y  s^    L	&.doo.))+


  )!rW   c                 :    U" U R                   [        5       /UQ76 $ r	  )r  r*   )r5  r+  r  s      rX   r  KernelGroup.new_kernel  s    49924<t<<rW   c                     U =R                   U-  sl         U R                  nU R                  nUR                  X45        g r	  )rl  rs  r{  r  )r5  r  r  r   r{  s        rX   r  KernelGroup.finalize_kernel  s3    %WW  *rW   c                 V    U R                   R                  5       u  pn[        U5      nU$ r	  )r  r_  r   )r5  arg_defs
_call_args
_arg_typesr!  s        rX   rJ  KernelGroup.get_num_args  s'    +/99+@+@+B(jx=rW   r   c           	      D   U R                   R                  5         U R                  (       d  g[        5       n[        R
                  R                  =(       a    [        R                  S;   nU(       a  UR                  S/5        UR                  S5        Uc  [        [        R                  5      OUnUc  [        [        R                  5      OUnU R                  R!                  5       u  n  nSR#                  S5      R%                  U5      n['        5       nUR                  SU SU S	U S
35        UR)                  5          U(       aH  [*        R,                  R.                  n	U	b  S[        U	5      -   S-   OSn
UR                  SX-    S3/5        U R                  R1                  5        H  u  pUR                  SU SU S35        M     UR3                  U R4                  5        S S S 5        UR7                  5       $ ! , (       d  f       UR7                  5       $ = f)NrS   )linuxrQ   z!#include <ATen/record_function.h>z+#include <torch/csrc/inductor/cpp_prefix.h>z,
   zextern "C" z void r  r   graph_r   zRECORD_FUNCTION("z#", c10::ArrayRef<c10::IValue>({}));r   r   r   )r   r  rl  r7   r   r  enable_kernel_profilesysplatformr   r   rp   r+   r\  r]  r  r_  ljustr&  rY   r   r4   r  graph_idaliasesrQ  rs  r`  )r5  r   r   r  rc  rF  r{  r   func_export_declr  r  oldnews                rX   rk  KernelGroup.codegen_group  s   

##~ !'

 @ @ !
S\\ V
 F
 !OO@ABDE <@<3{667T;?<c+667T..0!Q;;r?''113*+62B1C1XJaP	

 [[]$77++;C;OCM1C7UW+F,@+AAfg
 !II--/se3se156 0KK(  }} ] }}s   B&H
Hc                 `    U R                   R                  5       u  p4nUR                  X$SUS9  g )NF)tritonrh  )r  r_  generate_kernel_call)r5  r  rF  r   	call_argsrh  s         rX   r=  KernelGroup.call_kernel  s3    "&))"7"7"9i$$5I 	% 	
rW   )r  rs  rl  r   r{  r	  )r  r  r  r  r4  r  r  rJ  rp   rk  r=  r  rm  rn  s   @rX   rc  rc  x  s.    "=+
&# &P
 
rW   rc  c                   8    \ rS rSrS rS rS rS rS rS r	Sr
g	)
rt  i  c                 `    Xl         SU l        S U l        [        R                  " 5       U l        g rT  )r   in_parallelrK  r   r   r   )r5  r   s     rX   r4  WorkSharing.__init__  s'    	 ))+
rW   c                    U R                   (       a  XR                  :w  a  U R                  5         U R                   (       d  Xl        SU l         [        R                  R
                  (       a  U R                  R                  S5        OU R                  R                  SU S35        U R                  R                  U R                  R                  5       5        U R                  R                  S5        g g )NTz#pragma omp parallelz!#pragma omp parallel num_threads(r   zint tid = omp_get_thread_num();)r  rK  r  r   r  rS  r   r   r   r   r   )r5  r  s     rX   r  WorkSharing.parallel  s    +;+; ;JJL&#Dzz))		##$:;		##&GyPQ$RSJJ$$TYY%5%5%78II1  rW   c                 r    U R                   (       a  U R                  R                  S5        U R                   $ )Nz#pragma omp single)r  r   r   r;  s    rX   r  WorkSharing.single  s*    II 45rW   c                 F    U R                   R                  5         SU l        g rT  )r   r  r  r;  s    rX   r  WorkSharing.close  s    

 rW   c                 :    U R                   R                  5         U $ r	  )r   r{  r;  s    rX   r{  WorkSharing.__enter__  s    

rW   c                 <    U R                   R                  XU5        g r	  )r   r  r~  s       rX   r  WorkSharing.__exit__  s    

Hv6rW   )r   r  rK  r   N)r  r  r  r  r4  r  r  r  r{  r  r  rV   rW   rX   rt  rt    s     ,  
!7rW   rt  c                      \ rS rSr% Sr\\R                     \S'   Sr	\\R                     \S'   \R                  R                  r\R                  \S'   \R                  R                  r\R                  \S'   \R                  R                  r\R                  \S'   Sr\\S	'   S
r\\S'   S
r\\S'   S
r\\S'   S
r\\S'   S rS rS rSrg)r2  i  Nr   rC  rD  r  rE  r   r  Fsimd_ompsimd_vec	collapsedrB  c                 v    [         R                  " 5       nU(       a  UR                  5       U l        g SU l        g ra  )r   r1  rA  simd_nelements)r5  r|  s     rX   __post_init__LoopLevel.__post_init__  s-     .9-E-E-GAO>#;#;#=UVrW   c                    [         R                  " U5      n[        U R                  U R                  5      nX#l        SUl        [        UR                  U5      U-  Ul        U R                  Ul	        SUl
        U R                  Ul        U$ )NTF)r   rY  r2  r   rC  rE  r  r   r  r  r  rB  )r5  r  sympy_factorr  s       rX   r  LoopLevel.tile  sl    }}V,499-!
"499l;lJ --rW   c                    [        U R                  5      n[        U R                  5      n[        R                  R
                  (       a  X:X  a  g U R                  (       a   U R                  S:  a  SU R                   S3OSnU R                  (       aL  SnU R                  S:  a  USU R                   S3-  nU R                  (       a  UR                  SSU 35      nO[U R                  (       a  SnOGU R                  (       a  S	U 3nO0U R                  (       d  [        R                  " 5       (       a  S
nOSn[         SU R                   SU 3nU R                   SU 3nU R                   R"                  (       a%  U R                   S[        U R                   5       3nO;U R                   S[        U R                   5       S[        U R                   5       S3nSU SU SU S3nU R$                  (       d  U(       d  U/$ XH/$ )Nr5   zsimd simdlen(z) rS   z#pragma omp forz
 collapse(r   z for z#pragma omp z#pragma GCC ivdepr   rn  <rm  z+=(z == 0 ? 1 : zfor(r   )rE   rD  rC  r   r  no_redundant_loopsr  r  r  r  r  rB  r   ro  rJ   r   rE  r  r  )	r5  offset_expr	size_exprsimdline1
offset_strr  	steps_strline2s	            rX   r  LoopLevel.lines  s   !$++.		*	::(([-E }}!4!4q!8 D//03 	
 ==%E}}q :dmm_A66}}gtf~>]]E]]"4&)E""{'9'9';';'EE"|1TXXJa}=
hhZq,::88*B{4::'>&?@I
 88*CDJJ 78 9"4::./q2  zl"XJb1=>>7N~rW   )r  )r  r  r  r  r   r   r   r  r  rC  r   r   rD  r  OnerE  r  rn   r  rl   r  r  rB  r  r  r  r  rV   rW   rX   r2  r2    s     $C%**	$!%D(5::
%FEJJ% #WW\\J

)E5::#HcHdHdItL$
W	'rW   r2  c                       \ rS rSr% SrSr\\\      \	S'   Sr
\\   \	S'   \S\4S j5       rS r\S 5       rS	 rS
 rS\4S jrS rS\4S jrSrg)rQ  i=  a>  
A loop-nest-like structure. It is built with the `build` method
as a loop nest and then will perform loop-tiling at some depth.

A typical case is for vectorization, where we typically do loop-tiling
at the innermost loop level. A more complicated case is when we do
2D tiling at both the innermost and outer levels.
NrL  rf  c                 0   U R                   nU R                  nU R                  nUc   eSn[        [	        X5      5       HG  u  nu  pg[        Xg5      nU(       d  U/nOUR                  U5        XS:  d  M6  U R                  Ul        MI     [        U5      n	U	$ )z4Build a LoopNest with the given `kernel` as the leafN)	r  rX  r:  r   rR  r2  r  rB  rQ  )
rf  r  rX  r:  rL  loop_idxr   rC  r  rS  s
             rX   r  LoopNest.buildK  s     ?? 00***+/%.s8/D%E!HksS'DT"*$*$7$7! &F UO	rW   c                 ,    [        U R                  5      $ r	  )rl   rL  r;  s    rX   __bool__LoopNest.__bool__`  s    DJJrW   c                    U R                   c
  [        SSS9$ SnSnU R                   S   R                  n[        R                  " S5      nU R                    H=  nUR                  U:w  a    O,U[        UR                  UR                  5      -  nUS-  nM?     S nU" U R                   5      nU[        U R                   5      :  Ga$  [        U[        R                  5      (       Ga  [        U R                   U   R                  [        R                  5      (       a  US-  [        U R                   U   R                  U R                   U   R                  5      :  a  Ub#  X':  a  U R                   U   R                  (       dg  UnSnU R                   U   R                  n[        U[        U R                   5      5       H'  nU R                   U   R                  U:w  a    O	US-  nM)     [        X!S9$ )aY  
Maximal allowed depth for parallelism: All reduction or non-reduction levels.
When the range of the first inner loop beyond the maximum parallel depth is much
larger than the range of all outer loops within the maximum parallel depth,
change the starting depth of parallelism to the first inner loop and recalculate
the maximum parallel depth.
r   r  r5   c                 V    [        U 5       H  u  pUR                  (       d  M  Us  $    g r	  )r   r  )rL  r   r  s      rX   get_simd_vec_depth7LoopNest.max_parallel_depth.<locals>.get_simd_vec_depthy  s&    $U+===H , rW   rP  )rL  r  rB  r   rY  r   rC  rE  r   r   rR  )	r5  r  	max_depthrB  	num_stepsr  r  simd_vec_depthr   s	            rX   r  LoopNest.max_parallel_depthc  s    :: qAA	zz!}11MM!$	JJD  L0!HTYY

$CCINI	 	 ,DJJ7
 DJJ'9emm444::i055u}}EECtzz),114::i3H3N3NOP *.JJy)66 $KI::k2??L;DJJ8::a=--=Q	 9 IOOrW   c                    UR                   U R                  5       R                   ::  d   S5       eU R                  c   e[        U R                  5      UR                   :  d   eU R                  UR                     nUR                   Ul        UR                  (       a  [        =R                  S-  sl        [        UR                  S-   UR                   5       H  nSU R                  U   l
        M     g )Nz?Parallel depth cannot exceed the maximal allowed parallel depthr5   T)r  r  rL  r   r  r  rB  r   parallel_reduction_countrR  r  )r5  r  r  r   s       rX   r  LoopNest.mark_parallel  s    ''4+B+B+D+S+SS 	
M	
S zz%%%4::)":"::::zz)//0!00,,1,y,,q0)2J2JKA&*DJJqM# LrW   c                     U R                   (       d   eU R                   U   R                  U5      U R                   U'   U R                   U   $ )z
Do loop-tiling at the `depth` level with `factor`.
    for (x0 = 0; x0 < x0_end; x0++)
    ->
    for (x0 = 0; x0 < x0_end; x0 += factor)
See details in Note [tiled_size].
)rL  r  )r5  r  r  s      rX   r  LoopNest.tile  sA     zzz JJu-226:

5zz%  rW   r   c                 @    U R                   (       d   eU R                   $ r	  rf  r;  s    rX   r  LoopNest.get_kernel  s    {{{{{rW   c                     Xl         g r	  r  r*  s     rX   r  LoopNest.set_kernel  s    rW   levelc                     U R                   (       d   e[        U R                   5      U:  d   eU[        U R                   5      :X  a  S OU R                   US  n[        X R                  5      $ r	  )rL  r   rQ  rf  )r5  r  rL  s      rX   rd  LoopNest.from_loop_level  sT    zzz4::%'''TZZ0djj6H{{++rW   r  )r  r  r  r  r  rL  r   r)  r2  r  rf  r2  r  r  r  r$   r  r  r  r  r  rn   rd  r  rV   rW   rX   rQ  rQ  =  s     (,E8DO$+"&FHY&i  (  4P 4Pl+
!I ,S ,rW   rQ  r,  r	  )r   dataclassesrT  ra  r?  rV  r   r  r  collections.abcr   enumr   typingr   r   r   r   r	   r   r   torch.fxtorch._inductorr
   torch._prims_commonr   r   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   _dynamo.utilsr   rS   r   r   r   r   r   r(  r   r'  r   r   r   r    r!   r"   r#   utilsr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   virtualizedr1   r2   r3   r4   commonr6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   	cpp_utilsrB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   r  rU   cacherY   _logginggetArtifactLoggerr  schedule_logNATIVE_OMP_RTYPESRTYPE_TO_CPPr  PYTHON_TO_CPPCONTAINER_PYTHON_TO_CPPr  r  r   r  ro   rl   r  r  rP  r  r  r  rw   r)  r   r  rx   r   r   r   r   r   r  rp   r   rn   r   r   	lru_cacher   r  r
  	dataclassr  r  rp  r  r  _initialize_pointwise_overridesr  r(  r,  r2  r  r.  r   r  r  r  rc  rD  rL  rc  rt  r2  rQ  rV   rW   rX   <module>r     sB         	 
  $  7 7    ( @ / K K O O % < <        > =       & llg% : : ~~//*EBC   !   #&   
NN	MM 
MM	KK	NN	MM	JJ	KK	JJ	KK	KK		* T%++&  
KK	NN	MM	KK	JJ1 D- )D %)+) ELL!	+)\-
-jj- - 

	-
 jj- -`3#$  ;;	
 
sCxBV^ V3 V# V -UZZ -ell - - ;uzz ;

 ;PS ; ;| FJ!::!!LL!6>sm! !   ]!"4 ]!@! !B&k; k\  , ,U 3|7l |7~  / / 9  % % '7 7z zzW9 WtY
l Y
x)$ )$eHU[[4I44O.P )$XbK bKJa	
Y a	
H 
9  
FFt FP&N P&fD
 D
N%7 %7P R R Rj @, @, @,rW   