
    Thr                     d   S SK r S SKJrJr  S SKJr  \ R                  " \S5      r\ R                  " \S5      r\" 5       (       Gak  S SK	r	S SK	J
r  \	R                    SES j5       r\	R                    SES	 j5       r\	R                      SFS
 j5       r\	R                      SFS j5       r\	R"                  " \	R$                  " SS0SSS9\	R$                  " SS0SSS9\	R$                  " SS0SSS9\	R$                  " SS0SSS9// S9\	R                    SES j5       5       r\	R"                  " \	R$                  " SS0SSS9\	R$                  " SS0SSS9\	R$                  " SS0SSS9\	R$                  " SS0SSS9// S9\	R                    SES j5       5       r\	R"                  " \	R$                  " SS0SSS9// S9\	R                    SES j5       5       r\	R"                  " \	R$                  " SSS.SSS9\	R$                  " SSS.SSS9\	R$                  " SSS.SSS9\	R$                  " SSS.SSS9// S9\	R                      SGS j5       5       rS r\	R"                  " \	R$                  " SS0SSS9\	R$                  " SS0SSS9// SSS\0S9\	R                    SES j5       5       r\	R                    SES  j5       r\	R                    SES! j5       r\	R                      SGS" j5       r\	R                    SES# j5       r\	R                      SGS$ j5       r\	R                    SES% j5       r\	R                    SES& j5       r\	R                    SES' j5       r \	R                    SES( j5       r!\	R                  S) 5       r"\	R                      SHS* j5       r#\	R                      SIS+ j5       r$\	R                      SJS, j5       r%\	R                      SJS- j5       r&\	R                  S\RN                  4S. j5       r(\	R                  S\RN                  4S/ j5       r)S S0K*J+r+J,r,  \	R                    SES1 j5       r-\	R                    SES2 j5       r.\	R                    SES3 j5       r/\	R                    SES4 j5       r0\	R                    SES5 j5       r1\	R"                  " \	R$                  " SSSSS6.SSS9\	R$                  " SSS7SS6.SSS9// S8QS9\	R                  S9\RN                  S:\RN                  S;\RN                  S<\RN                  4S= j5       5       r2\	R                  S\RN                  4S> j5       r3\	R                  S\RN                  4S? j5       r4\	R                  S\RN                  4S@ j5       r5\	R                  S\RN                  4SA j5       r6 SKSB\7\8   SC\94SD jjr:gg)L    N)HAS_CUDAHAS_GPU)
has_tritonzrequires cudazrequires gpu)language
BLOCK_SIZEc                    [         R                  " SS9nXT-  nU[         R                  " SU5      -   nXs:  n[         R                  " X-   US9n	[         R                  " X-   US9n
X-   n[         R                  " X'-   XS9  g Nr   axismasktl
program_idarangeloadstorein_ptr0in_ptr1out_ptr
n_elementsr   pidblock_startoffsetsr   xyoutputs               ^/var/www/fran/franai/venv/lib/python3.13/site-packages/torch/testing/_internal/triton_utils.py
add_kernelr       u     mm#&		!Z 88#GGG%D1GGG%D1
"F6    c                    [         R                  " SS9nXT-  nU[         R                  " SU5      -   nXs:  n[         R                  " X-   US9n	[         R                  " X-   US9n
X-
  n[         R                  " X'-   XS9  g r	   r   r   s               r   
sub_kernelr$   "   r!   r"   c                    [         R                  " SS9nXe-  nU[         R                  " SU5      -   nX:  n	[         R                  " X-   U	S9n
US:X  a  [         R                  " X-   U	S9nX-   nOU
n[         R                  " X(-   XS9  g Nr   r
   r   twor   )r   r   r   r   ARGS_PASSEDr   r   r   r   r   r   r   r   s                r   add_kernel_with_optional_paramr)   3   s     mm#&		!Z 88#GGG%D1%)5AUFF
"F6r"   c                     [         R                  " SS9nXv-  nU[         R                  " SU5      -   n	X:  n
[         R                  " X	U-  -   U
S9nUS:X  a  [         R                  " X-   U
S9nX-   nOUn[         R                  " X)U-  -   XS9  g r&   r   )r   r   r   r   strider(   r   r   r   r   r   r   r   r   s                 r   -add_kernel_with_none_param_and_equal_to_1_argr,   H   s     mm#&		!Z 88#GGG..T:%)5AUFF
V++V?r"            )
num_stages	num_warps   @   )configskeyc                    [         R                  " SS9nXT-  nU[         R                  " SU5      -   nXs:  n[         R                  " X-   US9n	[         R                  " X-   US9n
X-   n[         R                  " X'-   XS9  g r	   r   r   s               r   add_kernel_autotunedr7   ^   u    " mm#&		!Z 88#GGG%D1GGG%D1
"F6r"   c                    [         R                  " SS9nXT-  nU[         R                  " SU5      -   nXs:  n[         R                  " X-   US9n	[         R                  " X-   US9n
X-
  n[         R                  " X'-   XS9  g r	   r   r   s               r   sub_kernel_autotunedr:   x   r8   r"         c                    [         R                  " SS9nXS-  nU[         R                  " SU5      -   nXr:  n[         R                  " X-   US9n	[         R                  " X-   US9n
X-   n[         R                  " XG-   XS9  g r	   r   )r   r   r   r   r   r   r   r   r   r   r   r   s               r   &add_kernel_autotuned_weird_param_orderr>      su      mm#&		!Z 88#GGG%D1GGG%D1
"F6r"   )BLOCK_SIZE_XBLOCK_SIZE_Yc                    [         R                  " S5      U-  nU[         R                  " SU5      S S 2S 4   -   nX:  n	[         R                  " S5      U-  n
U
[         R                  " SU5      S S S 24   -   nX:  nUnUn[         R                  " XX>-  -   -   X-  5      n[         R                  " XXM-  -   -   X-  5      nUU-   n[         R                  " X-X>-  -   -   UX-  5        g )Nr      r   )r   r   r   
x_elements
y_elementsr?   r@   xoffsetxindexxmaskyoffsetyindexymaskx1y0tmp0tmp1tmp2s                     r   add_kernel_2d_autotunedrP      s    6 --"\1299Q5ag>>#--"\1299Q5dAg>>#www
"895=Iwww
"895=Id{
*/23T5=Ir"   c                     U $ )N )r4   ___s      r   _dummy_early_config_prunerU      s    r"   
      early_config_prune)r4   r5   warmuprepprune_configs_byc                    [         R                  " SS9nXT-  nU[         R                  " SU5      -   nXs:  n[         R                  " X-   US9n	[         R                  " X-   US9n
X-   n[         R                  " X'-   XS9  g r	   r   r   s               r   *add_kernel_autotuned_with_unsupported_argsr]      su    $ mm#&		!Z 88#GGG%D1GGG%D1
"F6r"   c                    [         R                  " SS9nXe-  nU[         R                  " SU5      -   nX:  n	[         R                  " X-   U	S9n
[         R                  " X-   U	S9nX-   U-  n[         R                  " X(-   XS9  g r	   r   )r   r   r   r   scaling_factorr   r   r   r   r   r   r   r   s                r   add_kernel_with_scalingr`      sz     mm#&		!Z 88#GGG%D1GGG%D1%>)
"F6r"   c                    [         R                  " SS9nXC-  n[         R                  " U U/U/[         R                  5      n[         R                  " UU/U/[         R                  5      nXg-   n[         R                  " UUU/5        g Nr   r
   r   r   _experimental_descriptor_loadfloat32_experimental_descriptor_store	in_desc_ptr0in_desc_ptr1out_desc_ptrr   r   offsetabr   s	            r   add_kernel_with_tma_1d_old_apirn     s     mm#!,,HLJJ	
 ,,HLJJ	
 
))H	
r"   c                 B   [         R                  " SS9n[         R                  " SS9nXS-  nXd-  n[         R                  " U Xx/X4/[         R                  5      n	[         R                  " UXx/X4/[         R                  5      n
X-   n[         R                  " UUXx/5        g Nr   r
   rB   rc   rh   ri   rj   r?   r@   pid_xpid_yoffset_xoffset_yr   r   r   s               r   add_kernel_with_tma_2d_old_apirv   "  s     1%1%'',, (JJ	
 ,, (JJ	
 
)) 	
r"   c                     [         R                  " SS9nXC-  n[         R                  " U U/5      n[         R                  " UU/5      nXg-   n[         R                  " UU/U5        g rb   r   r   load_tensor_descriptorstore_tensor_descriptorrg   s	            r   add_kernel_with_tma_1d_new_apir{   D  sn     mm#!%%H
 %%H

 
""H	
r"   c                     [         R                  " SS9n[         R                  " SS9nXS-  nXd-  n[         R                  " U Xx/5      n	[         R                  " UXx/5      n
X-   n[         R                  " UXx/U5        g rp   rx   rq   s               r   add_kernel_with_tma_2d_new_apir}   _  s     1%1%''%% 
 %% 

 
"" 	
r"   c                    UnUS-   nUS-   n	[         R                  R                  R                  UU Xf/X4/U R                  R
                  S9  [         R                  R                  R                  UUXf/X4/UR                  R
                  S9  [         R                  R                  R                  U	UXf/X4/UR                  R
                  S9  [         R                  R                  R                  U5        [         R                  R                  R                  U5        [         R                  R                  R                  U	5        [         R                  " SS9n
[         R                  " SS9nX-  nX-  n[         R                  " UX/Xf/[         R                  5      n[         R                  " UX/Xf/[         R                  5      nX-   n[         R                  " U	UX/5        g )Nr-      )desc_ptrglobal_address	load_sizeglobal_size
element_tyr   r
   rB   )r   extracuda&experimental_device_tensormap_create2ddtyper   )experimental_tensormap_fenceproxy_acquirer   rd   re   rf   )a_ptrb_ptrc_ptrmn	workspacer   
a_desc_ptr
b_desc_ptr
c_desc_ptrrr   rs   rt   ru   rl   rm   r   s                    r    add_kernel_on_device_tma_old_apir   }  s    
_
_

<< !.{{-- 	= 	
 	<< !.{{-- 	= 	
 	<< !.{{-- 	= 	
 	??
K
??
K
??
K1%1%%% ,, $JJ	
 ,, $JJ	
  	)) 	
r"   c                    [         R                  " U X4/US/Xf/S9n[         R                  " UX4/US/Xf/S9n[         R                  " UX4/US/Xf/S9n	[         R                  " SS9n
[         R                  " SS9nX-  nX-  n[         R                  " UX/5      n[         R                  " UX/5      nX-   n[         R                  " U	X/U5        g )NrB   )baseshapestridesblock_shaper   r
   )r   make_tensor_descriptorr   ry   rz   )r   r   r   r   r   r   r   a_descb_descc_descrr   rs   rt   ru   rl   rm   r   s                    r    add_kernel_on_device_tma_new_apir     s     **&F#0	
 **&F#0	
 **&F#0	
 1%1%%% %% 
 %% 
  	"" 	
r"   c                     [         R                  " SS9nXC-  nU[         R                  " SU5      -   nXb:  n[         R                  " X-   US9nSU-  n	[         R                  " X-   XS9  g Nr   r
   r   r<   r   )
r   r   r   r   r   r   r   r   r   r   s
             r   mul2_kernelr     sd     mm#&		!Z 88#GGG%D1Q
"F6r"   c                     [         R                  " SS9nX2-  nU[         R                  " SU5      -   nXQ:  n[         R                  " X-   US9nSU-  n[         R                  " X-   XS9  g r   r   )	ptrr   r   r   r   r   r   r   r   s	            r   mul2_inplace_kernelr     sb     mm#&		!Z 88#GGCM-Q
2r"   c                 8    [         R                  " U S:  U S5      $ )Nr   )r   where)r   s    r   	zero_negsr     s    xxQ1%%r"   c                    [         R                  " SS9nXS-  nU[         R                  " SU5      -   nXr:  nUS:X  a  [        XUS9  OUS:X  a  [	        X XUS9  [         R
                  " X-   US9n	[         R                  " X-   XS9  g )Nr   r
   r   )r   r    r   )r   r   r   r   r    r   r   )
r   r   r   r   
ACTIVATIONr   r   r   r   r   s
             r   indirection_kernelr     s     mm#&		!Z 88#..
K<'wTGGG%D1
"A1r"   c                    [         R                  " SS9n[         R                  " SS9nXd-  nXu-  n	U[         R                  " SU5      -   n
U	[         R                  " SU5      -   nUS S 2S 4   U-  U
S S S 24   -   nUS S 2S 4   U-  U
S S S 24   -   n[         R                  " X-   5      n[         R                  " X-   US-  5        g )Nr   r
   rB   g       @r   )in_ptrr   in_y_strideout_y_strideX_BLOCK_SIZEY_BLOCK_SIZExidyidx_starty_start	x_offsets	y_offsetssrc_offsetsdst_offsetssrcs                  r   double_strided_kernelr   ,  s     mm#mm#$$bii<88	bii<88	4(;6479KK4(<7)D!G:LLggf*+
&c	2r"   c           	         [         R                  " U [         R                  " SU5      -   5      n[         R                  " U[         R                  " SU5      -   5      n[         R                  " U/U[         R                  5      n[         R
                  " SSXVU/[         R                  SSS9n[         R                  " U[         R                  " SU5      -   U5        g )Nr   shf.l.wrap.b32 $0, $1, $2, $3;
=r,r, r, rTrB   r   is_purepackr   r   r   fullint32inline_asm_elementwiser   	XYZr   BLOCKr   r   szs	            r   inline_asm_kernel_is_pure_truer   @  s     GGA		!U++,GGA		!U++,GGUGQ)%%,1I((
 	RYYq%((!,r"   c           	         [         R                  " U [         R                  " SU5      -   5      n[         R                  " U[         R                  " SU5      -   5      n[         R                  " U/U[         R                  5      n[         R
                  " SSXVU/[         R                  SSS9n[         R                  " U[         R                  " SU5      -   U5        g )Nr   r   r   FrB   r   r   r   s	            r   inline_asm_kernel_is_pure_falser   Q  s     GGA		!U++,GGA		!U++,GGUGQ)%%,1I((
 	RYYq%((!,r"   c                 j   [         R                  " SS9nXT-  n[         R                  " [         R                  " U U/S/U/U/S/S9S/S9n[         R                  " [         R                  " UU/S/U/U/S/S9S/S9nXx-   n	[         R                  " [         R                  " UU/S/U/U/S/S9U	S/S9  g Nr   r
   rB   )r   r   r   r   r   order)boundary_checkr   r   r   make_block_ptrr   )
x_ptry_ptr
output_ptrr   r   r   r   r   r   r   s
             r   add_kernel_with_block_ptrr   b  s     mm#&GG!l$'Lc 3

 GG!l$'Lc 3

 
!l$'Lc 3	
r"   c                    [         R                  " SS9nXC-  n[         R                  " [         R                  " U US/SS/US/US/SS/S9S/S9nUn[         R                  " [         R                  " UUS/SS/US/US/SS/S9US/S9  g r   r   )r   r   r   r   r   r   r   r   s           r   kernel_with_block_ptr_2dr     s     mm#&GG!1oA$a('O!f 3

 
!1oA$a('O!f 3	
r"   )r   r   c                     [         R                  " SS9nXT-  nU[         R                  " SU5      -   nXs:  n[        X-   US9n	[        X-   US9n
X-   n[	        X'-   XS9  g r	   r   r   s               r   add_kernel_with_importr     si     mm#&		!Z 88#".".g3r"   c                 @   [         R                  " SS9nXT-  nU[         R                  " SU5      -   nXs:  n[         R                  " X-   US9n	[         R                  " X-   US9n
[         R                  " S5      S:X  a  X-   nOX-  n[         R                  " X'-   XS9  g r	   r   r   s               r   cond_op_kernelr     s     mm#&		!Z 88#GGG%D1GGG%D1==q UFUF
"F6r"   c                    [         R                  " SS9nXT-  nU[         R                  " SU5      -   nXs:  n[         R                  " X-   US9n	[         R                  " X-   US9n
X-   n[         R                  " X'-   XS9  g r	   )r   r   r   r   
atomic_addr   s               r   atomic_add_kernelr     su     mm#&		!Z 88#GGG%D1GGG%D1
g';r"   c                    [         R                  " SS9nXT-  nU[         R                  " SU5      -   nXs:  n[         R                  " X-   US9n	[         R                  " X-   US9n
[	        S5       H  nX-   n[         R
                  " X'-   XS9  M      SnUS:  a)  US-  nX-   n[         R
                  " X'-   XS9  US:  a  M(  g g )Nr   r
   r   r<   rB   )r   r   r   r   ranger   )r   r   r   r   r   r   r   r   r   r   r   ir   s                r   add_4_times_kernelr     s     mm#&		!Z 88#GGG%D1GGG%D1qAUFHHW&:  !eFAUFHHW&: !er"   c                    [         R                  " SS9nXT-  nU[         R                  " SU5      -   nXr:  n[         R                  " X-   US9n	[         R                  " X-   US9n
X-   n[         R                  " X7-   XS9  g r	   r   )r   r   r   r   r   r   r   r   r   r   r   r   s               r   add_kernel_out_of_order_fn2r     r!   r"   )BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KGROUP_SIZE_M    )M_ptrNKr   r   r   r   c
                    [         R                  " SS9n
[         R                  " U5      nUS:X  a	  US:  a  SnOUS:X  a  g [         R                  " X5      n[         R                  " XG5      nX-  nX-  nX-  n[	        UU-
  U	5      nUX-  U-  -   nX-  U-  nUU-  [         R
                  " SU5      -   U-  nUU-  [         R
                  " SU5      -   U-  n[         R
                  " SU5      nU US S 2S 4   US S S 24   -   -   nUUS S 2S 4   US S S 24   -   -   n[         R                  " Xg4[         R                  S9n[        S[         R                  " XX5      5       Hq  n[         R                  " UUS S S 24   UUU-  -
  :  SS9n[         R                  " UUS S 2S 4   UUU-  -
  :  SS9n[         R                  " UUU5      nUU-  nUU-  nMs     UR                  [         R                  5      nUU-  [         R
                  " SU5      -   nUU-  [         R
                  " SU5      -   nUUS S 2S 4   -   US S S 24   -   n US S 2S 4   U:  US S S 24   U:  -  n![         R                  " U UU!S9  g )	Nr   r
   r   i   r   g        )r   otherr   )r   r   r   cdivminr   zerosre   r   dottofloat16r   )"r   r   r   r   r   r   r   r   r   r   r   M	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mgroup_size_mpid_mpid_noffs_amoffs_bnoffs_ka_ptrsb_ptrsaccumulatorkrl   rm   coffs_cmoffs_cnc_ptrsc_masks"                                     r   strange_config_matmul_kernelr    sz   N mm#GGEN6lR'A!VGGA,	GGA,	'3*-9{2LA 6,FG'L8<'"))A|*DDI<'"))A|*DDI1l+'!T'*VD!G_<=&D/GD!G,<<=hh;2::Nq"''!23AVD!G_q1|;K7K%KSVWAVAtG_q1|;K7K%KSVWA&&A{3Kl"Fl"F 4 NN2::&,&1l)CC,&1l)CCD))GD!G,<<!T'"Q&747+;a+?@
(r"   c                     [         R                  " SS9n[         R                  " SU5      X2-  -   n[         R                  " U/S[         R                  S9n[         R
                  " X-   XTU:  S9  g)zq
This kernel contains a triple-quote docstring w/ double quotes.
Make sure that codegen sanitizes the docstring.
r   r
         ?r   r   Nr   r   r   r   re   r   r   numelr   r   r   oness         r   #kernel_with_docstring_double_quotesr  _  sW     mm#))Az*S-==ww
|S

;
"D?r"   c                     [         R                  " SS9n[         R                  " SU5      X2-  -   n[         R                  " U/S[         R                  S9n[         R
                  " X-   XTU:  S9  g)z
This kernel contains a triple-quote docstring w/ single quotes
Make sure that codegen sanitizes the docstring.
To prevent it from being linted to double quotes: """!!!"""
r   r
   r  r   r   Nr  r  s         r   #kernel_with_docstring_single_quotesr  j  sW     mm#))Az*S-==ww
|S

;
"D?r"   c           	         [         R                  " SS9n[         R                  " SU5      XC-  -   n[         R                  " X-   XR:  S9n[         R                  " SSU/[         R
                  SSS9n[         R                  " X-   XuU:  S9  g )	Nr   r
   r   z{
            {
                cos.approx.f32 $0, $1;
                ex2.approx.f32 $0, $0;
            }
                =r, rTrB   asmconstraintsargsr   r   r   r   r   r   r   r   re   r   r   r   r  r   r   r   datacos_pows           r   kernel_inline_asm_double_quotesr!  v  s     mm#))Az*S-==wwv'go>++ !**
 	"GE/Br"   c           	         [         R                  " SS9n[         R                  " SU5      XC-  -   n[         R                  " X-   XR:  S9n[         R                  " SSU/[         R
                  SSS9n[         R                  " X-   XuU:  S9  g )	Nr   r
   r   z
            {
                // double quotes to pacify the linter """!!!"""
                cos.approx.f32 $0, $1;
                ex2.approx.f32 $0, $0;
            }
                r  TrB   r  r  r  s           r   kernel_inline_asm_single_quotesr#    s     mm#))Az*S-==wwv'go>++ !**
 	"GE/Br"   block_sizesnew_apic           	      F   U(       a3  [         R                  R                  R                  R	                  X5      $ [        U5      S:X  aZ  [         R                  R                  R                  U R                  5       U R                  S5      US   U R                  5       5      $ [        U5      S:X  d   e[         R                  R                  R                  U R                  5       U R                  S5      U R                  S5      US   US   U R                  5       5      $ )NrB   r   r<   )tritontoolstensor_descriptorTensorDescriptorfrom_tensorlenexperimental_descriptorcreate_1d_tma_descriptordata_ptrsizeelement_sizecreate_2d_tma_descriptor)tensorr$  r%  s      r   create_tensor_descriptor_shimr4    s     <<11BBNN  ;1$||;;TTOO%KKNN'')	  ;'1,,,||;;TTOO%KKNKKNNN'') r"   )r   tl.constexpr)r(   r5  r   r5  )r?   r5  r@   r5  )r   r5  r   r5  )r   r5  r   r5  )r   r5  r   r5  )T);unittest&torch.testing._internal.inductor_utilsr   r   torch.utils._tritonr   
skipUnlessrequires_cudarequires_gpur'  r   r   jitr    r$   r)   r,   autotuneConfigr7   r:   r>   rP   rU   r]   r`   rn   rv   r{   r}   r   r   r   r   r   r   r   r   r   	constexprr   r   triton.languager   r   r   r   r   r   r   r  r  r  r!  r#  listintboolr4  rR   r"   r   <module>rD     s    D * ##Ho>""7N;<<% ZZ7
 #7 7  ZZ7
 #7 7  ZZ7
 $7 #7 7( ZZ@ $@ #@ @* __MM<-!qIMM<-!qIMM<,aHMM<,aH	
  ZZ7
 #7 7  __MM<-!qIMM<-!qIMM<,aHMM<,aH	
  ZZ7
 #7 7  __MM<,aH
 	 ZZ7 #	7 7$ __MM!$c:qTU MM!$c:qTU MM!#R8QRS MM!#R8QRS
 " ZZJ %J %J #$J, __MM<-!qIMM<,aH
 .0IJ	 ZZ7
 #7 	7  ZZ7 #7 7" ZZ
 #	
 
< ZZ
 %	

 %
 
B ZZ
 #	
 
4 ZZ
 %	

 %
 
: ZZA
 #A
 A
F ZZ4
 #4
 4
l ZZ7 #	7 7 ZZ3 #3 3 ZZ& & ZZ2 #	2
 #2 2$ ZZ3
 %3 %3 3& ZZ-"-+9- -  ZZ-"-+9- -  ZZ+

 LL+
 +
Z ZZ
 LL	
 
B ,ZZ4
 #4 4  ZZ7
 #7 7& ZZ<
 #< <  ZZ;
 #; ;, ZZ7
 #7 7  __MM$&$&$&$%	 	 MM$'$&$&$%	 	
,  /2 ZZ1) ll1) ll1) ll1) ll1) 341)f ZZ@ @ @ ZZ	@ 	@ 	@ ZZC,.LLC C* ZZC,.LLC C0 9=!#Y15q r"   