
    7hÓ                       S SK Jr  S SKrS SKrS SKrS SKrS SKJr  S SKJ	r	J
r
Jr  S SKrS SKJr  S SKrS SKJr  S SKJr  S SKJrJr  S S	KJr  S
SKJrJrJr  S
SKJrJ r J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(  SSK)J*r*J+r+J,r,  \(       a  S SKJ-r-  S
SK.J/r/J0r0  S
SK1J2r2J3r3  SSK"J4r4  \Rj                  " \65      r7\Rp                  S\Rr                  S\Rt                  S\Rv                  S\Rx                  S\Rz                  S\R|                  S\R~                  S\R                  S0	rAS'S jrB " S S\5      rC " S  S!\'5      rD\DR                  S"5        \DR                  5          " S# S$\+5      rG " S% S&\,5      rHg)(    )annotationsN)Path)AnyOptionalTYPE_CHECKING)
PRECEDENCE)_embed_headers)
OrderedSet)
CppPrinterExprPrinter)ValueRanges   )ceildivget_bounds_index_exprget_kernel_metadata)ops
OpsWrapperV   )CSEVariableDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferOpOverridesPythonPrinter)IterationRangesEntry
SIMDKernelSIMDScheduling)Union)ReductionType	StoreMode)	SchedulerSchedulerNode)OpVarTboolcharshortintlongucharfloathalfbfloatc                    [        U [        5      (       a<  U [        R                  :X  a  gU [        R                  * :X  a  gX :w  a  g[	        U 5      $ [        U [
        5      (       a  U (       a  S$ S$ [	        U 5      $ )N	HUGE_VALFz
-HUGE_VALFNANtruefalse)
isinstancer+   torchinfstrr%   )vals    U/var/www/fran/franai/venv/lib/python3.13/site-packages/torch/_inductor/codegen/mps.pyvalue_to_metalr9   8   sc    #u%))UYYJZ3x	C		v)')s8O    c                      \ rS rSrSrSS jrSS jrSS jrSS jrSS jr	SS jr
SS	 jrSS
 jrSS jrSS jrSS jr\rSS jrSS jrSrg)MetalExprPrinterF   z/Converts sympy expression to Metal code snippetc                    UR                   u  p#U R                  U5      nU R                  U5      nUR                  (       a	  SU SU S3$ SU SU S3$ )Nc10::metal::floor_divide(, )metal::floor() / (argsdoprint
is_integer)selfexprxdivs       r8   _print_FloorDiv MetalExprPrinter._print_FloorDivI   sY    LLOll3??.qcC5::qcse1--r:   c                    UR                   u  p#nU R                  U5      nUS:w  a5  U R                  U5      nUR                  (       a
  SU SU S3nO	SU SU S3nU R                  U5      nSU SU S3$ )Nr   (rC   rA   rB   z) % (rD   )rH   rI   rJ   rK   mods        r8   _print_ModularIndexing'MetalExprPrinter._print_ModularIndexingQ   s    iiLLO!8,,s#Cs%uA&#A3eC52ll31#U3%q!!r:   c                    [        UR                  5      S:w  a  [        S5      e[        U R                  UR                  5      u  p#SU SU SU S3nSU SU SU S3nSU SU S3$ )	Nr   z$metal::min only supported for 2 argsstatic_cast<decltype(+)>(rA   zmetal::min(r@   lenrE   RuntimeErrormap_printrH   rI   ab
typecast_a
typecast_bs         r8   
_print_MinMetalExprPrinter._print_Min]   }    tyy>QEFF4;;		*,QCq3qc;
,QCq3qc;
ZL:,a88r:   c                    [        UR                  5      S:w  a  [        S5      e[        U R                  UR                  5      u  p#SU SU SU S3nSU SU SU S3nSU SU S3$ )	Nr   z$metal::max only supported for 2 argsrT   rU   rV   rA   zmetal::max(r@   rW   r\   s         r8   
_print_MaxMetalExprPrinter._print_Maxe   rc   r:   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr   metal::abs(r   rA   rX   rE   r[   rH   rI   s     r8   
_print_AbsMetalExprPrinter._print_Absm   s9    499~"""T[[167q99r:   c                |    [        UR                  5      S:X  d   eSU R                  UR                  S   5       S3$ )Nr   zstatic_cast<long>(metal::rint(r   ))ri   rj   s     r8   _print_RoundToInt"MetalExprPrinter._print_RoundToIntq   s9    499~"""/DIIaL0I/J"MMr:   c                    [        UR                  5      S:X  d   eUR                  u  p#UR                  (       a  US:  d   e[        SU S35      eU R	                  U[
        S   5      nSU SU SU*  S	3$ )
Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulz!static_cast<float>(metal::rint(1e * z) * 1erA   )rX   rE   rG   
ValueErrorparenthesizer   )rH   rI   numberndigits
number_strs        r8   _print_RoundDecimal$MetalExprPrinter._print_RoundDecimalu   s    499~"""))Q;;abiajjkl  &&vz%/@A
27)3zl&RYQYPZZ[\\r:   c                l    UR                   u  p#SU R                  U5       SU R                  U5       S3$ )Nstatic_cast<float>(z) / static_cast<float>(rA   )rE   r[   )rH   rI   lhsrhss       r8   _print_IntTrueDiv"MetalExprPrinter._print_IntTrueDiv   s9    99$T[[%5$66MdkkZ]N^M__`aar:   c                    [        UR                  5      S:X  d   e[        U R                  UR                  5      u  p#SU SU S3$ )Nr   zmetal::pow(static_cast<float>(z), static_cast<float>(rn   )rX   rE   rZ   rF   )rH   rI   rJ   ys       r8   _print_PowByNatural$MetalExprPrinter._print_PowByNatural   sD    499~"""4<<+/s2H2NNr:   c                    [        UR                  5      S:X  d   eU R                  UR                  S   5      nSU S3$ )Nr   r   r}   rA   rX   rE   rF   rH   rI   rJ   s      r8   _print_ToFloatMetalExprPrinter._print_ToFloat   s=    499~"""LL1&$QCq))r:   c                    [        UR                  5      S:X  d   eU R                  UR                  S   5      nSU S3$ )Nr   r   z1static_cast<int>(metal::floor(static_cast<float>(z)))r   r   s      r8   _print_FloorToInt"MetalExprPrinter._print_FloorToInt   s=    499~"""LL1&B1#SIIr:   c                    [        UR                  5      S:X  d   eU R                  UR                  S   5      nSU S3$ )Nr   r   zstatic_cast<int>(metal::trunc(rn   r   r   s      r8   _print_TruncToInt"MetalExprPrinter._print_TruncToInt   s=    499~"""LL1&/s"55r:   c                    [        UR                  5      S:X  d   eU R                  UR                  S   5      nSU S3$ )Nr   r   zmetal::log2(rA   r   r   s      r8   _print_OpaqueUnaryFn_log2*MetalExprPrinter._print_OpaqueUnaryFn_log2   s=    499~"""LL1&aS""r:    N)rI   
sympy.Exprreturnr6   )__name__
__module____qualname____firstlineno____doc__rL   rQ   ra   re   rk   ro   rz   r   r   r   r   _print_floorr   r   __static_attributes__r   r:   r8   r<   r<   F   sR    9.
"99:N
]b
O
*
J
 %L6
#r:   r<   c                     \ rS rSrSr\  S2         S3S jj5       r\        S4S j5       r\S5S j5       r\S6S j5       r	\S7S j5       r
\S8S	 j5       r\S9S
 j5       r\S:S j5       r\S:S j5       r\S:S j5       r\S:S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S;S j5       r\S<S j5       r\S;S j5       r\S;S j5       r \S;S j5       r!\S;S  j5       r"\S;S! j5       r#\S:S" j5       r$\S;S# j5       r%\S;S$ j5       r&\S:S% j5       r'\S;S& j5       r(\S:S' j5       r)\S;S( j5       r*\S=S) j5       r+\S=S* j5       r,\          S>S+ j5       r-\S;S, j5       r.\S:S- j5       r/S?S. jr0S@S/ jr1\2SAS0 j5       r3S1r4g)BMetalOverrides   zXImplements Metal-specific overrides for ops. Base class emits Python-friendly overrides.Nc                    U[         R                  :X  a  [        R                  S5        SU  S3$ S[        U    SU  S3$ )Nz>float64 cast requested, probably from tensorify_python_scalarsr}   rA   static_cast<>()r4   doublelogwarningDTYPE_TO_METAL)rJ   dtype	src_dtypeuse_compute_typess       r8   to_dtypeMetalOverrides.to_dtype   sK     ELL KKP )1--nU34Bqc;;r:   c                6    S[         U    S[         U    SU  S3$ )Nzas_type<z>(static_cast<r   rn   r   )rJ   r   r   s      r8   to_dtype_bitcastMetalOverrides.to_dtype_bitcast   s/     ./0~i?X>YY[\][^^`aar:   c                    [        U 5      $ Nr9   )r7   r   s     r8   constantMetalOverrides.constant   s    c""r:   c                :   [         R                  R                  [         R                  R                  U 5      5      n[         R                  R                  R                  [         R                  R                  U[        U 5      S9n[        R                  " X15      $ )N)bounds)
r   kernelindex_to_strprepare_indexingcsegeneratecomputer   r   r   )rI   r   idx_strvars       r8   
index_exprMetalOverrides.index_expr   sj    ((''(A(A$(GHhhll##HHg.CD.I $ 
 ||C''r:   c                    [         R                  R                  X5       nU" 5       nS S S 5        WR                  R                  (       a  [        U5      n[        R                  " WXB5      $ ! , (       d  f       NK= fr   )r   r   
mask_loadsr   is_boolr%   r   where)maskbodyothernew_maskresults        r8   maskedMetalOverrides.masked   sV     XX  -VF . ==  KEyy611 .-s   A--
A;c                (    U  SU S[        U5       3$ )Nz ? z : r   )r]   r^   cs      r8   r   MetalOverrides.where   s    Cs#nQ/011r:   c                    SU  SU S3$ )Nzc10::metal::remainder(r@   rA   r   r]   r^   s     r8   	remainderMetalOverrides.remainder   s    's"QCq11r:   c                D    SU  SU SU  S3nSU  SU SU S3nSU SU S3$ )NrT   rU   rV   rA   zc10::metal::max(r@   r   r]   r^   r_   r`   s       r8   maximumMetalOverrides.maximum   K    ,QCq3qc;
,QCq3qc;
!*R
|1==r:   c                D    SU  SU SU  S3nSU  SU SU S3nSU SU S3$ )NrT   rU   rV   rA   zc10::metal::min(r@   r   r   s       r8   minimumMetalOverrides.minimum   r   r:   c                    U  SU 3$ )Nz || r   r   s     r8   
logical_orMetalOverrides.logical_or       D}r:   c                    U  SU 3$ )Nz && r   r   s     r8   logical_andMetalOverrides.logical_and   r   r:   c                    SU  S3$ )Nzmetal::isnan(rA   r   rJ   s    r8   isnanMetalOverrides.isnan       qc##r:   c                    SU  S3$ )Nzmetal::isinf(rA   r   r   s    r8   isinfMetalOverrides.isinf   r   r:   c                    SU  S3$ )Nzmetal::log(rA   r   r   s    r8   r   MetalOverrides.log       QCq!!r:   c                    SU  S3$ )Nzmetal::exp(rA   r   r   s    r8   expMetalOverrides.exp   r   r:   c                    SU  S3$ )Nrh   rA   r   r   s    r8   absMetalOverrides.abs   r   r:   c                    SU  S3$ )Nzmetal::signbit(rA   r   r   s    r8   signbitMetalOverrides.signbit   s     1%%r:   c                    SU  S3$ )Nzmetal::precise::sin(rA   r   r   s    r8   sinMetalOverrides.sin      %aS**r:   c                    SU  S3$ )Nzc10::metal::sinc(rA   r   r   s    r8   sincMetalOverrides.sinc  s    "1#Q''r:   c                    SU  S3$ )Nzmetal::precise::cos(rA   r   r   s    r8   cosMetalOverrides.cos  r   r:   c                    SU  S3$ )Nzmetal::tan(rA   r   r   s    r8   tanMetalOverrides.tan  r   r:   c                    SU  S3$ )Nzmetal::asin(rA   r   r   s    r8   asinMetalOverrides.asin      aS""r:   c                    SU  S3$ )Nzmetal::acos(rA   r   r   s    r8   acosMetalOverrides.acos  r  r:   c                    SU  S3$ )Nzmetal::atan(rA   r   r   s    r8   atanMetalOverrides.atan  r  r:   c                    SU  SU S3$ )Nz::metal::atan2(r@   rA   r   )rJ   r   s     r8   atan2MetalOverrides.atan2   s     2aS**r:   c                    SU  S3$ )Nzmetal::sqrt(rA   r   r   s    r8   sqrtMetalOverrides.sqrt$  r  r:   c                    SU  SU  S3$ )NrT   z)>(-rA   r   r   s    r8   negMetalOverrides.neg(  s     'qcaS22r:   c                    SU  S3$ )Nzmetal::rsqrt(rA   r   r   s    r8   rsqrtMetalOverrides.rsqrt.  r   r:   c                    SU  S3$ )Nzmetal::tanh(rA   r   r   s    r8   tanhMetalOverrides.tanh2  r  r:   c                    SU  S3$ )Nzmetal::atanh(rA   r   r   s    r8   atanhMetalOverrides.atanh6  r   r:   c                    SU  SU S3$ )Nr?   r@   rA   r   r   s     r8   floordivMetalOverrides.floordiv:  s     +1#Rs!44r:   c                    SU  S3$ )NrB   rA   r   r   s    r8   floorMetalOverrides.floor?  r   r:   c                    SU  S3$ )Nzmetal::sign(rA   r   r   s    r8   signMetalOverrides.signC  r  r:   c                D    SU  SU SU  S3nSU  SU SU S3nSU SU S3$ )NrT   rU   rV   rA   zmetal::fmod(r@   r   r   s       r8   fmodMetalOverrides.fmodG  sK    ,QCq3qc;
,QCq3qc;
j\J<q99r:   c                    SU  S3$ )Nmetal::trunc(rA   r   r   s    r8   truncMetalOverrides.truncM  r   r:   c                    U  SU 3nU R                   b  U R                   R                  (       d(  UR                   b!  UR                   R                  (       a  SU S3$ U$ )Nz / r-  rA   )r   is_floating_point)r]   r^   quots      r8   truncdivMetalOverrides.truncdivQ  sQ    Cs|GGAGG$=$=GGAGG$=$="4&**r:   c                    SU  S3$ )Nzmetal::ceil(rA   r   r   s    r8   ceilMetalOverrides.ceilZ  r  r:   c                f    [         R                  R                  R                  S5        SU  SU S3$ )Nrandomzc10::metal::rand(r@   rA   r   r   headersaddseedoffsets     r8   randMetalOverrides.rand^  s/    	X&"4&6(!44r:   c                f    [         R                  R                  R                  S5        SU  SU S3$ )Nr9  zc10::metal::randn(r@   rA   r:  r=  s     r8   randnMetalOverrides.randnc  s/    	X&#D6F8155r:   c           	     r    [         R                  R                  R                  S5        SU  SU SU SU S3	$ )Nr9  zc10::metal::randint64(r@   rA   r:  )r>  r?  lowhighs       r8   	randint64MetalOverrides.randint64h  s=     	
X&'vRxr#baHHr:   c                    SU  S3$ )Nzmetal::round(rA   r   r   s    r8   roundMetalOverrides.roundo  r   r:   c                D    SU  SU SU  S3nSU  SU SU S3nSU SU S3$ )NrT   rU   rV   rA   zmetal::pow(r@   r   )r]   r^   cast_acast_bs       r8   powMetalOverrides.pows  sK    (1QCs1#Q7(1QCs1#Q7VHBvha00r:   c                f    [         R                  R                  R                  S5        SU SU S3$ )Nspecial_mathc10::metal::rO   rA   r:  )rH   r]   names      r8   _special_unaryMetalOverrides._special_unaryy  s/    	^,dV1QCq))r:   c                l    [         R                  R                  R                  S5        SU SU SU S3$ )NrS  rT  rO   r@   rA   r:  )rH   r]   r^   rU  s       r8   _special_binaryMetalOverrides._special_binary}  s5    	^,dV1QCr!A..r:   c           
        S H,  n[        X[        R                  " U R                  US95        M.     [        R                  " U R                  SS9U l        S H0  n[        U U[        R                  " U R                  US-   S95        M2     S H,  n[        X[        R                  " U R
                  US95        M.     S H0  n[        U U[        R                  " U R
                  US-   S95        M2     g )N)erferfinvi0i0ei1i1edigammaspherical_bessel_j0)rU  	log_gamma)
	bessel_j0	bessel_j1	bessel_y0	bessel_y1modified_bessel_i0modified_bessel_i1modified_bessel_k0modified_bessel_k1scaled_modified_bessel_k0scaled_modified_bessel_k1_forward)	polygammazeta)chebyshev_polynomial_tchebyshev_polynomial_uchebyshev_polynomial_vchebyshev_polynomial_whermite_polynomial_hhermite_polynomial_he)setattr	functoolspartialmethodrV  lgammarY  )clsrU  s     r8   _initialize_special_ops&MetalOverrides._initialize_special_ops  s    	
D Cy66s7I7IPTUV	
 ,,S-?-?kR

D ''(:(:
ARS
&
D Cy66s7J7JQUVW	

D ''(;(;$BST
r:   r   )NT)
rJ   r   r   torch.dtyper   zOptional[torch.dtype]r   r%   r   r6   )rJ   r   r   r  r   r  r   r6   )r7   zUnion[bool, float, int]r   r  r   r6   )rI   r   r   r  r   r6   )r   r   r   r   r   r   r   r6   )r]   r$   r^   r$   r   r$   r   r6   )r]   r$   r^   r$   r   r6   )r]   r   r^   r   r   r6   )rJ   r   r   r6   )rJ   r   r   r   r   r6   )r>  r   r?  r   r   r6   )
r>  r   r?  r   rF  r   rG  r   r   r6   )r]   r   rU  r6   r   r6   )r]   r   r^   r   rU  r6   r   r6   r   None)5r   r   r   r   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r	  r  r  r  r  r  r  r  r!  r$  r'  r*  r.  r3  r6  r@  rC  rH  rK  rP  rV  rY  classmethodr}  r   r   r:   r8   r   r      s   b ,0"&	<<< )<  	<
 
< < bb*b7Bb	b b
 # # ( ( 2 2 2 2 2 2 > >
 > >
     $ $ $ $ " " " " " " & & + + ( ( + + " " # # # # # # + + # # 3 3
 $ $ # # $ $ 5 5 $ $ # # : :
 $ $   # # 5 5 6 6 II#.I5@IHSI	I I $ $ 1 1
*/ 7 7r:   r   mpsc                    ^  \ rS rSr% Sr\rSrSrSr	Sr
\" 5       R                  r\" 5       R                  r\" 5       R                  r\r\" S/5      rS\S	'   / rS
\S'         SU 4S jjrSS jrSS jr S         S S jjrS!S jrSSS\R:                  " 5       4           S"S jjr          S#S jr          S#S jr S$S jr!S%S jr"SS&S jjr#SS'S jjr$          S(S jr%Sr&U =r'$ ))MetalKerneli  z;Implement Metal codegen based on the SIMDKernel abstraction;auto i       utilszOrderedSet[str]r;  zlist[IterationRangesEntry]multistage_reduction_entryc                \   > [         TU ]  " U40 UD6  [        R                  " 5       U l        g r   )super__init__	itertoolscountacc_var_ids)rH   tilingkwargs	__class__s      r8   r  MetalKernel.__init__  s&    
 	*6*$??,r:   c                    [         U   $ r   r   )rH   r   s     r8   dtype_to_strMetalKernel.dtype_to_str  s    e$$r:   c                   U R                   R                  U5      nU R                  U5      n[        R                  R                  U5      nU SU R                  U5       S3nU[        R                  [        R                  4;   a  SU S3n[        R                  nU R                  R                  U R                  XTS9$ )z"Codegen a load from an InputBuffer[]r}   rA   r   )rE   inputr   r   graph	get_dtyper   r4   float16bfloat16float32r   r   loads)rH   rU  indexr   r   lines         r8   loadMetalKernel.load  s    iiood#%%e,!!$'a))%013U]]ENN33 )a0DMMExx  T ??r:   Nc                r   U R                   R                  U5      nU R                  U5      nU R                  [        R
                  R                  U5      5      nSU SU S3nUc  U SU R                  U5       SU S3nO\US:X  aH  U R                  R                  S5        S	U S
3n	SU	 SU S3n
U	 SU
 SU R                  U5       SU S3nO[        SU 35      eU R                  (       a%  U R                  R                  [        X5      5        g U R                  R                  [        X5      5        g )Nr   r   rA   r  ] = r  
atomic_addatomiczc10::metal::AtomicType<>zreinterpret_cast<device z
::type *>(z::atomic_add(r@   );zUnimplemented store mode )rE   outputr   r  r   r  r  r   r;  r<  rY   inside_reductionr   	writeliner   stores)rH   rU  r  valuemoder   	dtype_strcast_valr  atomic_typecast_vars              r8   storeMetalKernel.store  s8    iit$%%e,%%agg&7&7&=>	!)BugQ7<U!D--e45T(1ED\!LLX&3I;a@K1+jQOH!]-zD<M<Me<T;UUWX`WaacdD!:4&ABB  LL""<#;<KK!!,t":;r:   c                   U R                   R                  U5      nU R                  U5      nU R                  [        R
                  R                  U5      5      n[        S U R                   5       5      nU SU R                  U5       SU SU S3nSUR                   SU 3nU R                  R                  [        X5      5        g )Nc              3  J   #    U  H  oR                   (       d  M  Uv   M     g 7fr   is_reduction.0ts     r8   	<genexpr>.MetalKernel.store_reduction.<locals>.<genexpr>  s     K(81NNQQ(8   #	#r  z] = static_cast<r   r  if (z == 0) )rE   r  r   r  r   r  r  nextrange_treesr   rU  r  r  r   )rH   rU  r  r  r   r  reduction_dimr  s           r8   store_reductionMetalKernel.store_reduction  s    iit$%%e,%%agg&7&7&=>	K(8(8KKa))%011A)BugUWXm(()7l467r:   Tc                   [        U[        R                  5      (       a  U R                  U5      nS[	        U R
                  5       3n[        R                  R                  XeU5      nU(       a  SOSnX SU 3-  nU(       a	  USU S3-  nUb  U(       a   S5       eUSU 3-  nU R                  R                  XR                  -   5        U$ )	Ntmp_acc_zthreadgroup   r  r  z+Thread group var can not have default value = )r3   r4   r   r  r  r  r   r   create_cse_varindexing_coder  suffix)	rH   r   
elem_countdefault_valueis_threadgroupr   var_namer   var_defs	            r8   _new_idxvarMetalKernel._new_idxvar  s     eU[[))%%e,Ed4#3#3456hh%%h>$2.WAhZ((:,a((G$%T'TT%]O,,G$$W{{%:;
r:   c                    X#U4nXPR                   R                  ;   a  U R                   R                  U   $ U R                  XX45      nX`R                   R                  U'   U$ )z)Caching wrapper around _reduction_nocache)r   reduction_cache_reduction_nocache)rH   r   r   reduction_typer  	cache_keyr   s          r8   	reductionMetalKernel.reduction  s\     6	00088++I66((>Q.4  +r:   c                J   U R                   (       d   eU R                  (       a   eS?S jnSnSnU R                   HC  nUR                  (       d  M  U(       a  US-  nXhR                   SU 3-  nXxR
                  -  nME     [        XpR                  5      nUS:X  a  U R                  U5      n	U R                  R                  U	 S35        U R                  R                  S5        U R                  R                  S	U S
U	 S35        U R                  R                  S5        U	$ U R                  R                  S5        US;   a  [         U   n
U R                  U
[#        XpR$                  5      5      nU R&                  (       d  UnO@US:X  a  SOSu  pU R                  XSS9nU R                  R                  U SU SU S35        U R(                  R+                  U R                  SU SU SU SU SU S3[         U   S9$ US;   Ga  U R                  X'5      nU SU S3n[,        U   nU R&                  (       dS  U R                  R                  U SU SU S 35        U R(                  R+                  U R                  SU SU SU S3US9$ UR/                  S!5      (       a  S"OS!nU R                  R                  U S#U S$U S%35        UR1                  S&5      (       a  [3        S' U R4                  R7                  5        5       5      nU R                  [8        R:                  U5      nUS(:X  a  S)OS*nU SU S3nU R                  R                  U S+35        U R                  R                  S	U SU SU S
U S,U S-U S,UR                   S.35        U R(                  R+                  U R                  U S/U SU SU S03US9$ U R                  R                  U S1U SU SU S 35        U R(                  R+                  U R                  SU SU SU S3US9$ US2:X  Ga)  U R&                  (       dz  U R                  X'5      nU R                  R                  U SU S3U S35        U R(                  R+                  U R                  SU SU SU S3[8        R<                  S9nU" U5      $ U R                  S4U5      nU SU S3nU R                  R                  U S535        U R                  R                  U S6U S7U S835        U R(                  R+                  U R                  S9U SU S3[8        R<                  S9nU" U5      $ US::X  Ga@  [?        U[@        5      (       d   S;5       eU R                  S4U5      nU SU S3nS<US=    SUS    SUS>    S3nU R                  R                  U S535        U R&                  (       aC  U R                  R                  U S535        U R                  R                  U S6U SU S 35        O!U R                  R                  U S,U S35        U R(                  R+                  U R&                  (       a  U R                  OU R                  SU SU SU S3[8        R<                  S9nU" U5      $ [C        U5      e)@z]Codegen a reduction operation.
Only sum and prod operations are somewhat reasonable optimizedc           
         [         R                  " S Vs/ s H)  n[        U  SU 3U R                  U R                  5      PM+     sn5      $ s  snf )Nxyzrr   )r   _unwrapr   r   r   )res3r  s     r8   _unwrap_helper6MetalKernel._reduction_nocache.<locals>._unwrap_helper9  sE    %%NSTevQqc]DKKDeT Ts   0Ar  r    + rt   anyz	 = false;z7threadgroup_barrier(metal::mem_flags::mem_threadgroup);z
                if (z) {
                    z' = true;
                }
            reduction_utils)prodsumr  )r   rU   )r   *F)r  r  r  z= r  zc10::metal::threadgroup_rO   r@   rA   r  )maxminargminargmaxr  r  z = static_cast<r   r  r  lowestz = ::metal::numeric_limits<z>::z();argc              3  J   #    U  H  oR                   (       d  M  Uv   M     g 7fr   r  r  s     r8   r  1MetalKernel._reduction_nocache.<locals>.<genexpr>  s      =!AA=r  r  r  <z = -1;r  z;
                    z$;
                }
                z[c10::metal::threadgroup_z)]z = ::c10::metal::welford_reducer  float3z = 0.0;z! = ::c10::metal::welford_combine(z	, float3(z, 0.0, 1.0));z(c10::metal::threadgroup_welford_combine(welford_combinez&Input to welford combine must be tuplezfloat3(r   r   )r  r   r   ztuple[CSEVariable, ...])"r  
_load_maskr  r  rU  numelr  max_threadgroup_sizer  r  r  r   splicer  r;  r<  r   r   simd_group_sizer  r   r   r   endswith
startswithr  range_tree_nodesvaluesr4   r)   r  r3   tupleNotImplementedError)rH   r   r   r  r  r  reduction_idxacc_buf_sizerdacc	acc_dtypeacc_bufr7   default_valreduction_opacc_thread_varsrc_metal_typelim_fnidx_varidx_acc_bufcmp_opidx_thread_varwf_res	inp_values                           r8   r  MetalKernel._reduction_nocache-  s{    $$$$??""	 ""B??&yL>::MHH$L # <)B)BCU"""5)C((C5	):;((I LLG E  KK!!I J*+_,29=I&&7<1E1EFG 22 !/% 7HX * && '  ##se1\N"UG1$EF88$$*>*:!G9Bse2m_\^_k^llmn07 %  
 ??&&y?G 'y-:N+I6N22##%&on5ERwbQ xx((KK.~.>ay<.XYZ )  
 "0!8!8!?!?XUF((!""=n=MSQWPXX[\ ((// #44;;=  #..uzz<H .( :$/=-!B""))^,<F*CD## )G1VHAn%5 6#$Cw /#$C~ 6%  xx((KK"m#<^<LAgYVXYeXffhi )  
 LL""!""3N3C1^DTTVW\V]]_` 88$$*>*:!G9B|nTUV %  
 --22**9C##wiqtE7!$LM**LL.~.>ay<.XYZ-- + 
 &f--&&x>G 'y-:N%%(8&@ALL""!""CNCSS\]b\ccpq XX&&:7)2l^STUmm ' F
 "&))..eU++U-UU+&&x>G 'y-:N!%(2eAhZr%(1EI%%(8&@A..""))^,<G*DE&&%&&GGWWYZcYddfg &&.)9YKq'IJXX&&#>>DLL*>*:!G9B|nTUVmm ' F
 "&))!.11r:   c                $   U R                  UR                  5      nU R                  U5      nUR                  (       a$  UR                  R
                  U R                  ::  a9  U R                  R                  U R                   SUR                   SU S35        g U R                  R                  U5        UR                  R
                  U R                  -   S-
  U R                  -  nU R                  R                  SUR                   SUR                   SU SUR                   S	3	5        U R                  R                  5          U R                  R                  U R                   SUR                   SU S
U SUR                   S3
5        X@R                  -  UR                  R
                  :w  a@  U R                  R                  SUR                   SUR                  R
                   S35        S S S 5        g ! , (       d  f       g = f)Nr  r  r  r   z	for(auto z
_cnt = 0; z_cnt < z; ++z_cnt) {rt   r  z_cnt;r  z >= z) break;)rename_indexingrI   sexprr  rootr  r  r  r  index_dtyperU  r  appendr   indent)rH   entryr   	index_str	loop_sizes        r8   codegen_iteration_ranges_entry*MetalKernel.codegen_iteration_ranges_entry  s   ))%**5
JJz*	!!UZZ%5%59R9R%R((##$Aejj\YKqA ''..u5
 JJt8881<&&'	 			

|:ejj\4PUPZPZ|[cd	
 YYII##$Aejj\YKs9+SQVQ[Q[P\\ab 444

8H8HH		##d5::,d5::;K;K:LH$UV  s   	B/H
Hc                6   U R                   (       Ga8  U R                  R                  5          U R                  R                  U R                  5        U R                  R                  U R
                  5        SSS5        U R                  R                  S[        U R                   5      -  5        U R                  R                  [        S U R                  R                  R                  5        5       5      5        U R                   (       a;  U R                   R                  5       R                  5         U R                   (       a  M;  OJU R                  R                  U R                  5        U R                  R                  U R
                  5        U R                  R                  U R                  5        U R                  R!                  5         U R
                  R!                  5         U R                  R!                  5         g! , (       d  f       GN= f)z
Concat output code from index_code, loads, compute, stores,
suffix into self.body.

For pointwise kernels, this is called just once at the end.

For reduction kernels, this generates a loop over the reduction
axis.
N}c              3  j   #    U  H)  n[        U[        5      (       a  UOU4  H  nUv   M	     M+     g 7fr   )r3   r  )r  itemvs      r8   r  +MetalKernel.codegen_body.<locals>.<genexpr>  s7       A&0u&=&=dD7J J  As   13)r  r   r  r  r  r   r  rX   r   
invalidater
   r  r  popcache_clearr  clear)rH   s    r8   codegen_bodyMetalKernel.codegen_body  sc    ***!!#		  ,		  . $ IIc$*I*I&J JK
 HH  $ 8 8 ? ? A  11//335AAC 111 IITZZ(IIT\\*		%

1 $#s   AH		
Hc                
   U R                  5         [        5       n[        R                  R                  (       a  UR                  S5        OUR                  S5        U R                  5       nUR                  5          [        R                  R                  (       d)  U R                   H  nUR                  SU S35        M     OuU R                   Vs/ s H	  nSU S3PM     nn[        U[        [        5      R                  R                  R                  S-  /[        5       5      nUR                  U5        U R                  (       aQ  [        R                   " S U R"                   5       5      n[%        XpR&                  5      nUR                  SU S35        UR                  S	5        UR                  5          U R(                  R*                  R-                  5        H\  u  pXR.                  ;   a  M  U R1                  [        R                  R3                  U	5      5      nUR                  S
U SU
 S35        M^     U R(                  R4                  R-                  5        H  u  p[        R                  R3                  U	5      nU[6        R8                  :X  aD  [        R                  R;                  U	5      nUb  UR=                  5       / :w  a  [?        S5      eSnOU R1                  U5      nUR                  SU SU
 S35        M     U R(                  R@                  R-                  5        H  u  pUR                  SU
 S35        M     [C        U5      S:  d   S5       e[C        U5      S:  a  S[C        U5       3OSn[C        U5      S:X  a  US   RD                  OSnU R                  (       a  SOSnUR                  U SU SU 35        U R                  (       a  UR                  U S35        SSS5        UR                  S5        UR                  5          [C        U5      S:  aC  [G        U5       H4  u  nnUR                  SURD                   S[I        SU-   5       S 35        M6     URK                  U RL                  5        URK                  U RN                  5        SSS5        UR                  S!5        SSS5        [        R                  R                  (       a!  UR                  S"5        URQ                  5       $ UR                  S#5        URQ                  5       $ s  snf ! , (       d  f       GN@= f! , (       d  f       N= f! , (       d  f       N= f)$z3Called at the end to generate a final kernel stringz(R"MTL(zcompile_mps_shader('''z#include <c10/metal/z.h>includec              3  ^   #    U  H#  oR                   (       d  M  UR                  v   M%     g 7fr   )r  r  r  s     r8   r  -MetalKernel.codegen_kernel.<locals>.<genexpr>%  s      1%5GAGG%5s   --z$[[max_total_threads_per_threadgroup(z)]]zkernel void generated_kernel(zdevice z* ,Nzfloat64 is not supported by MPSr+   z	constant zconstant long&    z%Up to 3 index variables are supportedr   uintr   
thread_posr  r  z [[thread_position_in_grid]]z- group_pos [[thread_position_in_threadgroup]]z) {r  z = thread_pos.x   r  r#  z)MTL");z''')))r,  r   r   r  cpp_wrapperr  active_range_treesr  r;  r	   r   __file__parentr
   r  mathr  r  r  r  rE   output_buffersitemsremoved_buffersr  r  input_buffersr4   float64try_get_bufferget_sizerY   sizevarsrX   rU  	enumeratechrr  r  r   getvalue)rH   rU  codeidx_varsheaderr;  header_contentstotal_reduction_sizethreadgroup_sizeouterinnerr  r   	outer_bufthread_pos_dtypethread_pos_var_namethread_pos_suffixidxr   s                      r8   codegen_kernelMetalKernel.codegen_kernel
  si   77NN9%NN34**,[[]77&&"llFNN%9&#EF + FJ\\EQ6*6(#6\   #1(^**11889DEL#
 /$$'+yy 1%)%5%51 ($ $'';=V=V#W :;K:LCP NN:;$(II$<$<$B$B$DLE 4 44  $ 1 1!''2C2CE2J KINNWYKr%#BC	 %E
 %)II$;$;$A$A$CLEGG--e4E-$%GG$:$:5$A	$,	0B0B0D0J"./P"QQ$+	$($5$5e$<	NNYykE7!#DE %D %)II$6$6$<$<$>LENN_UG1#=> %?8}q(Q*QQ(.1(ma.?d3x=/*V ! ),H(:HQK$$ $ ,0+@+@Cb!'(*=)>>Z[lZmn ((NN+,,YZ? D NN5!x=1$$-h$7S#CHH:^Cc	N;K1M %8 D../DII&  NN3G J 77NN9% }} NN6"}}K& F w ]sR   =AUT.$CU>H<T3:)U#B	U,U.U3
U	=U
U	U
U$c           	     2   [         R                  R                  nU R                  R                  R                  5        H  nUR                  U5        M     U R                  R                  5       u  pVpW[        Xg5       VV	s0 s H  u  p[        U5      U	_M     n
nn	/ U R                  R                  R                  5       QU R                  R                  R                  5       QnU Vs/ s H  oU R                  ;  d  M  UPM     nnXR                  R                  R                  5        Vs/ s H  n[        U5      PM     sn-  nU Vs/ s H  oU   PM	     nn[         R                  R                  (       a  U R                  OU R                  nSS jn[!        U R#                  5       5      S:  a  U R#                  5        Vs/ s HQ  nU" UR$                  (       a+  [&        R(                  " UR*                  U R,                  5      OUR*                  5      PMS     nnUR/                  U" US5      5        UR/                  [0        5        O*[         R                  R                  (       a  [3        S5      eU R4                  (       a  U R#                  5        Vs/ s HG  nUR$                  (       a1  U" [&        R(                  " UR*                  U R,                  5      5      OSPMI     nnUR/                  U" US5      5        UR/                  [0        5        O6[         R                  R                  (       a  US/-  nUR/                  S5        UR7                  UU[8        R:                  " S5      S	US
9  gs  sn	nf s  snf s  snf s  snf s  snf s  snf )zCodegen a call to this kernelthreadsc                    [         R                  R                  (       a+  U  Vs/ s H	  nSU S3PM     n nSSR                  U 5       S3$ U SSR                  U 5       S3$ s  snf )Nzstatic_cast<uint64_t>(rA   {r@   r#  z=[r  )r   r  r7  join)rW  kwargr  s      r8   format_threads/MetalKernel.call_kernel.<locals>.format_threadst  si    ww""BIJ'Q3A3a8'JDIIg./r22499W#5"6a88 Ks   A"r   zWe should always have threads?1
group_sizeNcpuF)devicetriton	arg_types)rW  z	list[str]r[  r6   r   r6   )r   r  wrapper_coderE   rC  keysensure_size_computedpython_argdefszipr6   r<  r?  r>  r7  cexprpexprrX   r8  r  sympyMinr  r  r  listrY   r  generate_kernel_callr4   ra  )rH   rU  nodewrapperr&  _	call_argsrc  call_argarg_typearg_name_to_typerE   r  expr_printerr\  rW  s                   r8   call_kernelMetalKernel.call_kernela  s   ''&&##((*A((+ + &*YY%=%=%?"a>A)>W
>W(:CM8#>W 	 
 S))..0R4993J3J3O3O3QR#Gt$2F2F'FtG!3!3!8!8!:;!:AQ!:;;6:;dsc*d	;%&WW%8%8tzzdjj	9 t&&()A- 002 3A ~~ IIaggt'@'@A
 3   KKw	:;T"ww"""#CDD  
 002	 3A >> UYYqww0I0IJK 3	   KKw=>T"ww""   &$$<<& 	% 	
i

 H;; s,   M:+N N 0NN
	ANANc                (   U(       d  U(       d  g U R                  U5      nU(       a  U S3OSnU(       a  U SU R                  U5       3OSnU(       a  U(       a
  SU SU S3nOSU U S3nU R                  R                  U R                  US	S
9  g )Nz < 0r  z > zif ((z) && (z	)) returnr  z) returnF)
assignment)r   r   r   r   )	rH   rI   sizelowerupperexpr_str
lower_expr
upper_exprr  s	            r8   check_boundsMetalKernel.check_bounds  s      $$T**/z&R
BGzT%6%6t%<$=>R
U:,fZL	BD*j\:D$,,?r:   )r  )r  zdict[str, sympy.Expr]r  r   r   r  )r   r  r   r6   )rU  r6   r  r   r   r   r   )
rU  r6   r  r   r  r   r  r!   r   r  )rU  r6   r  r   r  r   r   r  )r   zUnion[str | torch.dtype]r  zOptional[int]r  zOptional[Any]r  r%   r   zValueRanges[Any]r   r   )
r   r  r   r  r  r    r  +Union[CSEVariable, tuple[CSEVariable, ...]]r   r  )r  r   r   r  r  )rU  zOptional[str]r   r6   )rU  r6   ro  r   r   r  )
rI   r   r{  r   r|  r%   r}  r%   r   r  )(r   r   r   r   r   r   	overridesr  newvar_prefixr  r  r   rF   rj  r   ri  r<   r  kexprr
   r;  __annotations__r  r  r  r  r  r  r   unknownr  r  r  r   r,  rT  rw  r  r   __classcell__r  s   @r8   r  r    s   EIFMOO##EL  E&&EE)7)4G_4=? :?-%- - 
	-%@ SW<< *<3><FO<	<*8 %)'+##.#6#6#8' " %	
  ! 
,  &	
 ; 
5 [2[2 [2 &	[2
 ;[2 
5[2zW6#JUnB
H@@&0@9=@FJ@	@ @r:   r  c                  J   ^  \ rS rSr\rSU 4S jjr        SS jrSrU =r	$ )MetalSchedulingi  c                   > [         TU ]  U5        [        R                  R                  nUb<  [        R                  R
                  (       d  UR                  R                  S5        g g g )NzDfrom torch._inductor.runtime.runtime_utils import compile_mps_shader)r  r  r   r  rd  r7  rI  r  )rH   	schedulerrp  r  s      r8   r  MetalScheduling.__init__  sQ    #''&&77&&%%Z ' r:   c                j   [         R                  R                  nXR                  ;   a  UR                  U   nU$ SUR	                  5        3n[         R                  R
                  (       a  SU 3U-   nU S3nOU S3nXTR                  U'   [        X$5      u  pxU SU 3n	UR                  XaU	SS9  U$ )Nmps_lib_z+at::native::mps::DynamicMetalShaderLibrary _funcz.generated_kernel
F)gpu)r   r  rd  src_to_kernelnext_kernel_suffixr7  r   define_kernel)
rH   src_codenode_scheduler   rp  kernel_namemps_lib_nameoriginsdetailed_originsmetadata_comments
             r8   r  MetalScheduling.define_kernel  s     ''&&,,,!//9K( ! &g&@&@&B%CDLww""A,P  ".e4!-.?@.9!!(+(;M(S%G")"-=,>?!!,:JPU!Vr:   r   )r  zOptional[Scheduler]r   r  )r  r6   r  zlist[SchedulerNode]r   r  r   r6   )
r   r   r   r   r  kernel_typer  r  r   r  r  s   @r8   r  r    s7    K,?IT	 r:   r  )r7   z)Union[float, int, bool, str, CSEVariable]r   r6   )I
__future__r   ry  r  loggingr;  pathlibr   typingr   r   r   rk  sympy.printing.precedencer   r4   torch.utils._cpp_embed_headersr	   torch.utils._ordered_setr
   torch.utils._sympy.printersr   r   ExprPrinter_torch.utils._sympy.value_rangesr   r  r   r   r   virtualizedr   r   r   commonr   r   r   r   r   r   simdr   r   r   r   ops_handlerr    r!   r  r"   r#   r$   	getLoggerr   r   r%   int8int16int32int64uint8r+   r,   r  r   r9   r<   r   _initialize_pointwise_overridesr}  r  r  r   r:   r8   <module>r     s*   #      / /  0  9 / O 7 G G , ,  C B 64! 
JJ	JJ	KK	KK	KK	KK	KK	JJ	NNH
Y#| Y#xW[ Wt  . .u 5  & & (t@* t@n%n %r:   