
    7hr                    n   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKJrJr  S SKJr  S SKJrJrJrJr  S SKJrJrJrJrJrJr  S SKrS SK rS SK!J"r"  S SK#J$r$  S S	K%J&r&  S S
K'J(r(J)r)J*r*J+r+J,r,  S SK-J.r.J/r/J0r0  S SK1J2r2  S SK3J4r4  \(       a  S SK5J6r6  S SK7J8r8  SSK9J:r:  SSK;J<r<  SSK=J>r>  Sr?\2" \@S5      rA " S S\B5      rC " S S5      rD " S S5      rE\\&R                  \&R                  4   rH\R                   " S S5      5       rJ\R                   " S S5      5       rK " S  S!\K5      rL " S" S#5      rM " S$ S%5      rN " S& S'\K5      rO " S( S)\M\O5      rP " S* S+\N\O5      rQ " S, S-\M\K5      rR " S. S/\N\K5      rS\R                  S2S0 j5       rU    S3S1 jrVg)4    )annotationsN)IterableSequence)ThreadPoolExecutor)byrefc_size_tc_void_pCDLL)AnyCallableIOOptionalTYPE_CHECKINGUnion)get_interface_for_device)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCache)get_gpu_typeget_ld_library_pathis_gpu)getArtifactLogger)
OrderedSet)
ModuleType)TritonTemplateCaller   )config)benchmarker)VCUDA_VISIBLE_DEVICES
autotuningc                      \ rS rSrSrg)!NonzeroWorkspaceNotSupportedError6    N__name__
__module____qualname____firstlineno____static_attributes__r)       Z/var/www/fran/franai/venv/lib/python3.13/site-packages/torch/_inductor/autotune_process.pyr'   r'   6       r0   r'   c                      \ rS rSrSr\SS j5       r\SS j5       r\SS j5       rSS jr	S r
SS jrSS	 jrSSS
 jjrSSS jjrSS jrSS jrSS jrSrg)TuningProcess:   z>
Class to launch and interact with a benchmarking subprocess.
c                   ^ ^ [         R                  S[        R                  " 5       [        R                  R                  [        5      5        U U4S jn U" 5         g! [         a     gf = f)z$
Entry point for the child process.
z3Started autotune subprocess %s. Visible devices: %sc                    >  [         R                  T5      n U c  g  U " 5       n[         R                  UT5        M9  ! [         a  nUn S nAN)S nAff = fN)r4   recv	Exceptionsend)jobresulte	read_pipe
write_pipes      r1   workloop,TuningProcess.process_main.<locals>.workloopJ   sX    #((3; UF ""6:6  ! Fs   < 
AAAN)autotuning_logdebugosgetpidenvirongetr$   EOFError)r?   r@   rA   s   `` r1   process_mainTuningProcess.process_main?   sQ    
 	AIIKJJNN/0	

	7	J 		s   A 
A+*A+c                P    [         R                  " X5        UR                  5         g r8   )pickledumpflush)objr@   s     r1   r;   TuningProcess.send\   s    C$r0   c                .    [         R                  " U 5      $ r8   )rM   load)r?   s    r1   r9   TuningProcess.recva   s    {{9%%r0   c                0    Xl         U R                  5         g r8   )devicestart)selfrV   s     r1   __init__TuningProcess.__init__e   s    

r0   c                f   [         R                  R                  [         R                  R                  [        5      S5      n[         R
                  " 5       u  p#[         R
                  " 5       u  pE[         R                  " US5      U l        [         R                  " US5      U l        [        R                  " 5       U l        U R                  R                  U R                  [        R                  5        [        R                  US[         R                   " 5        3S[#        U5       3S[#        U5       3/n[         R$                  R'                  S[         R(                  R                  [        R                  5      5      S[+        5       [,        R.                  (       a  S	OSS
.nU R0                  b  [#        U R0                  5      U[2        '   [4        R6                  " U0 [         R$                  EUEX%4S9U l        [         R:                  " U5        [         R:                  " U5        SU l        g)z$
Start the benchmarking subprocess.
z__autotune_main__.pywbrbz	--parent=z
--read-fd=z--write-fd=TORCH_CUSTOM_PYTHONPATH01)
PYTHONPATHTORCH_WARM_POOLLD_LIBRARY_PATH3TORCHINDUCTOR_PROFILE_WITH_DO_BENCH_USING_PROFILINGN)envpass_fdsT)rE   pathjoindirname__file__pipefdopenr@   r?   	selectorsDefaultSelectorselectorregister
EVENT_READsys
executablerF   strrG   rH   pathsepr   r!   /profile_bandwidth_with_do_bench_using_profilingrV   r$   
subprocessPopenprocesscloserunning)rX   entrysubproc_read_fdwrite_fdread_fdsubproc_write_fdcmd	extra_envs           r1   rW   TuningProcess.starti   s    RWW__X68NO$&GGI!$&GGI!))Hd37D1!113t~~y/C/CD NN		}%_-./#./01
 **..)2::??388+D
  #24 EE DG
	 ;;".1$++.>I*+!''+2::++%8

 	!
!"r0   c                `    U R                   =(       a    U R                  R                  5       SL $ )z*
True if the subprocess is still running.
N)r{   ry   pollrX   s    r1   aliveTuningProcess.alive   s%     ||; 1 1 3t ;;r0   c                    U R                  5       (       d  U R                  5         [        R                  XR                  5        g)z(
Push a work item to the child process.
N)r   rW   r4   r;   r@   )rX   reqs     r1   putTuningProcess.put   s*     zz||JJL30r0   c                    U R                   R                  U5      (       d"  [        SU R                  R                   35      e[
        R                  U R                  5      n[        U[        5      (       a  UeU$ ! [         a    U R                  5         e [         a    U R                  5         e [         a<    [        R                  SU R                  R                  5        U R                  5         e f = f)zs
Get a response from the child process. Raises TimeoutError on timeout;
raises EOFError if the subprocess crashes.
zTimeout in autotune subprocess z.Unexpected exception in autotune subprocess %s)ro   selectTimeoutErrorry   pidr4   r9   r?   killrI   rz   r:   rC   	exception
isinstance)rX   timeoutr=   s      r1   rH   TuningProcess.get   s    
	==''00"%DT\\EUEUDV#WXX"''7F fi((L!  	IIK 	JJL 	$$@$,,BRBR IIK	s   A!A< <A:C6c                    U R                  5       (       a   [        R                  SU R                  5        U(       a  U R	                  5         gg)z3
Signal the child process to shut down gracefully.
N)r   r4   r;   r@   wait)rX   r   s     r1   shutdownTuningProcess.shutdown   s4     ::<<tT__5IIK r0   c                    U R                  5       (       a  U R                  R                  5         U R                  5         g)z%
Wait for the child process to exit.
N)r   ry   r   rz   r   s    r1   r   TuningProcess.wait   s(     ::<<LL

r0   c                    U R                   R                  5         U R                  R                  5         U R                  R                  5         SU l        g)z
Close resources.
FN)ro   rz   r?   r@   r{   r   s    r1   rz   TuningProcess.close   s;     	r0   c                    U R                  5       (       aD  [        R                  SU R                  R                  5        U R                  R                  5         U R                  5         g)z&
Send a SIGKILL to the child process.
z)Sending SIGKILL to autotune subprocess %dN)r   rC   errorry   r   r   rz   r   s    r1   r   TuningProcess.kill   sH     ::<<  ;   LL

r0   )rV   ry   r?   r{   ro   r@   N)r?   	IO[bytes]r@   r   returnNone)rP   r   r@   r   r   r   )r?   r   r   r   )rV   Optional[int])r   bool)r   r   r   r   )g      ^@)r   floatr   r   )T)r   r   r   r   r   r   )r+   r,   r-   r.   __doc__staticmethodrJ   r;   r9   rY   rW   r   r   rH   r   r   rz   r   r/   r)   r0   r1   r4   r4   :   sq      8   & &.`<16
r0   r4   c                  \    \ rS rSrSrS
S jr\SS j5       rS
S jrSS jr	    SS jr
Srg	)TuningProcessPool   z
Maintains a pool of TuningProcesses to benchmark kernels in parallel
across devices. By default, we create one TuningProcess per device and
set the sub-process environment to make only that device visible.
c                V   U R                  5       n[        R                  SU5        U Vs/ s H  n[        US9PM     snU l        [
        R                  " 5       U l        U R                   H  nU R                  R                  U5        M      [        [        U5      S9U l        gs  snf )z
Start the child processes.
z$Sub-process autotune device list: %s)rV   )max_workersN)get_device_listrC   rD   r4   	processesqueueQueueprocess_queuer   r   lenexecutor)rX   devicesrV   ps       r1   rY   TuningProcessPool.__init__   s     &&(CWM FMMW6-v6WM9>A""1%   +s7|D Ns   B&c                    [         R                  (       d  S/$ [        5       n [        U 5      nUR	                  5       n[
        [        R                  ;   aR  [        R                  [
           R                  S5       Vs/ s H  n[        U5      PM     nn[        U5      U::  d   eU$ [        [        U5      5      $ s  snf )z4
Gather the list of devices to be used in the pool.
N,)r!   autotune_multi_devicer   r   device_countr$   rE   rG   splitintr   listrange)gpu_typedevice_interfacecountdr   s        r1   r   !TuningProcessPool.get_device_list  s    
 ++6M>3H= --/  2::-')zz2F'G'M'Mc'RS'R!s1v'RGSw<5(((NE%L!!	 Ts   >B<c                    U R                   R                  5         U R                   H  nUR                  SS9  M     U R                   H  nUR                  5         M     g)z%
Signal all child processes to exit.
F)r   N)r   r   r   r   )rX   r   s     r1   r   TuningProcessPool.shutdown  sG     	 AJJEJ"  AFFH  r0   c                   UR                   c   eU R                  R                  5       nUR                  UR                   R                  5         UR                  [
        R                  5      U R                  R                  U5        $ ! [         aC    [        R                  " SU S35        [        S5      s U R                  R                  U5        $ [         aC    [        R                  " SU S35        [        S5      s U R                  R                  U5        $ f = f! U R                  R                  U5        f = f)z
Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
remove it from the queue, execute the benchmark in that subprocess, and return
the TuningProcess to the queue.
zTimed out benchmarking choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.infzFailed to benchmark choice ')bmreqr   rH   r   	benchmarkr!   +max_autotune_subproc_result_timeout_secondsr   warningswarnr   r:   )rX   choicery   s      r1   targetTuningProcessPool.target#  s     ||'''$$((*FLL**+	,;;BB$ ""7+  	 MM1& :W W
 < ""7+  	 MM.vh 7W W
 <""7+	  ""7+s0   B
 
/D"9D% .D"D% !D""D% %Ec           	     v    [        [        XR                  R                  U R                  U5      5      5      nU$ )z.
Benchmark each choice in a separate process.
)dictzipr   mapr   )rX   choicesresultss      r1   r   TuningProcessPool.benchmarkB  s-     s7MM$5$5dkk7$KLMr0   )r   r   r   Nr   )r   zSequence[Optional[int]])r   r   r   r   r   zlist[TritonTemplateCaller]r   z!dict[TritonTemplateCaller, float])r+   r,   r-   r.   r   rY   r   r   r   r   r   r/   r)   r0   r1   r   r      sB    E& " "(	,>+ 
+r0   r   c                  |    \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S
\S'   SrS\S'   \    SS j5       rSS jrSr	g)
TensorMetaiT  ztorch.devicerV   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNzOptional[str]namec                D   [        U[        5      (       a;  U Vs/ s H  o R                  U5      PM     nn[        S U 5       5      (       d   eU$ Un[        U[        R
                  5      (       a  [        R                  " SUS9nUR                  5       nUc   eUR                  5       nUc   e[        UU[        R                  R                  R                  UR                  5       [        R                   S9[        R                  R                  R                  UR#                  5       [        R                   S9[        R                  R                  R%                  UR'                  5       R(                  [        R                   S9UR+                  5       S9$ s  snf )Nc              3  B   #    U  H  n[        U[        5      v   M     g 7fr8   )r   r   .0xs     r1   	<genexpr>*TensorMeta.from_irnodes.<locals>.<genexpr>c  s     A&Qz!Z00&s   fake)r   layout)fallback)rV   r   r   r   r   r   )r   r   from_irnodesallr   LayoutBuffer	get_dtype
get_devicer   r#   graphsizevars
size_hintsget_sizer!   unbacked_symint_fallback
get_stride	size_hint
get_layoutr   get_name)clsirnodesr   r=   noder   rV   s          r1   r   TensorMeta.from_irnodes]  s]    gx((>E Fg!1!1!!4gF FA&AAAAAMdBII&&99&6D    "!!!''""--88 .  GG$$//!88 0  77##--!((88 .  
 	
 !Gs   Fc                    [        U R                  U R                  U R                  U R                  U R
                  S9$ )N)rV   r   
extra_size)r   r   r   rV   r   r   r   s    r1   	to_tensorTensorMeta.to_tensor  s2    JJLL;;**{{
 	
r0   r)   )r   z/Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]r   #Union[TensorMeta, list[TensorMeta]])r   torch.Tensor)
r+   r,   r-   r.   __annotations__r   classmethodr   r   r/   r)   r0   r1   r   r   T  sQ    ((++KD-!
E!
	,!
 !
F
r0   r   c                      \ rS rSrSr          SS jr      SS jrSS jrSS.     SS jjrSS.     SS	 jjr	S
r
g)BenchmarkRequesti  a  
Only handle triton template benchmark for now. The extern kernel benchmark
can be done inside the same process since they usually don't cause crash.

Important: Instances of this class and subclasses have to be serializable
across process boundaries. Do not put CUDA Tensors in here!
c                   ^ Xl         [        U[        5      (       a  U/nX l        [        T[        [
        45      (       a0  [        T5      S:  a  [        U4S jT 5       5      (       d   eTS   mTU l        X@l	        g )Nr    c              3  n   >#    U  H*  nS   H   n[        TS   U5      [        X5      :H  v   M"     M,     g7f))rV   r   r   r   r   r   N)getattr)r   r   attroutput_tensor_metas      r1   r   ,BenchmarkRequest.__init__.<locals>.<genexpr>  s=      / Q .q148GA<LL Q M/s   25r   )
kernel_namer   r   input_tensor_metatupler   r   r   r  
extra_args)rX   r  r  r  r  s      ` r1   rY   BenchmarkRequest.__init__  s     ''44!2 3!2(5$-88%&* /    
 "4A!6"4$r0   c                   [         er8   NotImplementedErrorrX   outinput_tensorss      r1   make_run_fnBenchmarkRequest.make_run_fn  s
     "!r0   c                    g r8   r)   r   s    r1   cleanup_run_fnBenchmarkRequest.cleanup_run_fn  s    r0   Nr  c                   [         er8   r  rX   fnr  r  s       r1   do_benchBenchmarkRequest.do_bench  s
     "!r0   c               :   [         R                  [        R                  5      nU(       a  [        R                  " 5       nUcG  [        U5      S:X  d   e[        S U R                   5       5      nU R                  R                  5       nU(       a-  [        R                  " 5       W-
  n[        R                  " 5       n U R                  " USU06nU(       a-  [        R                  " 5       W-
  n[        R                  " 5       nU R                  " U/UQUP76 nU(       a:  [        R                  " 5       W-
  n	[         R                  S[!        U 5      WWU	5        U R#                  5         U$ ! [         a#    [         R                  S5        [        S5      s $ f = f)Nr   c              3  @   #    U  H  oR                  5       v   M     g 7fr8   )r   r   s     r1   r   -BenchmarkRequest.benchmark.<locals>.<genexpr>  s     !P9OA++--9Os   r  z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)rC   isEnabledForloggingDEBUGtimer   r  r  r  r   r  r'   infor   r"  rD   rt   r  )
rX   r  r  rD   start_tscreate_tensor_elapser!  load_elapseresbench_elapses
             r1   r   BenchmarkRequest.benchmark  sL   
 ++GMM:yy{H ;}%***!!P9O9O!PPM))335C#'99;#9 yy{H	 !!=:c:B ))+0Kyy{HmmB44499;1L  HD	$ 	
+ 1 	  RS<	 s   ?E- -*FF)r  r  r  r  )
r  rt   r  r  r  r  r  Iterable[Any]r   r   r  r  r  r  r   zCallable[[], None]r   r  r  r  zOptional[torch.Tensor]r   r   )r+   r,   r-   r.   r   rY   r  r  r"  r   r/   r)   r0   r1   r  r    s    %% ?% @	%
 "% 
%6"*"1="	"
 '+	" %" $	"
 
" '+)$) $) 
	) )r0   r  c                  ^    \ rS rSrSr     S         S	S jjrSS.     S
S jjrSrg)_TestBenchmarkRequesti  z
Supports unit testing. Defined in this file instead of the test file so the
TuningProcess sub-process can unpickle these objects.
Nc                @    Xl         X l        X0l        X@l        XPl        g r8   )r=   rV   sleepexccrash)rX   r=   rV   r8  r9  r:  s         r1   rY   _TestBenchmarkRequest.__init__  s     

r0   r  c                  U R                   b=  [        R                  R                  [        S 5      [        U R                   5      :X  d   eU R                  (       a   [        R                  " U R                  5        U R                  (       a  U R                  eU R                  (       a  [        R                  " S5        U R                  $ )Nr    )rV   rE   rG   rH   r$   rt   r8  r*  r9  r:  rr   exitr=   r  s      r1   r   _TestBenchmarkRequest.benchmark   sx     ;;"::>>"6=T[[AQQQQ::JJtzz"88((N::HHQK{{r0   )r:  rV   r9  r=   r8  )g        NNNF)
r=   r   rV   r   r8  zOptional[float]r9  zOptional[Exception]r:  r   r4  )r+   r,   r-   r.   r   rY   r   r/   r)   r0   r1   r6  r6    sv      $!%#'  	
 !  KO*1G	 r0   r6  c                  0    \ rS rSrSS.     SS jjrSrg)GPUDeviceBenchmarkMixini  Nr  c                  [        S / UQUP 5       5      n[        U5      S::  d
   SU 35       e[        S U 5       S5      n[        U5      n[        U5      S:X  a  [        [	        U5      5      nOUR                  5       nUR                  U5         [        R                  " U5      nUR                  5         S S S 5        U$ ! , (       d  f       W$ = f)Nc              3    #    U  H{  n[        U[        R                  5      (       d  M$  [        UR                  R
                  5      (       d  MJ  UR                  R                  c  Mc  UR                  R                  v   M}     g 7fr8   )r   torchTensorr   rV   typeindexr   tensors     r1   r   3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>  s^      $
/&%,,/   v}}))*   ##	  FMM/s   #B"BB(Br    zCan not mix devices c              3     #    U  HA  n[        UR                  R                  5      (       d  M)  UR                  R                  v   MC     g 7fr8   )r   rV   rE  rG  s     r1   r   rI    s5      +F&--,,- #""+s
   (AAcuda)
r   r   nextr   itercurrent_devicerV   r"   benchmark_gpusynchronize)	rX   r!  r  r  device_idx_setdevice_typer   
device_idxr/  s	            r1   r"   GPUDeviceBenchmarkMixin.do_bench  s     $ $
/M/3/$
 
 >"a'P+??O)PP'+
 
 4K@~!#d>23J)88:J$$Z0++B/C((* 1 
	 10 
s   'C
Cr)   r4  r+   r,   r-   r.   r"  r/   r)   r0   r1   r@  r@    s/    
 '+	 % $	
 
 r0   r@  c                  0    \ rS rSrSS.     SS jjrSrg)CPUDeviceBenchmarkMixini1  Nr  c               .    [         R                  " U5      $ r8   )r"   benchmark_cpur   s       r1   r"   CPUDeviceBenchmarkMixin.do_bench2  s     ((,,r0   r)   r4  rU  r)   r0   r1   rW  rW  1  s/    
 '+	- %- $	-
 
- -r0   rW  c                     ^  \ rS rSr     S                           SU 4S jjjr      S	S jrS rS
S jrSrU =r	$ )TritonBenchmarkRequesti;  c                   > [         TU ]  XX45        XPl        X`l        Xpl        Xl        Xl        Xl        Xl        Xl	        Xl
        g r8   )superrY   module_pathmodule_cache_key
num_stages	num_warpsnum_consumer_groupsnum_buffers_warp_specmatrix_instr_nonkdimwaves_per_eukpack)rX   r  r  r  r  r_  r`  ra  rb  rc  rd  re  rf  rg  	__class__s                 r1   rY   TritonBenchmarkRequest.__init__>  sH      	9KX& 0$"#6 %:"$8!(
r0   c                  [         R                  " U R                  U R                  5      n[        R                  SU R                  U R                  5        [        X0R                  5      R                  n[        U R                  5      nSUR                  l        0 nSS KnSUR                  U5      R                  ;   a  SUS'   UR                   R"                  S:X  a  SnOPUR                   R"                  n	[%        U	5      n
U
R'                  U R(                  R                   R*                  5      n[-        [        X0R                  5      [.        R0                  R2                  R4                  R6                  5      (       a"  [8        R:                  " U/UQUPUQ70 UDSU0D6$ [8        R:                  " U/UQUPUQ70 UDUSS.D6$ )	Nz"benchmark module key: %s, path: %sFr   warmupcpustreamT)rm  benchmark_run)r   load_by_key_pathr`  r_  rC   rD   r
  r  runr   r  __self__with_bandwidth_infoinspect	signature
parametersrV   rE  r   get_raw_streamr  rF  r   rC  	_inductorruntimetriton_heuristicsDebugAutotuner	functoolspartial)rX   r  r  mod
run_methodr  
warmup_argrs  rm  rR  r   s              r1   r  "TritonBenchmarkRequest.make_run_fnY  s    **4+@+@$BRBRS0!!	
 S"2"2377
$//*
27
/ 
w((4???#(Jx ::??e#F**//K7D%44''..44F C))*OO##55DD
 
 $$  	
    $$  	
  " r0   c                    [         R                  " U R                  U R                  5      n[	        XR
                  5      R                  5         g r8   )r   ro  r`  r_  r
  r  
precompile)rX   r}  s     r1   r  !TritonBenchmarkRequest.precompile  s7    **4+@+@$BRBRS%%&113r0   c                Z    SU R                   < SU R                  < SU R                  < 3$ )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r  r_  r`  r   s    r1   __str__TritonBenchmarkRequest.__str__  s2    #$""$$8t'7'7&99RD<Q<Q;STTr0   )	rg  re  r`  r_  rd  rc  ra  rb  rf  )r   r   r   r   r   )r  rt   r  r  r  r  r  r2  r_  rt   r`  rt   ra  r   rb  r   rc  r   rd  r   re  r   rf  r   rg  r   r   r   r3  r   rt   )
r+   r,   r-   r.   rY   r  r  r  r/   __classcell__rh  s   @r1   r\  r\  ;  s     $%%&$% ? @	
 "     !  # "   
 64*41=4	4l4U Ur0   r\  c                      \ rS rSrSrg)TritonGPUBenchmarkRequesti  r)   Nr*   r)   r0   r1   r  r    r2   r0   r  c                      \ rS rSrSrg)TritonCPUBenchmarkRequesti  r)   Nr*   r)   r0   r1   r  r    r2   r0   r  c                     ^  \ rS rSrSr            SU 4S jjrS r      SS jrSS jrS r	SS jr
SS	 jrS
rU =r$ )CUDABenchmarkRequesti  aM  
A class to handle CUDA (CUTLASS) benchmark requests. This class is for
managing the lifecycle of a CUDA kernel benchmark, including compiling
the source code, managing workspace memory, and executing the kernel.

Important: Instances of this class have to be serializable across
process boundaries. Do not put CUDA Tensors in here!
c                   > [         TU ]  XX45        XPl        SU l        S U l        S U l        SU l        SU l        SU l        [        R                  " U R                  S5      u  U l        U l        g )Nr   F so)r^  rY   source_codeworkspace_size	workspaceDLL_workspace_size_updatedhash_keysource_filer   writerX   r  r  r  r  r  rh  s         r1   rY   CUDABenchmarkRequest.__init__  sk     	9KX&#$15)-',$ "*7*=*=d>N>NPT*U't'r0   c                    [         R                  SU 5        [        R                  " U R                  S5        [         R                  SU 5        g)zk
Precompile the CUDA source code to populate the CUDACodeCache.
This may happen in a separate thread pool.
Precompiling %sr  Done precompiling %sN)rC   rD   r   compiler  r   s    r1   r  CUDABenchmarkRequest.precompile  s<    
 	.5d..53T:r0   c          	       ^ U R                  5         U R                  5         [        U5      U/-    Vs/ s H  n[        UR	                  5       5      PM     nn[
        R                  SU R                  U R                  U R                  U R                  UU R                  5        [        [        R                  R                  5       R                  5      n[!        U R                  U R                  5      n[        S5      nU R"                  S:  af  [        R$                  " U R"                  S-   S-  [        R&                  UR(                  S9U l        [        U R*                  R	                  5       5      n[,        R.                  " U/UQU R                  QSPUPUP76 n U" 5         U$ s  snf ! [0         a-  n	[3        U	5      mU4S jn
U R5                  5         U
s Sn	A	$ Sn	A	ff = f)zS
Create a function to run the CUDA kernel with the given input and output tensors.
zqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         )r   rV   Nc                    > [        T 5      er8   )RuntimeError)err_msgs   r1   raise_runtime_error=CUDABenchmarkRequest.make_run_fn.<locals>.raise_runtime_error  s    "7++r0   )ensure_dll_loadedupdate_workspace_sizer   r	   data_ptrrC   rD   r  r  r  r  r  rC  rK  current_streamcuda_streamr
  r  zerosfloat64rV   r  r{  r|  r  rt   r  )rX   r  r  rH  args
stream_ptrr~  workspace_ptrretr>   r  r  s              @r1   r   CUDABenchmarkRequest.make_run_fn  s    	 ""$:>}:MQTPU:UV:U*+:UVMMHHOO	
 ejj779EEF
TXXt'7'78
 ""[[$$q(Q.mmzzDN
 %T^^%<%<%>?M 

 __
 	

 
 
		'E 
W WD  	'!fG, !&&	's#   #F*!F/ /
G&9"G!G&!G&c           
         U R                   (       a  g U R                  5         [        U R                   Vs1 s H  oR                  iM     sn5      n[        US-   5       Vs/ s H  n[        S 5      PM     nn[        [        R                  R                  5       R                  5      n[        U R                  U R                  5      n[        5       nU" / UQU R                  Q[!        U5      PS PUP76   [        R                  R#                  5         UR$                  U l        [(        R+                  SU R&                  U R                  U R,                  U R.                  U R                  UU R                  5        SU l         g s  snf s  snf )Nr    zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)r  r  r   r  r   r   r	   rC  rK  r  r  r
  r  r  r   r  r   rP  valuer  rC   rD   r  r  )rX   metaunique_input_count_r  r  r~  c_workspace_sizes           r1   r  *CUDABenchmarkRequest.update_workspace_size  s^   ''  #'#9#9:#94YY#9:
 )..@1.D(EF(E1(EFejj779EEF
TXXt'7'78
#: 	
	
__	
  	
 	
 	
 	

 .44 hMMHHOO		
 (,$; ;Fs   F"Fc                    U R                   c5  [        R                  " U R                  S5      u  U l         U l        U l        g g )Nr  )r  r   rS   r  r  r  r   s    r1   r  &CUDABenchmarkRequest.ensure_dll_loaded  s:    888E8J8J  $95DHdmT%5 r0   c                n    U R                   b!  U R                   R                  5         S U l         S U l        g r8   )r  rz   r  r   s    r1   r  #CUDABenchmarkRequest.cleanup_run_fn$  s(    88HHNNDHr0   c                Z    SU R                   < SU R                  < SU R                  < 3$ )Nr  z, self.source_file=z, self.hash_key=)r  r  r  r   s    r1   r  CUDABenchmarkRequest.__str__*  s0    #$""$$8t'7'7&99JDMM;KLLr0   )r  r  r  r  r  r  r  r  rt   r  r  r  r  r  r2  r  rt   r   r   r3  r   r  )r+   r,   r-   r.   r   rY   r  r  r  r  r  r  r/   r  r  s   @r1   r  r    s    VV ?V @	V
 "V V 
V$;4*41=4	4l",HM Mr0   r  c                  t   ^  \ rS rSr            SU 4S jjrS r      S	S jrS
S jrSS jrSr	U =r
$ )CppBenchmarkRequesti.  c                `   > [         TU ]  XX45        XPl        [        U5      U l        S U l        g r8   )r^  rY   r  r   r  r  r  s         r1   rY   CppBenchmarkRequest.__init__2  s.     	9KX& -6:r0   c                    [         R                  SU 5        [        R                  " U R                  SS9  [         R                  SU 5        g )Nr  rl  rR  r  )rC   rD   r   rS   r  r   s    r1   r  CppBenchmarkRequest.precompile?  s<     	.5$**>3T:r0   c               h   [         R                  " U R                  SS9U l        [	        U5      U/-    Vs/ s H  o3R                  5       PM     nn[        R                  SU R                  U R                  UU R                  5        [        U R                  U R                  5      n[        S U R                   5       5      (       d   e[        R                  /[        U5      [        [	        U R                  5      5      -   -  Ul        [         R"                  " U/UQU R                  Q76 $ s  snf )Nrl  r  zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc              3  V   #    U  H  n[        U[        R                  5      v   M!     g 7fr8   )r   ctypesc_ulonglong)r   args     r1   r   2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>U  s      R/3:c6#5#566/s   '))r   rS   r  r  r   r  rC   rD   r  r  r
  r   r  r  r   argtypesr{  r|  )rX   r  r  rH  r  r~  s         r1   r  CppBenchmarkRequest.make_run_fnF  s     $$T%5%55I04]0Cse0KL0Kf!0KLXHHOO	
 TXXt'7'78
R$//RRRRR%112ID122


   

 __
 	
! Ms   D/c                    U R                   b8   [        U R                   S5      (       a  U R                   R                  5         g g g )Nrz   )r  hasattrrz   r   s    r1   r  "CppBenchmarkRequest.cleanup_run_fna  s<    88 txx))  *	  r0   c                "    SU R                   < 3$ )Nr  )r  r   s    r1   r  CppBenchmarkRequest.__str__i  s    #$""$%%r0   )r  r  r  r  r3  r   r  )r+   r,   r-   r.   rY   r  r  r  r  r/   r  r  s   @r1   r  r  .  st    ;; ?; @	;
 "; ; 
;;
*
1=
	
6!& &r0   r  c                 Z    [        5       n [        R                  " U R                  5        U $ r8   )r   atexitrp   r   )pools    r1   get_tuning_process_poolr  m  s    D
OODMM"Kr0   c                4    [        5       R                  U 5      $ )zG
Do benchmarking in a subprocess and return the perf number (latency).
)r  r   )r   s    r1   benchmark_in_sub_processr  t  s     #$..w77r0   )r   r   r   )W
__future__r   r  r  dataclassesr{  r(  rE   rM   r   rm   rw   rr   r*  r   collections.abcr   r   concurrent.futuresr   r   r   r	   r
   typingr   r   r   r   r   r   rC  torch._inductor.async_compiletorch._dynamo.device_interfacer   torch._dynamo.testingr   torch._inductorr   torch._inductor.codecacher   r   r   r   r   torch._inductor.utilsr   r   r   torch._loggingr   torch.utils._ordered_setr   typesr    torch._inductor.select_algorithmr   r  r!   runtime.benchmarkingr"   virtualizedr#   r$   r+   rC   r:   r'   r4   r   r   r   LayoutOrBuffer	dataclassr   r  r6  r@  rW  r\  r  r  r  r  cacher  r  r)   r0   r1   <module>r     s   "      	     
   . 1 2 2 D D  $ C .   L K , /  E  -  . "8\:		 	l l^e eP ryy"))+, 3
 3
 3
l ] ] ]@, D   F- -YU- YUx	 79O 		 79O 	LM24D LM^<&13C <&~  8'8&8r0   