
    h5c                        S SK r S SKrS SKrS SKrS SKrS SKrS SKJr  S SKJr  S SK	J
r
  S SKJr  S SKJr  S SKJr  \R"                  R%                  \R"                  R'                  \5      5      r\R"                  R+                  \S5      /r\R"                  R+                  \S	5      rS
/r\ R2                  " 5       S 5       r\ R2                  " 5       S 5       r " S S\5      rS rSSSSSS.rSSSSSS.r Sr!S r" " S S5      r#\$" S \%" S5       5       5      r&S\&S'   S\&S '   S \&S'   S! r'S" r( " S# S$\5      r) " S% S&\5      r*g)'    N)Path)knobs)compile_module_from_src)_allocation)	GPUTarget)	GPUDriverincludelibcudac            	      *   [         R                  R                  =n (       a  U /$ [        R                  " SS/5      R                  5       nUR                  5        Vs/ s H  nSU;   d  M  UR                  5       S   PM      nnU Vs/ s H"  n[        R                  R                  U5      PM$     nn[        R                  " S5      nU(       al  U(       de  UR                  S5       Vs/ s HI  n[        R                  R                  [        R                  R                  US5      5      (       d  MG  UPMK     nnSnU(       a  US[        U5      -  -  nUS	-  nO
US
-  nUS-  n[        S U 5       5      (       d   U5       eU$ s  snf s  snf s  snf )Nz/sbin/ldconfigz-plibcuda.so.1LD_LIBRARY_PATH:zlibcuda.so cannot found!
z!Possible files are located at %s.z:Please create a symlink of libcuda.so to any of the files.z<Please make sure GPU is set up and then run "/sbin/ldconfig"z- (requires sudo) to refresh the linker cache.c              3      #    U  HB  n[         R                  R                  [         R                  R                  US 5      5      v   MD     g7f)r   N)ospathexistsjoin).0r   s     W/var/www/fran/franai/venv/lib/python3.13/site-packages/triton/backends/nvidia/driver.py	<genexpr>libcuda_dirs.<locals>.<genexpr>(   s/     Sddrww~~bggll4@AAds   A
A)r   nvidialibcuda_path
subprocesscheck_outputdecode
splitlinessplitr   r   dirnamegetenvr   r   strany)	env_libcuda_pathlibslinelocslocdirsenv_ld_library_pathdirmsgs	            r   libcuda_dirsr.      s_    <<4444 !!""$4d#;<CCED *.):U):nPT>TDJJL):DU,01DSBGGOOC DD1))$564288=s=PRPWPWP\P\]`bpPqAr=s
&C2SY>>KKMM>>SdSSSXUXXSK V1 ts   
F+F)F*AF4Fc                  $    [         /[        5       Q$ N)libdevice_dirr.        r   library_dirsr4   ,   s    +LN++r3   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	CudaUtils6   c                 n   > [        U S5      (       d  [        [        U ]  U 5      U l        U R                  $ )Ninstance)hasattrsuperr6   __new__r9   )cls	__class__s    r   r<   CudaUtils.__new__8   s-    sJ'' C8=CL||r3   c                 Z   [        [        [        R                  R	                  [
        S5      5      R                  5       S[        5       [        [        S9nUR                  U l
        UR                  U l        UR                  U l        UR                  U l        UR                  U l        g )Nzdriver.c
cuda_utilssrcnamer4   include_dirs	libraries)r   r   r   r   r   r!   	read_textr4   rE   rF   load_binaryget_device_propertiescuOccupancyMaxActiveClustersset_printf_fifo_sizefill_tma_descriptor)selfmods     r   __init__CudaUtils.__init__=   s    %RWW\\':67AAC%%
 ??%(%>%>",/,L,L)$'$<$<!#&#:#: r3   )rJ   rL   rI   rH   rK   )__name__
__module____qualname____firstlineno__r<   rO   __static_attributes____classcell__r>   s   @r   r6   r6   6   s    
; ;r3   r6   c                     U S   S:X  a  gU R                  S5      (       a  g0 SS_SS	_S
S_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_U    $ )Nr   *CUdeviceptr
tensordescCUtensorMapi1int32_ti8int8_ti16int16_ti32i64int64_tu1uint32_tu8uint8_tu16uint16_tu32u64uint64_tfp16doublebf16fp32f32fp64	nvTmaDesc)
startswith)tys    r   	ty_to_cpprx   Q   s    	!u|	}}\""ih 	y 	y	
 	y 	j 	i 	z 	z 	z 	 	 	 	x 	  	]!" 	#
 
r3   rk   rg   rn   )ro   rq   rr   rs   rt   	pack_fp16	pack_bf16	pack_fp32	pack_fp64iiiKKppOOOOOc                 Z  ^^^^ U4S jnU4S jmU4S jmU4S jmU" UR                  5       5      n[        U5       VVs0 s H  u  pVXV_M	     nnnSR                  UR                  5        Vs/ s H  nT" U5      PM     sn5      n[        U-   n	/ n
UR                  5        H  nT" X5        M     [        U
5       VVs0 s H  u  pVXV_M	     nnn[	        U5      S:  a)  SSR                  S UR                  5        5       5      -   OSn/ nUR                  5        HU  u  pWUS	:X  a  M  U[        ;   a  UR                  [        U    S
U 35        M6  UR                  [        U5       S
U 35        MW     SR                  U5      n/ nUR                  5        H~  u  pWUS   S:X  a  UR                  SU S35        M%  U[        ;   a  UR                  SU S35        MF  US:X  a  UR                  SU 35        Mb  US	:w  d  Mj  UR                  SU 35        M     [        [	        U5      5      nSnUR                  5        VVs/ s H  u  pWUS   S:X  d  M  SU SU SU SU S3	PM!     nnnUR                  5        VVs/ s H  u  pWUS:X  d  M  SU SU SU S3PM     nnnUR                  5        VVs/ s H-  u  pWU[        ;   d  M  [        U    SU S[        U    SU S3PM/     nnnUR                  5        VVs/ s H  u  pWUS	:w  d  M  SU 3PM     nnnUR                  S 5        S![	        U5      S:  a  SU-   OS S"SR                  U5       S#UR                  UR                  5        VVs/ s H  u  pWT" U5       SU S$3PM     snn5       S%U	 S&U S'UR                  U5       SUR                  U5       SUR                  U5       S([	        U5      S:  a  SSR                  U5      -   OS S)3nU$ s  snnf s  snf s  snnf s  snnf s  snnf s  snnf s  snnf s  snnf )*Nc                   > / nSnU  GH9  n[        U[        5      (       Ga  UR                  S5      (       a  T
(       a  T
U   OS nUS-  n[        R                  " SU5      nUR                  S5      nUR                  S5      nUR                  S5      S-   nUc;  UR                  SU-   5        [        SU-  5       H  n	UR                  S5        M     OUR                  S	5        [        U5       H  n	UR                  S
5        M     [        U5       H  n	UR                  S5        M     GM(  UR                  U5        GM<     T
(       a  U[        T
5      :X  d   eU$ )Nr   r[      ztensordesc<([^[>]*)\[([^]]*)\]   ,rY   rd   ru   rc   )

isinstancer#   rv   rematchgroupcountappendrangelen)	signatureoutputtensordesc_idxsigmetar   dtypeshapendim_tensordesc_metas             r   _expand_signature(make_launcher.<locals>._expand_signature~   s+    C#s##|(D(D:I~6t!#!CSIAA{{3'!+<MM#+. #1t8_e, - MM+.tAMM%( %tAMM%( % c"7 : #nO8L&LLLr3   c                 v   > [        U [        5      (       a  U  H  nT" X!5        M     g UR                  U 5        g r0   )r   tupler   )r   r   x_flatten_signatures      r   r   )make_launcher.<locals>._flatten_signature   s0    c5!!"1-  MM#r3   c                    > [        U [        5      (       a!  SR                  [        TU 5      5      nSU S3$ U S   S:X  a  gU S;   a  g[	        U 5      $ )Nr   []r   rY   z	PyObject*	constexprru   )r   r   r   maprx   )rw   val_extracted_types     r   r   &make_launcher.<locals>._extracted_type   sW    b%  ((334Cse1:a5C<++}r3   c                    > [        U [        5      (       a!  SR                  [        TU 5      5      nSU S3$ U S   S:X  a  gU S;   a  gU R	                  S5      (       a  gS	S
SSSSSSSSS.
[        U 5         $ )N ()r   rY   Or   r[   dlbhiLBHIK)
rp   longr`   rb   r^   re   ri   rk   rg   rn   )r   r   r   r   rv   rx   )rw   r   	format_ofs     r   r    make_launcher.<locals>.format_of   s    b%  ''#i,-Cse1:a5C<++==&&
 B- 	r3   r   r   z, c              3   0   #    U  H  u  pS U 3v   M     g7f)z&_argNr2   )r   r   rw   s      r   r    make_launcher.<locals>.<genexpr>   s      L:K5:Ks   r   z argrY   ptr_infoz.dev_ptr_arg_storageru   z*tma_ptrz
  zDevicePtrInfo ptr_infoz = getPointer(_argz); if (!ptr_infoz.valid) return NULL;zCUtensorMap* tma_ptrz = getTmaDesc(_argz); if (!tma_ptrz) return NULL;z _argz_storage = z(_argz);z&argz&global_scratchaR  
#include "cuda.h"
#include <stdbool.h>
#include <Python.h>
#include <dlfcn.h>

static inline void gpuAssert(CUresult code, const char *file, int line)
{
   if (code != CUDA_SUCCESS)
   {
      const char* prefix = "Triton Error [CUDA]: ";
      const char* str;
      cuGetErrorString(code, &str);
      char err[1024] = {0};
      strcat(err, prefix);
      strcat(err, str);
      PyGILState_STATE gil_state;
      gil_state = PyGILState_Ensure();
      PyErr_SetString(PyExc_RuntimeError, err);
      PyGILState_Release(gil_state);
   }
}

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

static cuLaunchKernelEx_t getLaunchKernelExHandle() {
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }
  return cuLaunchKernelExHandle;
}

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int launch_pdl, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratchz) {
  void *params[] = { a,   };
  if (gridX*gridY*gridZ > 0) {
    // 4 attributes that we can currently pass maxmimum
    CUlaunchAttribute launchAttr[4];
    static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
    if (cuLaunchKernelExHandle == NULL) {
      cuLaunchKernelExHandle = getLaunchKernelExHandle();
    }
    CUlaunchConfig config;
    config.gridDimX = gridX;
    config.gridDimY = gridY;
    config.gridDimZ = gridZ;

    if (num_ctas != 1) {
      config.gridDimX *= clusterDimX;
      config.gridDimY *= clusterDimY;
      config.gridDimZ *= clusterDimZ;
    }

    config.blockDimX = 32 * num_warps;
    config.blockDimY = 1;
    config.blockDimZ = 1;
    config.sharedMemBytes = shared_memory;
    config.hStream = stream;
    config.attrs = launchAttr;
    int num_attrs = 0;

    if (launch_pdl != 0) {
      CUlaunchAttribute pdlAttr = { .id = CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION, .value = 1};
      launchAttr[num_attrs] = pdlAttr;
      ++num_attrs;
    }

    if (launch_cooperative_grid != 0) {
      CUlaunchAttribute coopAttr = { .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1};
      launchAttr[num_attrs] = coopAttr;
      ++num_attrs;
    }

    if (num_ctas != 1) {
      CUlaunchAttribute clusterAttr = {};
      clusterAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      clusterAttr.value.clusterDim.x = clusterDimX;
      clusterAttr.value.clusterDim.y = clusterDimY;
      clusterAttr.value.clusterDim.z = clusterDimZ;
      launchAttr[num_attrs] = clusterAttr;
      ++num_attrs;

      CUlaunchAttribute clusterSchedulingAttr = {};
      clusterSchedulingAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      clusterSchedulingAttr.value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
      launchAttr[num_attrs] = clusterSchedulingAttr;
      ++num_attrs;
    }

    config.numAttrs = num_attrs;

    CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
  }
}

typedef struct _DevicePtrInfo {
    CUdeviceptr dev_ptr;
    bool valid;
} DevicePtrInfo;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
  if(ptr){
    PyObject *empty_tuple = PyTuple_New(0);
    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
    Py_DECREF(empty_tuple);
    Py_DECREF(ptr);
    if (!PyLong_Check(ret)) {
      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
      ptr_info.valid = false;
      return ptr_info;
    }
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
    if(!ptr_info.dev_ptr)
      return ptr_info;
    uint64_t dev_ptr;
    int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
    if (status == CUDA_ERROR_INVALID_VALUE) {
        PyErr_Format(PyExc_ValueError,
                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
        ptr_info.valid = false;
    } else if (status != CUDA_SUCCESS) {
        CUDA_CHECK(status);  // Catch any other cuda API errors
        ptr_info.valid = false;
    }
    ptr_info.dev_ptr = dev_ptr;
    Py_DECREF(ret);  // Thanks ChatGPT!
    return ptr_info;
  }
  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
  ptr_info.valid = false;
  return ptr_info;
}

static inline CUtensorMap* getTmaDesc(PyObject *obj) {
  if (sizeof(CUtensorMap*) != 8) {
    PyErr_SetString(PyExc_SystemError, "getTmaDesc() requires 64-bit compilation");
    return NULL;
  }

  PyObject *method_handle = PyObject_GetAttrString(obj, "tma_desc_cpu_ptr");
  if (!method_handle) {
    PyErr_SetString(PyExc_TypeError, "tma_desc_cpu_ptr() method does not exist");
    return NULL;
  }

  PyObject *empty_tuple = PyTuple_New(0);
  if (!empty_tuple) {
    Py_DECREF(method_handle);
    PyErr_SetString(PyExc_SystemError, "Internal Python error!");
    return NULL;
  }
  PyObject *method_ret = PyObject_Call(method_handle, empty_tuple, NULL);
  Py_DECREF(empty_tuple);
  Py_DECREF(method_handle);
  if (!method_ret) {
    PyErr_SetString(PyExc_SystemError, "Internal Python error!");
    return NULL;
  }

  if (!PyLong_Check(method_ret)) {
    PyErr_SetString(PyExc_TypeError, "tma_desc_cpu_ptr() must return 64-bit int");
    Py_DECREF(method_ret);
    return NULL;
  }

  uint64_t ptr_as_uint = PyLong_AsUnsignedLongLong(method_ret);
  Py_DECREF(method_ret);
  if (!ptr_as_uint) {
    PyErr_SetString(PyExc_ValueError, "received NULL ptr from tma_desc_cpu_ptr()");
    return NULL;
  }
  if (ptr_as_uint % 64 != 0) {
    PyErr_SetString(PyExc_ValueError, "tma_desc_cpu_ptr() must be 64-byte aligned");
    return NULL;
  }

  return (CUtensorMap*)(ptr_as_uint);
}

static void ensureCudaContext() {
  CUcontext pctx;
  CUDA_CHECK(cuCtxGetCurrent(&pctx));
  if (!pctx) {
    // Ensure device context.
    CUdevice device;
    CUDA_CHECK(cuDeviceGet(&device, 0));
    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK(cuCtxSetCurrent(pctx));
  }
}

static uint16_t pack_fp16(double f) {
    uint16_t result;
    // from https://github.com/python/pythoncapi-compat
#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
    _PyFloat_Pack2(f, (unsigned char*)&result, 1);
#else
    PyFloat_Pack2(f, (unsigned char*)&result, 1);
#endif
    return result;
}

static uint16_t pack_bf16(double f) {
    float f32 = (float)f;
    uint32_t u32 = *(uint32_t*)&f32;
    return (uint16_t)(u32 >> 16);
}

static uint32_t pack_fp32(double f) {
    float f32 = (float)f;
    return *(uint32_t*)&f32;
}

static uint64_t pack_fp64(double f) {
    return *(uint64_t*)&f;
}

static PyObject* launch(PyObject* self, PyObject* args) {
  // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
  ensureCudaContext();

  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  int launch_cooperative_grid;
  int launch_pdl;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
  PyObject *global_scratch_obj = NULL;
  ;z
  if(!PyArg_ParseTuple(args, "a7  ", &gridX, &gridY, &gridZ,
                                           &_stream, &_function, &launch_cooperative_grid, &launch_pdl, &global_scratch_obj,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hooka  )) {
    return NULL;
  }

  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }

  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  CUdeviceptr global_scratch = 0;
  if (global_scratch_obj != Py_None) {
    DevicePtrInfo global_scratch_info = getPointer(global_scratch_obj, -1);
    if (!global_scratch_info.valid) {
      return NULL;
    }
    global_scratch = global_scratch_info.dev_ptr;
  }

  // raise exception asap
  z
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, launch_pdl, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratchaC  );
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {
    return NULL;
  }

  if(launch_exit_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
    Py_DECREF(ret);
  }

  Py_RETURN_NONE;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)values	enumerater   _BASE_ARGS_FORMATr   itemsFLOAT_STORAGE_TYPEr   rx   r   FLOAT_PACK_FUNCTION)	constantsr   r   r   expand_signaturer   srw   args_formatformatflat_signaturer   	args_listarg_decl_list	arg_declsinternal_args_listparamsnewline	ptr_decls	tma_declsfloat_storage_declsrC   r   r   r   s     `                   @@@r   make_launcherr   |   s   $L. ))9)9);<"+,<"=>"=$!"=I>''93C3C3EF3ER9R=3EFGK,FN!3/ ""+N";<";$!";I<PST]P^abPbtyy L)//:K LLLhjI M"##  $6r$:#;4s!CD  IbM?$qc!:; # 		-(I"a5C<%%8&<=%%%%QCx&89;%%n5;%%QCj1 # 3y>"F G __&&EAa5C< 	f
 #5aS1#=MaSPde&   foetetevev\a\] 	X
qc!3A3oaSWev   __&&EA## 	Zb!
"%s+6I"6M5NeTUSVVXY&  
 '0oo&7M&7UQ2;LjQCj&7FM
MM#$.F\ [^  _h  [i  lm  [m  GK  NW  GW  su  Fv vyy() Q*b <<	@QR@QuqOB'(aS2@QRST U  &x (Q R[P[ \< <<	 
<<	 
<<#$% &H qt  uG  qH  KL  qL  IM  PT  PY  PY  Zl  Pm  Im  RT  HU 'UM	MC\
 JE ?F =8

 ND SsH   O>/P P	PP PP6P
 P P!	P!5P'c                   $    \ rS rSrSrS rS rSrg)TmaDescKernelParamiS     c                 `    SS K nUR                  U R                  UR                  SS9U l        g )Nr   cpur   device)torchemptyTMA_DESC_SIZEuint8descrM   r   s     r   rO   TmaDescKernelParam.__init__V  s%    KK 2 2%++eKT	r3   c                 6    U R                   R                  5       $ r0   )r   data_ptr)rM   s    r   tma_desc_cpu_ptr#TmaDescKernelParam.tma_desc_cpu_ptr[  s    yy!!##r3   )r   N)rQ   rR   rS   rT   r   rO   r   rU   r2   r3   r   r   r   S  s    MU
$r3   r   c              #   (   #    U  H  oU4v   M
     g 7fr0   r2   )r   r   s     r   r   r   `  s     :	1A	s      
      	   c           
      D   Uc=  U R                   /U R                  QU R                  QU R                  QU R                  Q$ US   nUS   nUS   nUS   nUS   nU R                   R                  5       nU R                  nU R                  n	U	S   S:X  d   e[	        5       n
U
/UQU	QnU(       a  [        U5      nUS==   S-  ss'   [        R                  R                  R                  R                  R                  U
R                  5       UUU[        U   UUU	5        U$ )	Nswizzle	elem_size	elem_type
block_size
fp4_paddedr   r   r   )baser   stridesr   r   listtritonruntimedriveractiveutilsrL   r   TMA_DTYPE_DEVICE_TO_HOST)argmetadatar   r   r   r   r   r   r   r   r   results               r   make_tensordesc_argr   f  s'    M399Ms{{MSYYMMMy!G%I%I,'J,'Jxx  "HIIEkkG2;!D%U%W%FUb	Q	
NN  &&:: +	 Mr3   c                 8   ^ ^^^ SSK Jm  SSKJm  UUU U4S jnU$ )Nr   )TensorDescriptorc                  f  > U S [        [        5       nU [        [        5      S  nSn/ n[        U5       HX  u  pV[        UT	T45      (       a/  T(       a  TU   OS nUS-  nUR	                  [        Xg5      5        MG  UR                  U5        MZ     T(       a  U[        T5      :X  d   eT
" / UQUQ76 $ )Nr   r   )r   r   r   r   extendr   r   )args	meta_argsraw_kernel_argsr   
final_argsr   r   r   GluonTensorDescriptorr   launcherr   s           r   inner%wrap_handle_tensordesc.<locals>.inner  s    0#/01	s#4567
0FA# 02GHII:I~6t!#!!"5c"@A!!#& 1 #nO8L&LLL00Z00r3   )triton.tools.tensor_descriptorr   'triton.experimental.gluon.nvidia.hopper)r  r   r  r  r   s   `` @@r   wrap_handle_tensordescr    s    ?a1 1 Lr3   c                        \ rS rSrS rS rSrg)CudaLauncheri  c                 $  ^ [        TS5      (       a  TR                  O	[        5       nU4S jnUR                  5        VVs0 s H  u  pVU" U5      U_M     nnnTR                  R                  5        VVs0 s H  u  pVXV_M	     nnn[        USS 5      n[        X7U5      m[        TS[        5       [        [        S9n	[        S UR                  5        5       5      n
[        R                  " [        R                   UR"                  S5      U l        U
(       a  ['        U	R(                  U5      OU	R(                  U l        UR*                  U l        UR,                  U l        UR.                  U l        UR0                  U l        g s  snnf s  snnf )Nr   c                 ~   > [        U [        5      (       a&  TR                  R                  R	                  U 5      4$ U $ r0   )r   r#   fn	arg_namesindex)r   rC   s    r   <lambda>'CudaLauncher.__init__.<locals>.<lambda>  s2    Z3=O=OSVV--33A69VUVVr3   r   __triton_launcherrB   c              3   r   #    U  H-  n[        U[        5      =(       a    UR                  S 5      v   M/     g7f)r[   N)r   r#   rv   )r   r   s     r   r   (CudaLauncher.__init__.<locals>.<genexpr>  s+     !vcu\_*S#"6"W3>>,;W"Wcus   57r   )r:   r   dictr   r   getattrr   r   r4   rE   rF   r$   r   	functoolsreduceoperatormulcluster_dimsnum_ctasr  launchglobal_scratch_sizeglobal_scratch_alignlaunch_cooperative_grid
launch_pdl)rM   rC   r   r   arg_idxidxvaluer   r   rN   has_tensor_desc_args    `         r   rO   CudaLauncher.__init__  sA   %,S+%>%>CMMDF	V;D??;LM;LZSWS\5(;L	M25--2E2E2GH2GJCSZ2G	H!(,=tDI/B%$%%
 "!vclcscscu!vv!((x7L7LaPM`,SZZIfifpfp#+#?#? $,$A$A!'/'G'G$"--% NHs   F7Fc           
         U R                   S:  aD  X-  U-  nXpR                  -  U R                   -  n[        R                  " XR                  U5      n	OS n	U R
                  " XX4XPR                  U R                  U	/UQ76   g Nr   )r!  r  r   
_allocatorr"  r   r#  r$  )
rM   gridXgridYgridZstreamfunctionr  	grid_size
alloc_sizeglobal_scratchs
             r   __call__CudaLauncher.__call__  s}    ##a'-I"]]2T5M5MMJ(33J@Y@Y[abN!NE%;W;WY]YhYh"	+%)	+r3   )r"  r!  r   r#  r$  r  N)rQ   rR   rS   rT   rO   r5  rU   r2   r3   r   r  r    s    ..+r3   r  c                   \   ^  \ rS rSrU 4S jrS rS rS r\S 5       r	S r
S rS	 rS
rU =r$ )
CudaDriveri  c                 V   > [        5       U l        [        U l        [        TU ]  5         g r0   )r6   r   r  launcher_clsr;   rO   )rM   r>   s    r   rO   CudaDriver.__init__  s    [
(r3   c                 |    U R                  5       nU R                  U5      nUS   S-  US   -   nSn[        SX#5      $ )Nr   r   r       r   )get_current_deviceget_device_capabilityr   )rM   r   
capability	warp_sizes       r   get_current_targetCudaDriver.get_current_target  sI    ((*//7
]R'*Q-7
	77r3   c                 J    SS K nUR                  SU R                  5       5      $ )Nr   r   )r   r   r>  r   s     r   get_active_torch_device"CudaDriver.get_active_torch_device  s    ||FD$;$;$=>>r3   c                 "    SS K nUR                  $ r+  )r   r   r   s     r   get_device_interfaceCudaDriver.get_device_interface  s    zzr3   c                       SS K n U R                  R                  5       =(       a    U R                  R                  S L $ ! [
         a     gf = f)Nr   F)r   r   is_availableversionhipImportError)r   s    r   	is_activeCudaDriver.is_active  sC    	::**,L%--2C2Ct2KL 		s   <? 
AAc                     SSK Jn  U$ )Nr   )do_bench)triton.testingrR  )rM   rR  s     r   get_benchmarkerCudaDriver.get_benchmarker  s
    +r3   c                 \    SS K nSnUR                  [        US-  5      UR                  SS9$ )Nr   i      r   r   )r   r   int)rM   r   
cache_sizes      r   get_empty_cache_for_benchmark(CudaDriver.get_empty_cache_for_benchmark  s.    
 '
{{3zQ/uyy{PPr3   c                 $    UR                  5         g r0   )zero_)rM   caches     r   clear_cacheCudaDriver.clear_cache  s    r3   )r:  r   )rQ   rR   rS   rT   rO   rB  rE  rH  staticmethodrO  rT  rZ  r_  rU   rV   rW   s   @r   r8  r8    s@    
8?  Q r3   r8  )+r  r  r   r   r   r   pathlibr   r   triton.runtime.buildr   triton.runtimer   triton.backends.compilerr   triton.backends.driverr   r   r!   realpath__file__r   rE   r1   rF   	lru_cacher.   r4   objectr6   rx   r   r   r   r   r   r  r   r   r   r  r  r8  r2   r3   r   <module>rk     sn     	   	   8 & . ,
''//"''**84
5Wi01We,H	  . , ,; ;6
4     # Tn	$ 	$  :b	::         %P,!+6 !+H, ,r3   