
import functools
import logging
from typing import Any, Optional

import sympy

import torch
from torch._dynamo.utils import counters
from torch._inductor.autoheuristic.autoheuristic import AutoHeuristicSelectAlgorithm
from torch._inductor.autoheuristic.autoheuristic_utils import (
    AHContext, context_add_strides, context_add_using_tf32, mm_operations,
)
from torch._inductor.codegen.cpp_gemm_template import CppGemmTemplate
from torch._inductor.virtualized import V
from torch.fx.experimental.proxy_tensor import make_fx
from torch.torch_version import TorchVersion

from .. import config as inductor_config, ir
from ..codegen.cuda.gemm_template import CUTLASS2xGemmTemplate, CUTLASS3xGemmTemplate
from ..codegen.rocm.ck_tile_universal_gemm_template import CKTileGemmTemplate
from ..codegen.rocm.ck_universal_gemm_template import CKGemmTemplate
from ..codegen.subgraph import SubgraphTemplate
from ..ir import FlexibleLayout, is_triton
from ..lowering import (
    add_layout_constraint, constrain_to_fx_strides, lowerings, register_lowering,
)
from ..select_algorithm import (
    autotune_select_algorithm, ExternKernelChoice, realize_inputs, TritonTemplate,
)
from ..utils import (
    _use_cutlass_for_op, get_k_splits, get_tma_workspace_arg, use_aten_gemm_kernels,
    use_ck_gemm_template, use_ck_tile_gemm_template, use_cpp_gemm_template,
    use_cutlass_template, use_decompose_k_choice, use_triton_template,
    use_triton_tma_template,
)
from .mm_common import (
    _is_static_problem, addmm_epilogue, mm_args, mm_config_kwargs, mm_grid,
    mm_options, persistent_mm_grid, persistent_mm_options, scale_mm_epilogue,
    scaled_mm_options,
)

try:
    import triton

    triton_version = TorchVersion(triton.__version__)
    has_triton = True
except ImportError:
    triton_version = TorchVersion("0.0.0")
    has_triton = False

log = logging.getLogger(__name__)
aten = torch.ops.aten
prims = torch.ops.prims

mm_template = TritonTemplate(
    name="mm",
    grid=mm_grid,
    source=r"""
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)
    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)
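    # Consecutive programs in a group sweep GROUP_M row-blocks against one column-block
    # before pid_n advances, so both the A row-tiles and the shared B column-tile stay
    # resident in L2 while the group is being processed.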

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if ((stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1)) and M >= BLOCK_M:
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if ((stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1)) and N >= BLOCK_N:
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
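    # tl.multiple_of / tl.max_contiguous are compiler hints: when the operand is
    # contiguous along the tile dimension (checked via its strides above), they tell
    # Triton the offsets form an aligned, contiguous block so the loads can be coalesced.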
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}

        {% if USE_FAST_ACCUM %}
        acc = tl.dot(a, b, acc, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% else %}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% endif %}

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
"""
    # Two near-identical kernel bodies: the one above adds M >= BLOCK_M / N >= BLOCK_N
    # guards to the contiguity hints; other builds fall back to the unguarded variant.
    if torch.version.hip is None and triton_version >= "3.3.0"
    else r"""
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)
    tl.assume(pid_m >= 0)
    tl.assume(pid_n >= 0)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1):
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1):
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}
        {% if USE_FAST_ACCUM %}
        acc = tl.dot(a, b, acc, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% else %}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32, out_dtype=ACC_TYPE)
        {% endif %}

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
""",
    cache_codegen_enabled_for_template=True,
    prologue_loads_all_inputs=True,
)
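

# Purely illustrative helper (an assumption of this sketch; it is not used by the
# lowerings below): the GROUP_M swizzle in the mm kernels maps a linear program id onto
# a (pid_m, pid_n) tile so that at most GROUP_M row-blocks are in flight at a time,
# keeping the B column-blocks they share hot in L2. It mirrors the kernel arithmetic.
def _swizzled_tile_example(pid, m, n, block_m, block_n, group_m):
    grid_m = (m + block_m - 1) // block_m
    grid_n = (n + block_n - 1) // block_n
    width = group_m * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * group_m, group_m)
    pid_m = group_id * group_m + (pid % group_size)
    pid_n = (pid % width) // group_size
    return pid_m, pid_n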
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return

    start_pid = tl.program_id(0)
    grid_m = tl.cdiv(M, BLOCK_M)
    grid_n = tl.cdiv(N, BLOCK_N)
    k_tiles = tl.cdiv(K, BLOCK_K)
    num_tiles = grid_m * grid_n
    tiles_per_SM = num_tiles // NUM_SMS
    if start_pid < num_tiles % NUM_SMS:
        tiles_per_SM += 1

    tile_id = start_pid - NUM_SMS
    ki = -1

    width = GROUP_M * grid_n
    rk_for_mask = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    workspace_base = ws_ptr + start_pid * 2 * TMA_SIZE
    a_desc_ptr = workspace_base
    b_desc_ptr = workspace_base + TMA_SIZE

    {%- if TMA_EXPERIMENTAL_API %}
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=a_desc_ptr,
        global_address=A,
        load_size=[BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
        global_size=[M, K] if A_ROW_MAJOR else [K, M],
        element_ty=A.dtype.element_ty,
    )
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=b_desc_ptr,
        global_address=B,
        load_size=[BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
        global_size=[K, N] if B_ROW_MAJOR else [N, K],
        element_ty=B.dtype.element_ty,
    )

    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(a_desc_ptr)
    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(b_desc_ptr)

    a_desc = a_desc_ptr
    b_desc = b_desc_ptr
    {%- else %}
    a_desc = triton.language.make_tensor_descriptor(
        base=A,
        shape=[M, K] if A_ROW_MAJOR else [K, M],
        strides=[K, 1] if A_ROW_MAJOR else [M, 1],
        block_shape=[BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
    )
    b_desc = triton.language.make_tensor_descriptor(
        base=B,
        shape=[K, N] if B_ROW_MAJOR else [N, K],
        strides=[N, 1] if B_ROW_MAJOR else [K, 1],
        block_shape=[BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
    )
    {%- endif %}
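    # Either branch yields TMA descriptors for A and B: the experimental API writes
    # tensormaps into this program's workspace slice and fences them, while the newer
    # API constructs tensor descriptors directly; the main loop below issues bulk tile
    # loads through whichever descriptors were built.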

    pid_m = 0
    pid_n = 0
    rm = 0
    rn = 0

    for _ in range(0, k_tiles * tiles_per_SM):
        ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
        if ki == 0:
            tile_id += NUM_SMS
            # re-order program ID for better L2 performance
            group_id = tile_id // width
            group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
            pid_m = group_id * GROUP_M + (tile_id % group_size)
            pid_n = (tile_id % width) // (group_size)

            rm = pid_m * BLOCK_M
            rn = pid_n * BLOCK_N

        rk = ki * BLOCK_K

        {%- if TMA_EXPERIMENTAL_API %}
        a = tl._experimental_descriptor_load(
            a_desc,
            [rm, rk] if A_ROW_MAJOR else [rk, rm],
            [BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
            A.dtype.element_ty,
        )
        b = tl._experimental_descriptor_load(
            b_desc,
            [rk, rn] if B_ROW_MAJOR else [rn, rk],
            [BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
            B.dtype.element_ty,
        )
        {%- else %}
        a = tl.load_tensor_descriptor(
            a_desc,
            [rm, rk] if A_ROW_MAJOR else [rk, rm],
        )
        b = tl.load_tensor_descriptor(
            b_desc,
            [rk, rn] if B_ROW_MAJOR else [rn, rk],
        )
        {%- endif %}
        acc += tl.dot(
            a if A_ROW_MAJOR else a.T,
            b if B_ROW_MAJOR else b.T,
            allow_tf32=ALLOW_TF32,
        )
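        # The descriptors use each tensor's native layout, so a column-major operand is
        # loaded as a K-major tile and transposed here; tl.dot always sees a
        # (BLOCK_M, BLOCK_K) x (BLOCK_K, BLOCK_N) product.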

        if ki == k_tiles - 1:
            # rematerialize rm and rn to save registers
            rcm = rm + tl.arange(0, BLOCK_M)
            rcn = rn + tl.arange(0, BLOCK_N)
            idx_m = rcm[:, None]
            idx_n = rcn[None, :]
            mask = (idx_m < M) & (idx_n < N)

            # inductor generates a suffix
            {{store_output(("idx_m", "idx_n"), "acc", "mask", indent_width=12)}}
            acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
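            # This persistent program does not exit after storing a tile; the
            # accumulator is zeroed so the next output tile starts from scratch.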

""",
)
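

# Purely illustrative helper (an assumption of this sketch; it is not used by the
# lowerings below): the persistent kernel above launches at most NUM_SMS programs and
# each one walks a strided list of output tiles, finishing every K block of a tile
# before moving to its next tile. This mirrors the `tile_id` / `ki` bookkeeping in the
# kernel and lists the (tile, k-block) pairs one program visits.
def _persistent_schedule_example(start_pid, num_tiles, k_tiles, num_sms):
    tiles_per_sm = num_tiles // num_sms
    if start_pid < num_tiles % num_sms:
        tiles_per_sm += 1
    schedule = []
    tile_id = start_pid - num_sms
    ki = -1
    for _ in range(k_tiles * tiles_per_sm):
        ki = 0 if ki == k_tiles - 1 else ki + 1
        if ki == 0:
            tile_id += num_sms
        schedule.append((tile_id, ki))
    return schedule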


load_scales = r"""
@triton.jit
def load_scales(a_scale_ptr, b_scale_ptr, SCALING_ROWWISE: tl.constexpr):
    if SCALING_ROWWISE:
        # For row-wise scaling, we'll return the pointers
        return a_scale_ptr, b_scale_ptr
    else:
        # For per-tensor scaling, we'll load the scalar values
        a_scale = tl.load(a_scale_ptr)
        b_scale = tl.load(b_scale_ptr)
        return a_scale, b_scale
"""

apply_scaling = r"""
@triton.jit
def apply_scaling(
    accumulator,
    a_scale,
    b_scale,
    SCALING_ROWWISE: tl.constexpr,
    offs_cm,
    offs_cn,
    M,
    N,
    stride_a_scale_m,
    stride_b_scale_n,
):
    if SCALING_ROWWISE:
        # For row-wise scaling, we need to load the scales for each row/column
        a_scales = tl.load(
            a_scale + (offs_cm * stride_a_scale_m),
            mask=offs_cm < M,
            other=0.0,
        )
        b_scales = tl.load(
            b_scale + (offs_cn * stride_b_scale_n),
            mask=offs_cn < N,
            other=0.0,
        )
        acc_scale = a_scales[:, None] * b_scales[None, :]
    else:
        # For per-tensor scaling, we can directly use the loaded scalar values
        acc_scale = a_scale * b_scale

    return accumulator * acc_scale
"""
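

# Eager-mode sketch of what load_scales/apply_scaling compute (purely illustrative; the
# helper name is an assumption of this sketch): with row-wise scaling the fp32
# accumulator is multiplied by the outer product of A's per-row scales and B's
# per-column scales; with per-tensor scaling it is multiplied by two scalars.
def _apply_scaling_reference(acc, a_scale, b_scale, rowwise):
    if rowwise:
        return acc * (a_scale[:, None] * b_scale[None, :])
    return acc * (a_scale * b_scale)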
{{def_kernel("A", "B", "A_inverse_scale", "B_inverse_scale")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return

    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    if SCALING_ROWWISE:
        stride_a_scale_m = 1
        stride_b_scale_n = 1
    else:
        stride_a_scale_m = 0
        stride_b_scale_n = 0
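    # Row-wise scaling walks one scale per output row/column (stride 1); per-tensor
    # scaling keeps a single scalar per operand, so the strides collapse to 0 and
    # apply_scaling just multiplies by a_scale * b_scale.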

    start_pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_M)
    num_pid_n = tl.cdiv(N, BLOCK_N)
    k_tiles = tl.cdiv(K, BLOCK_K)
    num_tiles = num_pid_m * num_pid_n

    workspace_base = ws_ptr + start_pid * 2 * TMA_SIZE
    a_desc_ptr = workspace_base
    b_desc_ptr = workspace_base + TMA_SIZE

    {%- if TMA_EXPERIMENTAL_API %}
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=a_desc_ptr,
        global_address=A,
        load_size=[BLOCK_M, BLOCK_K],
        global_size=[M, K],
        element_ty=A.dtype.element_ty,
    )
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=b_desc_ptr,
        global_address=B,
        load_size=[BLOCK_N, BLOCK_K],
        global_size=[N, K],
        element_ty=B.dtype.element_ty,
    )

    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(a_desc_ptr)
    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(b_desc_ptr)

    a_desc = a_desc_ptr
    b_desc = a_desc_ptr
    {%- else %}
    a_desc = triton.language.make_tensor_descriptor(
        base=A,
        shape=[M, K],
        strides=[K, 1],
        block_shape=[BLOCK_M, BLOCK_K],
    )
    b_desc = triton.language.make_tensor_descriptor(
        base=B,
        shape=[N, K],
        strides=[K, 1],
        block_shape=[BLOCK_N, BLOCK_K],
    )
    {%- endif %}

    tiles_per_SM = num_tiles // NUM_SMS
    if start_pid < num_tiles % NUM_SMS:
        tiles_per_SM += 1

    tile_id = start_pid - NUM_SMS
    ki = -1

    pid_m = 0
    pid_n = 0
    offs_am = 0
    offs_bn = 0

    num_pid_in_group = GROUP_M * num_pid_n
    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    a_scale, b_scale = load_scales(A_inverse_scale, B_inverse_scale, SCALING_ROWWISE)

    for _ in range(0, k_tiles * tiles_per_SM):
        ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
        if ki == 0:
            tile_id += NUM_SMS
            group_id = tile_id // num_pid_in_group
            first_pid_m = group_id * GROUP_M
            group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
            pid_m = first_pid_m + (tile_id % group_size_m)
            pid_n = (tile_id % num_pid_in_group) // group_size_m

            offs_am = pid_m * BLOCK_M
            offs_bn = pid_n * BLOCK_N

        offs_k = ki * BLOCK_K

        {%- if TMA_EXPERIMENTAL_API %}
        a = tl._experimental_descriptor_load(
            a_desc_ptr, [offs_am, offs_k], [BLOCK_M, BLOCK_K],  A.dtype.element_ty
        )
        b = tl._experimental_descriptor_load(
            b_desc_ptr, [offs_bn, offs_k], [BLOCK_N, BLOCK_K],  B.dtype.element_ty
        )
        {%- else %}
        a = tl.load_tensor_descriptor(a_desc, [offs_am, offs_k])
        b = tl.load_tensor_descriptor(b_desc, [offs_bn, offs_k])
        {%- endif %}
        if USE_FAST_ACCUM:
            accumulator = tl.dot(a, b.T, accumulator)
        else:
            accumulator += tl.dot(a, b.T)
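        # Fast accumulation feeds the running sum straight into tl.dot so the MMA
        # accumulates in place; the `accumulator +=` form accumulates outside the dot,
        # which is typically slower for fp8 but avoids the reduced-precision
        # fast-accum path.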

        if ki == k_tiles - 1:
            # Apply inverse scaling
            offs_cm = offs_am + tl.arange(0, BLOCK_M)
            offs_cn = offs_bn + tl.arange(0, BLOCK_N)
            # Apply scaling
            accumulator = apply_scaling(
                accumulator,
                a_scale,
                b_scale,
                SCALING_ROWWISE,
                offs_cm,
                offs_cn,
                M,
                N,
                stride_a_scale_m,
                stride_b_scale_n,
            )

            idx_m = offs_cm[:, None]
            idx_n = offs_cn[None, :]
            mask = (idx_m < M) & (idx_n < N)
            # inductor generates a suffix
            {{store_output(("idx_m", "idx_n"), "accumulator", "mask", indent_width=12)}}
            accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
"""

scaled_mm_device_tma_template = TritonTemplate(
    name="scaled_mm_device_tma",
    grid=persistent_mm_grid,
    source=device_tma + load_scales + apply_scaling,
)


@functools.cache
def lazy_register_extern_choice(fn):
    return ExternKernelChoice(fn)


aten_mm = ExternKernelChoice(torch.mm, "at::mm_out")

aten_addmm = ExternKernelChoice(
    torch.addmm, "at::addmm_out", op_overload=aten.addmm.default
)

aten__int_mm = ExternKernelChoice(torch._int_mm, "at::_int_mm_out")

aten__sparse_semi_structured_mm = ExternKernelChoice(
    torch._sparse_semi_structured_mm,
    "at::_sparse_semi_structured_mm",
    has_out_variant=False,
)

aten__fp8_mm = ExternKernelChoice(
    torch._scaled_mm, "at::_scaled_mm_out", op_overload=aten._scaled_mm.default
)


def _is_int8_mat(mat):
    return mat.get_dtype() in (torch.int8, torch.uint8)


def _is_large_block_for_cpu(m, n, k):
    return m * n > 2**13


@functools.lru_cache
def using_b200() -> bool:
    """Returns true if the device is a NVIDIA B200, otherwise returns false."""
    if not torch.cuda.is_available():
        return False
    device_properties = torch.cuda.get_device_properties(torch.cuda.current_device())
    return device_properties.major == 10


def bias_addmm(inp, mat1, mat2, *, out=None, alpha=1, beta=1):
    """
    Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
    kernel under the hood.  There are a few shapes where this is slower,
    but they are rare.
    """
    if inp.stride(0) == 0 or inp.size(0) == 1:
        return torch.addmm(inp[0], mat1, mat2, out=out, alpha=alpha, beta=beta)
    return torch.addmm(inp, mat1, mat2, out=out, alpha=alpha, beta=beta)


aten_bias_addmm = ExternKernelChoice(bias_addmm, None)


def check_supported_striding(mat_a, mat_b) -> None:
    def is_row_major(stride) -> bool:
        return V.graph.sizevars.statically_known_equals(stride[1], 1)

    def is_col_major(stride) -> bool:
        return V.graph.sizevars.statically_known_equals(stride[0], 1)

    def has_zero_dim(size) -> bool:
        return bool(
            V.graph.sizevars.statically_known_equals(size[0], 0)
            or V.graph.sizevars.statically_known_equals(size[1], 0)
        )

    # Check mat_a
    torch._check(
        is_row_major(mat_a.get_stride()) or has_zero_dim(mat_a.get_size()),
        lambda: f"mat_a must be row_major, got stride {mat_a.get_stride()}",
    )
    # Check mat_b
    torch._check(
        is_col_major(mat_b.get_stride()) or has_zero_dim(mat_b.get_size()),
        lambda: f"mat_b must be col_major, got stride {mat_b.get_stride()}",
    )


def decomposeK(a, b, k_splits):
    m = a.shape[0]
    n = b.shape[1]
    k = a.shape[1]

    k_parts = k // k_splits
    B = k_splits
    a_reshaped = torch.permute(a.reshape(m, B, k_parts), (1, 0, 2))
    b_reshaped = b.reshape(B, k_parts, n)
    result = torch.bmm(a_reshaped, b_reshaped, out_dtype=torch.float32)
    reduced_buf = torch.sum(result, 0)
    return reduced_buf.to(a.dtype)
U RC                  5       S9S.[;        UX4XR5      D[E        X5      D6  MM     SSK#J$m  [K        U4S jU RM                  5       U RO                  5       URM                  5       URO                  5       4 5       5      n[Q        X4U5      (       a  U(       d  SSK)J*n  S
SK+J,n  [[        X4U5      nU H  n[$        R\                  R^                  Ra                  [b        Rd                  " [b        Rf                  " UU5      S5      5      (       d  M]  U" 5          U" 5       n[i        SU S3[k        [l        Rn                  " [p        US9U5      S9nSSS5        WR9                  U	X4US	9  M     U(       a9  [s        X#XE5      (       a(  [u        S5      (       a  [v        Rx                  " XX/5        U(       a)  [{        X#XE5      (       a  [|        R~                  " XX/5        U(       a)  [        X#XE5      (       a  [        R                  " XX/5        [        X U5      (       a  [        R                  " U	UX/5        X/nU(       Ga=  [/        U5      (       Ga,  [        R                  R                  R                  U5      (       a  [        U 5      (       a  / n[        5       (       a  UR                  S5        [        U	5      nU" X4U40 [1        U[2        5      D6 H)  n[6        R8                  " U	4X4US	.[;        UX4XR5      D6  M+     [        U UUUUU	UU[        5       SSUS9n[        R                  R                  R                  U5      (       d2  Ub*  [        U5      S:  a  U	 Vs/ s H  nUU;   d  M  UPM     n	nOU	SU n	[        R                   H.  nU	R                  [        U5      R!                  X4U5      5        M0     [        XyX/U5      $ ! , (       d  f       GN= fs  snf )zW
Lowering for autotuning aten.mm with different backends (Aten, Triton, CUTLASS, etc.)
r   r7   aten_mm_infozaten.mm__r,   zOTuned aten.mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sdevicer   rd   input_nodesr   r   num_tma_descriptorsr   r   r   workspace_argr   )get_free_symbolsc              3   J   >#    U  H  n[        T" US S95      S:  v   M     g7f)T)unbacked_onlyr   N)len).0itrr   s     rB   	<genexpr>tuned_mm.<locals>.<genexpr>  s.      
  D9:Q>s    #)enable_python_dispatcher)select_decomp_tabledecompose_k_mm__split)r   )r8   make_fx_graphN	extern_mmrV   )top_kalways_included)Rr/   r   get_device_typer   loginforH   inductor_configmax_autotunemax_autotune_gemmr   r   r   rd   r$   aten_mmbindr-   r   choicesget_base_mm_configsget_persistent_mm_configsget_extra_mm_configsr*   r0   rS   itemsizemm_templatemaybe_append_choicer2   r+   persistent_tma_mm_templater#   
get_devicer4   torch._inductor.irr   anyr   r~   r)   torch._dispatch.pythonr   decompositionr   r"   rm   rn   statically_known_truesympyEqModr   r   	functoolspartialr   r(   r!   r   add_cutlass_gemm_choicesr%   r   add_ck_gemm_choicesr&   r   add_choicesr'   r   rI   	_inductorr   run_autoheuristicr   appendr   mm_autoheuristicr
   collect_autoheuristicexternal_matmulrC   r   )rg   rh   r   rP   rQ   rR   device_typer8   aten_layoutr   static_shape
is_nonzero
mm_configspersistent_mm_configsextra_mm_configsr   r   unbacked_symbolsr   r   r   k_splitdecompositionsdecompose_k_subgraph_templater   r    num_choices_before_extra_configs
ah_choiceschoicer   s                                @rB   tuned_mmr     s   
 #*$V"DA!T$$T*KD ^xs!A3as3494HHY			 K((O,M,M$==6;;
 6K5L5LtlK	01RT   2&9L..{;JII??Lyy55kBNNE)&11 
 {,CU^^T	
F ++!L VQ15	
 #4../ #!8%..	 +>>
!%!"7,-#0#	
 !q9
 ,D7
( 	8  
 !!	
 
 "!**3CG;#A!,H#ww''==HHUYYq'2A6  -/%8%:N4D.wiv>&-%--j7K*'51 0 .AA!%! B # $0 	 A11%%66wU*6a;;**7TLI/1@@&&wEV400##L	
 ,K''OO""44T::dOO """";/+.w<(&!
'5LM
F ++!L VQ15	
 &O+

 %%;;DAA%#j/A*=
 18Pf6Z;O6P!"C#CD,,215::D<PQ - %TTL&IIm 0/^ Qs   7X
X'X
X	c                (   [        XU[        R                  S9u  p4pRp[        S   SU SU SU 3==   S-  ss'   [        R                  SUUUU R                  5       UR                  5       U5        [        R                  " U 5      n[        U5      u  pxU=(       a    U=(       a    [        X#XE5      n	[        5       (       a  [        R                  X4U5      /O/ n
U	(       a(  [        S5      (       a  [        R                   " XX/SSS	9  ["        R$                  R'                  U5      nU(       aT  [)        USS
9(       aE  U" X4U40 [+        U[,        5      D6 H)  n[.        R0                  " U
4X4US.[3        XXEU5      D6  M+     [5        SXU/U5      $ )Nr   r   r   zaten._int_mm_r   r,   zTTuned aten._int_mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sint_mmTfuseablenon_fuseable)enable_int32r   )r/   rI   int32r   r   r   rH   r   r   r-   r(   r$   aten__int_mmr   r!   r   r   r   r   get_int8_mm_configsr*   r0   rS   r   r   r2   r   )rg   rh   r   rP   rQ   rR   r   r   r   use_cutlassr   int8_mm_configsr   s                rB   tuned_int_mmr   R  s   ")6U[[#A!T
 ^}QCq1QC89Q>9HH^			 $$T*K1&9LW:W2FvRS2WK 6K5L5L		D<	01RT  *84466d\Dt	
 ii33K@O)&tD%!
'5LM
F ++!L Vf5	
 %XwtfMMrD   )r`   ra   r   c                <   [         R                  " U5      n[        XXS9u  pxppn
[        U5      u  p[        S   SU SU SU	 3==   S-  ss'   [
        R                  SUUU	UR                  5       UR                  5       U5        U(       a*  [        R                  (       d  [        R                  (       dw  SSKJnJn  [        X]5      (       a&  U" UR                  UR                   UR"                  S	9n[%        5       (       a  [&        R)                  XU4UUUS
9/O/ n[+        SXX/U5      $ [%        5       (       a  [&        R)                  XU4UUUS
9/O/ n[%        5       (       a{  U
R-                  5       S   S:X  ad  U
R/                  5       R0                  S:X  aF  [        R2                  R4                  (       a'  UR7                  S[8        R)                  XU4XSUS
95        [:        R<                  R?                  U5      n[:        R<                  RA                  U5      nUR                  5       nU(       Ga0  [C        U5      (       Ga  U" UUU	40 [E        U[F        URH                  5      D6 HZ  n[J        RL                  " U4XU4US.[O        UXxX5      DS[Q        UR                   X45      [S        SUR                   X4/5      S.D6  M\     [U        X5      (       a  U" UUU	40 [E        U[F        URH                  5      D6 He  n[V        RL                  " U4XU4U[Y        SUR/                  5       S9S.[O        UXxX5      D[[        X5      DS[Q        UR                   X45      S.D6  Mg     U(       a>  []        XWX5      (       a-  [_        S5      (       a  [`        Rb                  " UUXU
/UU/ SQS9  U(       a.  [e        XWX5      (       a  [f        Rh                  " UUXU
/UU/ SQS9  [k        XQU5      (       a  [l        Rn                  " UUXU/UUSS9  [+        SXX/U5      $ )Nr   r   zaten.addmm_r   r,   zRTuned aten.addmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr   )FixedLayoutr   r   )r`   ra   re   rW   r   r.   )prefix_argsepilogue_fnepilogue_fn_hashr   r   r   )r   r   )r   r   r,   )r`   ra   input_reorderT)r`   ra   has_bias)8r   r   r/   r-   r   r   r   rH   r   r   r   r   r   r   
isinstancer   r   rd   r$   
aten_addmmr   r   r~   r   typetritonautotune_cublasLtinsertaten_bias_addmmr   r   r   r   r*   r0   rS   r   r   r   r2   r.   strr+   r   r#   r4   r(   r!   r   r   r%   r   r   r'   r   r   )rf   rg   rh   r`   ra   r   r   rP   rQ   rR   inp_expandedr   r   r   r   r   r   r   r   r   s                       rB   tuned_addmmr    s+   $$T*K07C0W-A!T1&9L ^{1#Qqc1#671<7HH\			 ))_-N-N 	Cf**#}}FLLv{{F %&& %	     	 )';LfUU !"" OOT*	  	
   	##%a(A-##%**f4""44 	  T*Fd ! 	
 ..{;JII??LNNE)&11 
 {,CU^^T	
F ++)6 VQ15	
 *6<<E!$&6e%R!S
  #4../ #!8%..	 +>>!-T :!"7,-#0#	 !q9 ,D7 !" .v||U I. 	 A11((66&#	
 *6a;;**&#	
 V400##&	
 %4f rD   )r   r   c                   SSK Jn  U" XU5      u  pnU R                  5       u  pgUR                  5       u  pUR                  5       u  p[        R                  R
                  R                  Xh5      n[        R                  R
                  R                  SU-  U
5      nUc:  SSKJn  U" UR                  5       U(       a  UOUR                  5       X/US/5      nO
Ub   S5       e[        5       (       a  [        R                  XU4XCS9/O/ nX-  S:w  a:  [        XLX5      (       a)  [        S5      (       a  [         R"                  " XXU/S	S	S
9  [%        SXX/U5      $ )Nr   )r   r   )r   r,   z,out_dtype is ignored if layout is specified.r   sparse_semi_structured_mmTr   ) torch._inductor.select_algorithmr   r   r   rm   rn   guard_equalsr   r   r   rH   r$   aten__sparse_semi_structured_mmr   r(   r!   r   r   r   )rg   	mat1_metarh   r   r   r   m1k1m2r   k2rQ   rP   rR   r   r   s                   rB   tuned_sparse_semi_structured_mmr    s\    @*4DADT]]_FB EBMMOEB	%%b-A	%%a"fb1A~2OO"I(8FF	
  P"PP  !""	 ,00$' 1 	
   	

 A11 ;<<66d)4tRV	
 %#WY.Ev rD   c	                 ,	   [        XXS9u  ppp[        S   SU	 SU
 SU 3==   S-  ss'   [        R                  SU	U
UU R	                  5       UR	                  5       U5        [
        R                  " U 5      n[        X5        [        X#5      u  pU(       d  XX4nO[        U5      nXXU4n[        R                  XXgS9n/ n[        5       (       a  UR                  U5        UR                  [        R                  :w  a  [!        SUX5      $ [#        U5      u  nn[$        R&                  R)                  U5      n[$        R&                  R+                  U5      nU(       Ga  [-        US	S
9(       Ga  U(       aV  [/        UR1                  5       5      [/        UR1                  5       5      S-   :X  a  [2        [4        R6                     " US5      nOUn[/        UR1                  5       5      S:X  d  [/        UR1                  5       5      S:X  a  [/        UR1                  5       5      [/        UR1                  5       5      :X  d   e[2        [4        R6                     " [2        [4        R6                     " US5      S5      n[2        [4        R6                     " [2        [4        R6                     " US5      S5      nOUnUnU(       a
  U UUUU4nSnOXUU4nSn[9        X5      (       aY  U(       dR  U" XU5       HE  n[;        UU	U
UUUUUS	S9	n[<        R>                  " U4UU[A        SU RC                  5       S9S.UD6  MG     U" XU5       H  n[$        RD                  RF                  RI                  [J        RL                  " US5      5      (       a  MH  [O        5       (       aE  [$        RD                  RF                  RI                  [J        RP                  " US5      5      (       a  M  [;        UXXX#U5      n[R        R>                  " U4UUS.UDU[U        5       SS.D6  M     U(       a8  [W        XX5      (       a'  [Y        S5      (       a  [Z        R\                  " UUUUS9  U(       a(  [_        XX5      (       a  [`        Rb                  " UX5        [!        SUX5      $ )a	  
Performs an optimized matrix multiplication where scaling factors are applied
to the inputs and/or output.

Args:
    mat1 (Tensor): First input matrix
    mat2 (Tensor): Second input matrix
    scale1 (Tensor): Scale factor applied to mat1 (supports broadcasting)
    scale2 (Tensor): Scale factor applied to mat2 (supports broadcasting)
    bias (Tensor, optional): Optional bias tensor to add to the result
    layout: Layout hint for optimization

Returns:
    Tensor: The result of the scaled matrix multiplication
r   r   zaten._scaled_mm.default_r   r,   z_Tuned aten._scaled_mm.default: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s)r   use_fast_accum	scaled_mmT)enable_float8r      r   )
device_tmar   r          r   r5   )suffix_argsr   r   )r  )2r/   r   r   r   rH   r   r   r   r   aten__fp8_mmr   r$   r   r   rI   r   r   r-   r   r   get_scaled_mm_configs get_scaled_persistent_mm_configsr*   r   r   Laten	unsqueezer+   r6   scaled_mm_device_tma_templater   r#   r   rm   rn   guard_or_falser   Ler]   Ltr   r5   r(   r!   r   r   r%   r   r   )r   r   scale_ascale_bbiasscale_resultr   r  r   rP   rQ   rR   r   scale_a_realscale_b_realr   	bias_realaten_choicer   r   r   scaled_mm_configsscaled_persistent_mm_configstriton_biastriton_scale_atriton_scale_btriton_input_nodesr  r   kwargss                                 rB   tuned_scaled_mmr9  O  s(   6 %,V%!A!U ^7s!A3asCDIDHHi			 $$U+KU*!/!AL \@"4(	\K##y $ K G{# }}%(g{SS&v.MAz		77D#$99#M#M$  )&EEC()S-AA-EEDNN+D!4KKw!"a'3w/?/?/A+Ba+Gw'')*c'2B2B2D.EEEEt~~.q/@!/LaPNt~~.q/@!/LaPN$N$N" K"'!OK #5006qQ?*"#
 .AA	 2!"7,-$//1#		 	 @. (a0Fww..uxx2?? || 0 0 ? ?B P P&a>F ++. 	
 (-/!4 12 	 A11,,66)		
 *6a;;**7FH$[';OOrD   indexc                 p    [         R                  R                  U =(       d    S5      nUR                  S:*  $ )Nr      )rI   rW   rY   r[   )r:  propss     rB   _is_sm7x_or_older_gpur>    s)    JJ,,UZa8E;;!rD   c                 &    [        S U  5       5      $ )Nc              3   B   #    U  H  n[        U[        5      v   M     g 7fr@   )r  int)r   dims     rB   r   dims_are_int.<locals>.<genexpr>  s     4tz#s##ts   )all)dimss    rB   dims_are_intrF    s    4t444rD   r   c           
         ^ [        XX#U5      u  p#n[        X#U/5      (       d  g [        X5      u  pU4S jnS nU" X$X0XU5      n[        UUUUTUU	S9nU
b  UR	                  XS9$ UR                  5       $ )Nc                 >  > [        5       nUR                  SU 5        UR                  SU5        UR                  SU5        UR                  SUR                  R                  SS9  UR                  SUR                  R                  SS9  [	        USU5        [	        US	U5        UR                  S
UR                  R                  5       SS9  UR                  SUR                  R                  5       SS9  TS:X  a  [        XsR                  R                  5        U$ )NrP   rR   rQ   
mat1_dtypeT)is_categorical
mat2_dtyperg   rh   mat1_iscontigmat2_iscontigr7   )r   add_featurer   r   r   is_contiguousr	   )	rP   rR   rQ   rg   rh   mat1_stridemat2_stridecontextr8   s	           rB   get_context%mm_autoheuristic.<locals>.get_context  s    +C#C#C#L$++*;*;DQL$++*;*;DQGV[9GV[9T[[668 	 	
 	T[[668 	 	
 4<"7KK,=,=>rD   c                      g r@   rO   rO   rD   rB   fallback"mm_autoheuristic.<locals>.fallback+  s    rD   )rV  r   r   rR  r8   augment_contextprecondition)r   )get_size_hintsrF  get_size_hints_stridesr   get_top_k_choices_callerget_choice_caller)rg   rh   rP   rQ   rR   r   r8   r   opsrY  r   r   rP  rQ  rS  rV  rR  autoheuristics         `           rB   r   r     s     Tq1GA!q	""5dAK& !KHG0!M 55 6 
 	
 **,,rD   c                    [        U[        5      (       a  [        U[        5      (       dZ  [        R                  R                  R                  U R                  5       [        R                  R                  R                  S9u  p$[        U[        5      (       a  [        U[        5      (       dZ  [        R                  R                  R                  UR                  5       [        R                  R                  R                  S9u  pCX#U4$ )NrV  )r  rA  r   rm   rn   
size_hintsr   rI   r   r   unbacked_symint_fallback)rg   rh   rP   rQ   rR   s        rB   rZ  rZ  B  s    aZ3%7%7!!,,MMO__++DD - 

 aZ3%7%7!!,,MMO__++DD - 
 7NrD   c                 j   U R                   R                  nUR                   R                  nX#/n/ nU Hs  n[        U[        5      (       dJ  [        R
                  R                  R                  U[        R                  R                  R                  S9nUR                  U5        Mu     US   US   4$ )Nra  r   r,   )r   rc   r  rA  r   rm   rn   rb  rI   r   r   rc  r   )rg   rh   rP  rQ  stridesstrides_hintsrc   s          rB   r[  r[  Q  s    ++$$K++$$K(GM&#&&WW%%00//HH 1 F 	V$  ]1---rD   )rT   N)NNNFN)NN)~r   loggingtypingr   r   r   rI   torch._dynamo.utilsr   +torch._inductor.autoheuristic.autoheuristicr   1torch._inductor.autoheuristic.autoheuristic_utilsr   r   r	   r