fmgreco committed on
Commit 199170e · 1 Parent(s): 346e086

Add ROCm dual GEMM, MXFP4, mask compaction, group GEMM

AMDmi300asubmission.py ADDED
@@ -0,0 +1,118 @@
1
+ import torch
2
+ import triton
3
+ import triton.language as tl
4
+ import os
5
+
6
+ # 1. HARDWARE DIAGNOSTICS
7
+ def check_environment():
8
+ print(f"--- Environment Check ---")
9
+ cuda_avail = torch.cuda.is_available()
10
+ print(f"Is CUDA/ROCm available? {cuda_avail}")
11
+
12
+ if cuda_avail:
13
+ device_name = torch.cuda.get_device_name(0)
14
+ print(f"GPU Detected: {device_name}")
15
+ prop = torch.cuda.get_device_properties(0)
16
+ if hasattr(prop, 'major'):
17
+ print(f"Compute Capability: {prop.major}.{prop.minor}")
18
+ # Optimization: keep kernel arguments in device memory on ROCm (reduces launch latency)
19
+ os.environ["HIP_FORCE_DEV_KERNARG"] = "1"
20
+ else:
21
+ print("No NVIDIA/AMD GPU detected. Triton kernels will not run on this hardware.")
22
+ print(f"-------------------------\n")
23
+
24
+ check_environment()
25
+
26
+ # 2. OPTIMIZED DUAL GEMM KERNEL
27
+ @triton.autotune(
28
+ configs=[
29
+ triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
30
+ triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=8),
31
+ ],
32
+ key=['M', 'N', 'K'],
33
+ )
34
+ @triton.jit
35
+ def dual_gemm_kernel(
36
+ a_ptr, b1_ptr, b2_ptr, c_ptr,
37
+ sfa_ptr, sfb1_ptr, sfb2_ptr,
38
+ M, N, K, L,
39
+ stride_am, stride_ak, stride_al,
40
+ stride_bn, stride_bk, stride_bl,
41
+ stride_cm, stride_cn, stride_cl,
42
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
43
+ GROUP_SIZE_M: tl.constexpr,
44
+ ):
45
+ # Program ID & Work distribution
46
+ pid = tl.program_id(0)
47
+ num_pid_m = tl.cdiv(M, BLOCK_M)
48
+ num_pid_n = tl.cdiv(N, BLOCK_N)
49
+
50
+ # Persistent Grid Loop (Iterates over batches L and tiles)
51
+ total_tiles = num_pid_m * num_pid_n * L
52
+ for tile_idx in tl.range(pid, total_tiles, tl.num_programs(0)):
53
+ l_idx = tile_idx // (num_pid_m * num_pid_n)
54
+ tile_rem = tile_idx % (num_pid_m * num_pid_n)
55
+
56
+ pid_m = tile_rem // num_pid_n
57
+ pid_n = tile_rem % num_pid_n
58
+
59
+ # Memory Offsets
60
+ offs_am = (pid_m * BLOCK_M + tl.arange(0, BLOCK_M))
61
+ offs_bn = (pid_n * BLOCK_N + tl.arange(0, BLOCK_N))
62
+ offs_k = tl.arange(0, BLOCK_K)
63
+
64
+ a_ptrs = a_ptr + l_idx * stride_al + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
65
+ b1_ptrs = b1_ptr + l_idx * stride_bl + (offs_bn[None, :] * stride_bn + offs_k[:, None] * stride_bk)
66
+ b2_ptrs = b2_ptr + l_idx * stride_bl + (offs_bn[None, :] * stride_bn + offs_k[:, None] * stride_bk)
67
+
68
+ # Accumulators
69
+ acc1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
70
+ acc2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
71
+
72
+ # Inner K-loop
73
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
74
+ a = tl.load(a_ptrs)
75
+ b1 = tl.load(b1_ptrs)
76
+ b2 = tl.load(b2_ptrs)
77
+
78
+ # Using tl.dot_scaled for hardware-native scaling if available
79
+ # Note: this variant passes None for the scales (plain e2m1 dot); hardware_submission.py loads and applies sfa/sfb
80
+ acc1 = tl.dot_scaled(a, None, "e2m1", b1, None, "e2m1", acc1)
81
+ acc2 = tl.dot_scaled(a, None, "e2m1", b2, None, "e2m1", acc2)
82
+
83
+ a_ptrs += BLOCK_K * stride_ak
84
+ b1_ptrs += BLOCK_K * stride_bk
85
+ b2_ptrs += BLOCK_K * stride_bk
86
+
87
+ # 3. FUSED EPILOGUE (SiLU + Gating)
88
+ # res = SiLU(A @ B1) * (A @ B2)
89
+ res1 = acc1.to(tl.float16)
90
+ activated_res1 = res1 * tl.sigmoid(res1)
91
+ final_out = activated_res1 * acc2.to(tl.float16)
92
+
93
+ # Store result
94
+ offs_cm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
95
+ offs_cn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
96
+ c_ptrs = c_ptr + l_idx * stride_cl + offs_cm[:, None] * stride_cm + offs_cn[None, :] * stride_cn
97
+ tl.store(c_ptrs, final_out)
98
+
99
+ # 4. HARNESS INTERFACE
100
+ def dual_gemm_submission(data):
101
+ # Unpack the tuple provided by the benchmark harness
102
+ a, b1, b2, sfa, sfb1, sfb2, c = data
103
+ M, K_packed, L = a.shape
104
+ N = b1.shape[0]
105
+ K = K_packed * 2 # Assuming FP4 packing
106
+
107
+ # Grid size: Launch exactly the number of SMs/CUs for a persistent wave
108
+ num_sms = torch.cuda.get_device_properties(0).multi_processor_count
109
+ grid = (num_sms,)
110
+
111
+ dual_gemm_kernel[grid](
112
+ a, b1, b2, c, sfa, sfb1, sfb2,
113
+ M, N, K, L,
114
+ a.stride(0), a.stride(1), a.stride(2),
115
+ b1.stride(0), b1.stride(1), b1.stride(2),
116
+ c.stride(0), c.stride(1), c.stride(2)
117
+ )
118
+ return c
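For reference, a minimal sketch of the tuple layout dual_gemm_submission unpacks, inferred from the code above (a: [M, K//2, L] packed FP4, b1/b2: [N, K//2, L], c: [M, N, L]); the uint8 packing, fp16 output dtype, and placeholder scale contents are assumptions, and real inputs come from the benchmark harness:

import torch
from AMDmi300asubmission import dual_gemm_submission

M, N, K, L = 128, 256, 512, 2  # K is the unpacked inner dimension
a    = torch.randint(0, 256, (M, K // 2, L), dtype=torch.uint8, device="cuda")  # packed FP4 (assumed)
b1   = torch.randint(0, 256, (N, K // 2, L), dtype=torch.uint8, device="cuda")
b2   = torch.randint(0, 256, (N, K // 2, L), dtype=torch.uint8, device="cuda")
sfa  = torch.ones((M, K // 16, L), dtype=torch.uint8, device="cuda")  # placeholder scales; this kernel passes None to tl.dot_scaled
sfb1 = torch.ones((N, K // 16, L), dtype=torch.uint8, device="cuda")
sfb2 = torch.ones((N, K // 16, L), dtype=torch.uint8, device="cuda")
c    = torch.empty((M, N, L), dtype=torch.float16, device="cuda")  # output buffer (dtype assumed)

out = dual_gemm_submission((a, b1, b2, sfa, sfb1, sfb2, c))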
amd_dual_gemm_swiglu.py ADDED
@@ -0,0 +1,318 @@
1
+ """
2
+ AMD Triton fused Dual GEMM + SwiGLU kernel.
3
+ Computes: silu(A @ B1) * (A @ B2) in a single fused kernel.
4
+ Uses triton-kernels testing.py: assert_close (maxtol=2e-2, rmstol=4e-3).
5
+ """
6
+
7
+ import argparse
8
+ import os
9
+ import sys
10
+ import time
11
+
12
+ # Allow importing testing.py from same directory (when run from kernels/)
13
+ _script_dir = os.path.dirname(os.path.abspath(__file__))
14
+ if _script_dir not in sys.path:
15
+ sys.path.insert(0, _script_dir)
16
+
17
+ import torch
18
+ import triton
19
+ import triton.language as tl
20
+
21
+ # Optional MXFP4 pre-dequant (option 1: upcast before GEMM)
22
+ try:
23
+ from numerics_details.mxfp_details import upcast_mxfp4_to_fp16
24
+ _HAS_MXFP = True
25
+ except ImportError:
26
+ upcast_mxfp4_to_fp16 = None
27
+ _HAS_MXFP = False
28
+
29
+
30
+ def _maybe_upcast_mxfp(b, name: str) -> torch.Tensor:
31
+ """If b is MXFP4 (mx_tensor, mx_scale), upcast to fp16. Else return b."""
32
+ if not isinstance(b, (tuple, list)) or len(b) != 2:
33
+ return b
34
+ mx_tensor, mx_scale = b
35
+ if not (isinstance(mx_tensor, torch.Tensor) and isinstance(mx_scale, torch.Tensor)):
36
+ return b
37
+ if mx_tensor.dtype != torch.uint8 or mx_scale.dtype != torch.uint8:
38
+ return b
39
+ if not _HAS_MXFP:
40
+ raise ImportError("MXFP4 weights require numerics_details.mxfp_details")
41
+ return upcast_mxfp4_to_fp16(mx_tensor, mx_scale, verbose=False)
42
+
43
+
44
+ @triton.autotune(
45
+ configs=[
46
+ triton.Config(
47
+ {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 8},
48
+ num_warps=4,
49
+ num_stages=3,
50
+ ),
51
+ triton.Config(
52
+ {"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "GROUP_SIZE_M": 8},
53
+ num_warps=8,
54
+ num_stages=3,
55
+ ),
56
+ triton.Config(
57
+ {"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 8},
58
+ num_warps=8,
59
+ num_stages=2,
60
+ ),
61
+ triton.Config(
62
+ {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_SIZE_M": 8},
63
+ num_warps=8,
64
+ num_stages=2,
65
+ ),
66
+ triton.Config(
67
+ {"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 64, "GROUP_SIZE_M": 4},
68
+ num_warps=4,
69
+ num_stages=3,
70
+ ),
71
+ triton.Config(
72
+ {"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_SIZE_M": 4},
73
+ num_warps=4,
74
+ num_stages=3,
75
+ ),
76
+ triton.Config(
77
+ {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 8},
78
+ num_warps=4,
79
+ num_stages=4,
80
+ ),
81
+ triton.Config(
82
+ {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_SIZE_M": 8},
83
+ num_warps=4,
84
+ num_stages=4,
85
+ ),
86
+ triton.Config(
87
+ {"BLOCK_M": 256, "BLOCK_N": 256, "BLOCK_K": 32, "GROUP_SIZE_M": 8},
88
+ num_warps=8,
89
+ num_stages=2,
90
+ ),
91
+ ],
92
+ key=["M", "N", "K"],
93
+ )
94
+ @triton.heuristics(
95
+ {
96
+ "EVEN_K": lambda args: args["K"] % args["BLOCK_K"] == 0,
97
+ "EVEN_M": lambda args: args["M"] % args["BLOCK_M"] == 0,
98
+ "EVEN_N": lambda args: args["N"] % args["BLOCK_N"] == 0,
99
+ }
100
+ )
101
+ @triton.jit
102
+ def dual_gemm_swiglu_kernel(
103
+ a_ptr,
104
+ b1_ptr,
105
+ b2_ptr,
106
+ c_ptr,
107
+ M,
108
+ N,
109
+ K,
110
+ stride_am,
111
+ stride_ak,
112
+ stride_b1k,
113
+ stride_b1n,
114
+ stride_b2k,
115
+ stride_b2n,
116
+ stride_cm,
117
+ stride_cn,
118
+ BLOCK_M: tl.constexpr,
119
+ BLOCK_N: tl.constexpr,
120
+ BLOCK_K: tl.constexpr,
121
+ GROUP_SIZE_M: tl.constexpr,
122
+ EVEN_K: tl.constexpr,
123
+ EVEN_M: tl.constexpr,
124
+ EVEN_N: tl.constexpr,
125
+ ):
126
+ pid = tl.program_id(axis=0)
127
+ num_pid_m = tl.cdiv(M, BLOCK_M)
128
+ num_pid_n = tl.cdiv(N, BLOCK_N)
129
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
130
+ group_id = pid // num_pid_in_group
131
+ first_pid_m = group_id * GROUP_SIZE_M
132
+ group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M)
133
+ pid_in_group = pid % num_pid_in_group
134
+ pid_m = first_pid_m + (pid_in_group % group_size_m)
135
+ pid_n = pid_in_group // group_size_m
136
+
137
+ offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
138
+ offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
139
+ offs_k = tl.arange(0, BLOCK_K)
140
+
141
+ a_ptrs = a_ptr + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
142
+ b1_ptrs = b1_ptr + (offs_k[:, None] * stride_b1k + offs_n[None, :] * stride_b1n)
143
+ b2_ptrs = b2_ptr + (offs_k[:, None] * stride_b2k + offs_n[None, :] * stride_b2n)
144
+
145
+ acc1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
146
+ acc2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
147
+ m_mask = offs_m[:, None] < M
148
+ n_mask = offs_n[None, :] < N
149
+
150
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
151
+ if EVEN_K:
152
+ if EVEN_M:
153
+ a = tl.load(a_ptrs)
154
+ else:
155
+ a = tl.load(a_ptrs, mask=m_mask, other=0.0)
156
+
157
+ if EVEN_N:
158
+ b1 = tl.load(b1_ptrs)
159
+ b2 = tl.load(b2_ptrs)
160
+ else:
161
+ b1 = tl.load(b1_ptrs, mask=n_mask, other=0.0)
162
+ b2 = tl.load(b2_ptrs, mask=n_mask, other=0.0)
163
+ else:
164
+ k_rem = K - k * BLOCK_K
165
+ k_mask_m = offs_k[None, :] < k_rem
166
+ k_mask_n = offs_k[:, None] < k_rem
167
+ a = tl.load(a_ptrs, mask=m_mask & k_mask_m, other=0.0)
168
+ b1 = tl.load(b1_ptrs, mask=k_mask_n & n_mask, other=0.0)
169
+ b2 = tl.load(b2_ptrs, mask=k_mask_n & n_mask, other=0.0)
170
+
171
+ tl.multiple_of(a_ptrs, [16, 16])
172
+ tl.multiple_of(b1_ptrs, [16, 16])
173
+ tl.multiple_of(b2_ptrs, [16, 16])
174
+
175
+ acc1 += tl.dot(a, b1)
176
+ acc2 += tl.dot(a, b2)
177
+
178
+ a_ptrs += BLOCK_K * stride_ak
179
+ b1_ptrs += BLOCK_K * stride_b1k
180
+ b2_ptrs += BLOCK_K * stride_b2k
181
+
182
+ silu = acc1 * tl.sigmoid(acc1)
183
+ out = silu * acc2
184
+
185
+ c_ptrs = c_ptr + (offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn)
186
+ c_mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
187
+ tl.store(c_ptrs, out.to(tl.float16), mask=c_mask)
188
+
189
+
190
+ def dual_gemm_swiglu(
191
+ a: torch.Tensor,
192
+ b1: torch.Tensor | tuple,
193
+ b2: torch.Tensor | tuple,
194
+ ) -> torch.Tensor:
195
+ """Fused Dual GEMM + SwiGLU. b1/b2 can be fp16 [K,N] or MXFP4 (mx_tensor, mx_scale)."""
196
+ b1 = _maybe_upcast_mxfp(b1, "b1")
197
+ b2 = _maybe_upcast_mxfp(b2, "b2")
198
+
199
+ if a.ndim != 2 or b1.ndim != 2 or b2.ndim != 2:
200
+ raise ValueError("Expected 2D tensors: a[M,K], b1[K,N], b2[K,N].")
201
+ if a.shape[1] != b1.shape[0] or a.shape[1] != b2.shape[0]:
202
+ raise ValueError("Incompatible shapes for dual GEMM.")
203
+ if b1.shape[1] != b2.shape[1]:
204
+ raise ValueError("b1 and b2 must have same N dimension.")
205
+ if not (a.is_cuda and b1.is_cuda and b2.is_cuda):
206
+ raise ValueError("All tensors must be on a CUDA/ROCm device.")
207
+ if a.dtype != torch.float16 or b1.dtype != torch.float16 or b2.dtype != torch.float16:
208
+ raise ValueError("This kernel currently expects float16 inputs.")
209
+
210
+ a = a.contiguous()
211
+ b1 = b1.contiguous()
212
+ b2 = b2.contiguous()
213
+
214
+ M, K = a.shape
215
+ _, N = b1.shape
216
+ c = torch.empty((M, N), device=a.device, dtype=torch.float16)
217
+
218
+ grid = lambda META: (
219
+ triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
220
+ )
221
+
222
+ dual_gemm_swiglu_kernel[grid](
223
+ a, b1, b2, c,
224
+ M=M, N=N, K=K,
225
+ stride_am=a.stride(0), stride_ak=a.stride(1),
226
+ stride_b1k=b1.stride(0), stride_b1n=b1.stride(1),
227
+ stride_b2k=b2.stride(0), stride_b2n=b2.stride(1),
228
+ stride_cm=c.stride(0), stride_cn=c.stride(1),
229
+ )
230
+ return c
231
+
232
+
233
+ def reference_dual_gemm_swiglu(a: torch.Tensor, b1: torch.Tensor, b2: torch.Tensor) -> torch.Tensor:
234
+ x1 = a @ b1
235
+ x2 = a @ b2
236
+ return torch.nn.functional.silu(x1) * x2
237
+
238
+
239
+ def test_correctness(device: str = "cuda", maxtol: float = 2e-2, rmstol: float = 4e-3) -> bool:
240
+ """Run correctness tests using triton-kernels testing.assert_close."""
241
+ from testing import assert_close
242
+
243
+ torch.manual_seed(42)
244
+ shapes = [(128, 64, 128), (256, 256, 512), (1024, 512, 1024), (4096, 3648, 8192),
245
+ (7, 13, 17), (100, 200, 150)]
246
+ input_scale = 0.125
247
+ all_pass = True
248
+ for m, n, k in shapes:
249
+ a = torch.randn((m, k), device=device, dtype=torch.float16) * input_scale
250
+ b1 = torch.randn((k, n), device=device, dtype=torch.float16) * input_scale
251
+ b2 = torch.randn((k, n), device=device, dtype=torch.float16) * input_scale
252
+ with torch.no_grad():
253
+ ref = reference_dual_gemm_swiglu(a.float(), b1.float(), b2.float()).to(torch.float16)
254
+ out = dual_gemm_swiglu(a, b1, b2)
255
+ desc = f"[shape ({m},{n},{k})]"
256
+ try:
257
+ assert_close(ref, out, maxtol=maxtol, rmstol=rmstol, description=desc, verbose=True)
258
+ print(f" {desc} PASS")
259
+ except AssertionError:
260
+ print(f" {desc} FAIL")
261
+ all_pass = False
262
+ return all_pass
263
+
264
+
265
+ def benchmark(m: int, n: int, k: int, warmup: int, iters: int, input_scale: float) -> None:
266
+ device = "cuda"
267
+ a = torch.randn((m, k), device=device, dtype=torch.float16) * input_scale
268
+ b1 = torch.randn((k, n), device=device, dtype=torch.float16) * input_scale
269
+ b2 = torch.randn((k, n), device=device, dtype=torch.float16) * input_scale
270
+ for _ in range(warmup):
271
+ _ = dual_gemm_swiglu(a, b1, b2)
272
+ torch.cuda.synchronize()
273
+ start = torch.cuda.Event(enable_timing=True)
274
+ end = torch.cuda.Event(enable_timing=True)
275
+ start.record()
276
+ for _ in range(iters):
277
+ _ = dual_gemm_swiglu(a, b1, b2)
278
+ end.record()
279
+ torch.cuda.synchronize()
280
+ avg_ms = start.elapsed_time(end) / iters
281
+ total_flops = 4 * m * n * k
282
+ tflops = (total_flops / (avg_ms * 1e-3)) / 1e12
283
+ print(f"[kernel] shape=({m}, {n}, {k}) avg={avg_ms:.3f} ms, ~{tflops:.2f} TFLOP/s")
284
+
285
+
286
+ def main() -> None:
287
+ parser = argparse.ArgumentParser(description="AMD Triton fused dual-GEMM + SwiGLU")
288
+ parser.add_argument("--m", type=int, default=4096)
289
+ parser.add_argument("--n", type=int, default=3648)
290
+ parser.add_argument("--k", type=int, default=8192)
291
+ parser.add_argument("--warmup", type=int, default=10)
292
+ parser.add_argument("--iters", type=int, default=50)
293
+ parser.add_argument("--input-scale", type=float, default=0.125)
294
+ parser.add_argument("--test-only", action="store_true", help="Run correctness tests only")
295
+ parser.add_argument("--bench-only", action="store_true", help="Run benchmark only")
296
+ args = parser.parse_args()
297
+
298
+ if not torch.cuda.is_available():
299
+ print("ERROR: No CUDA/ROCm GPU detected. This kernel requires a GPU to run.")
300
+ print(" - Run on a machine with an NVIDIA GPU (CUDA) or AMD GPU (ROCm)")
301
+ print(" - Ensure PyTorch is installed with GPU support.")
302
+ raise SystemExit(1)
303
+
304
+ if not args.bench_only:
305
+ print("Running correctness tests...")
306
+ t0 = time.time()
307
+ ok = test_correctness()
308
+ print(f"Correctness: {'PASS' if ok else 'FAIL'} ({time.time()-t0:.2f}s)")
309
+
310
+ if not args.test_only:
311
+ print("\nRunning benchmark...")
312
+ t0 = time.time()
313
+ benchmark(args.m, args.n, args.k, args.warmup, args.iters, args.input_scale)
314
+ print(f"[done] elapsed={time.time()-t0:.2f}s")
315
+
316
+
317
+ if __name__ == "__main__":
318
+ main()
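A minimal usage sketch for the wrapper above; the shapes here are arbitrary, and torch.testing.assert_close stands in for the triton-kernels assert_close so the snippet has no extra dependency:

import torch
from amd_dual_gemm_swiglu import dual_gemm_swiglu, reference_dual_gemm_swiglu

a  = torch.randn(256, 512,  device="cuda", dtype=torch.float16) * 0.125
b1 = torch.randn(512, 1024, device="cuda", dtype=torch.float16) * 0.125
b2 = torch.randn(512, 1024, device="cuda", dtype=torch.float16) * 0.125

out = dual_gemm_swiglu(a, b1, b2)  # fused Triton kernel
ref = reference_dual_gemm_swiglu(a.float(), b1.float(), b2.float()).to(torch.float16)
torch.testing.assert_close(out, ref, atol=2e-2, rtol=2e-2)  # loose fp16 tolerances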
dual_gemm_swiglu_full.py ADDED
@@ -0,0 +1,417 @@
1
+ """
2
+ Dual GEMM + SwiGLU following triton_kernels swiglu.py build pattern.
3
+
4
+ Structure matches Kernel Community Hub swiglu.py:
5
+ - repr() and launch_metadata for specialization
6
+ - compute_swiglu() style activation (SiLU(gate) * linear)
7
+ - Optional Flexpoint/MXFP (stub for standalone, real import in triton_kernels)
8
+ - NTokens support for variable M (MoE routing)
9
+ - Persistent kernel pattern with tl.range
10
+
11
+ Usage (standalone fp16):
12
+ from dual_gemm_swiglu_full import dual_gemm_swiglu
13
+ out = dual_gemm_swiglu(a, b1, b2)
14
+
15
+ For triton_kernels integration: place in dual_gemm_swiglu_details/, add flexpoint import.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import os
22
+ import sys
23
+ import time
24
+ from typing import Optional
25
+
26
+ # Allow importing testing.py from same directory (when run from kernels/)
27
+ _script_dir = os.path.dirname(os.path.abspath(__file__))
28
+ if _script_dir not in sys.path:
29
+ sys.path.insert(0, _script_dir)
30
+
31
+ import torch
32
+ import triton
33
+ import triton.language as tl
34
+
35
+ # -----------------------------------------------------------------------------
36
+ # Flexpoint stub (standalone). Replace with:
37
+ # from ..numerics_details.flexpoint import load_scale, float_to_flex, update_scale
38
+ # when integrating into triton_kernels.
39
+ # -----------------------------------------------------------------------------
40
+ _HAS_FLEXPOINT = False
41
+ try:
42
+ # Only works inside triton_kernels package
43
+ from ..numerics_details.flexpoint import load_scale, float_to_flex, update_scale
44
+ _HAS_FLEXPOINT = True
45
+ except ImportError:
46
+ @triton.jit
47
+ def load_scale(scale_ptr):
48
+ return 1.0 if scale_ptr is None else tl.load(scale_ptr)
49
+
50
+ def float_to_flex(x, *args, **kwargs):  # stub with the same name as the real flexpoint helper
51
+ """Pass-through for fp16 standalone."""
52
+ return x
53
+
54
+ def update_scale(x, scale_ptr, Out):  # stub
55
+ pass
56
+
57
+
58
+ # -----------------------------------------------------------------------------
59
+ # Helpers (mirroring swiglu.py)
60
+ # -----------------------------------------------------------------------------
61
+ @triton.jit
62
+ def clip(x, limit, clip_lower: tl.constexpr):
63
+ res = tl.minimum(x, limit)
64
+ if clip_lower:
65
+ res = tl.maximum(-limit, res)
66
+ return res
67
+
68
+
69
+ @triton.jit
70
+ def thread_local_absmax(x, BLOCK_SIZE: tl.constexpr, NUM_THREADS: tl.constexpr):
71
+ return tl.max(
72
+ tl.reshape(tl.abs(x), [NUM_THREADS, BLOCK_SIZE // NUM_THREADS], can_reorder=True),
73
+ axis=1,
74
+ )
75
+
76
+
77
+ @triton.jit
78
+ def compute_swiglu(gelu, linear, scale, alpha, limit: tl.constexpr):
79
+ """SwiGLU: silu(gelu) * linear. Matches swiglu.py compute_swiglu style.
80
+ limit > 0 enables clipping; pass 0.0 for no clip.
81
+ """
82
+ gelu = gelu.to(tl.float32) * scale
83
+ if limit > 0:
84
+ gelu = clip(gelu, limit, clip_lower=False)
85
+ linear = linear.to(tl.float32) * scale
86
+ if limit > 0:
87
+ linear = clip(linear, limit, clip_lower=True)
88
+ s = gelu / (1 + tl.exp(-alpha * gelu)) # SiLU(gelu)
89
+ return s * linear # SiLU(gate) * linear (standard SwiGLU)
90
+
91
+
92
+ # -----------------------------------------------------------------------------
93
+ # Repr and launch_metadata (swiglu.py pattern)
94
+ # -----------------------------------------------------------------------------
95
+ def dual_gemm_repr(specialization):
96
+ signature = specialization.signature
97
+ constants = specialization.constants
98
+ convert_dtype = lambda dtype: "mxfp4" if "u8" in str(dtype) else str(dtype)
99
+ dtypes = "x".join([convert_dtype(f"{signature.get(i, 'fp16')}") for i in ["Out", "A", "B1", "B2"]])
100
+ blocks = "x".join([f"{constants.get(i, 0)}" for i in ["BLOCK_M", "BLOCK_N", "BLOCK_K"]])
101
+ return f"_dual_gemm_swiglu_{dtypes}_{blocks}"
102
+
103
+
104
+ def dual_gemm_launch_metadata(grid, kernel, args):
105
+ M, N, K = args["M"], args["N"], args["K"]
106
+ ret = dict()
107
+ ret["name"] = f"{kernel.name} [M={M}, N={N}, K={K}]"
108
+ A, B1, B2, Out = args["A"], args["B1"], args["B2"], args["Out"]
109
+ ret["bytes"] = (
110
+ A.numel() * A.element_size()
111
+ + B1.numel() * B1.element_size()
112
+ + B2.numel() * B2.element_size()
113
+ + Out.numel() * Out.element_size()
114
+ )
115
+ return ret
116
+
117
+
118
+ # -----------------------------------------------------------------------------
119
+ # Dual GEMM + SwiGLU kernel (swiglu.py structure)
120
+ # -----------------------------------------------------------------------------
121
+ @triton.jit(repr=lambda _: "_dual_gemm_swiglu", launch_metadata=dual_gemm_launch_metadata)
122
+ def _dual_gemm_swiglu(
123
+ Out,
124
+ A,
125
+ B1,
126
+ B2,
127
+ M,
128
+ N,
129
+ K,
130
+ stride_am,
131
+ stride_ak,
132
+ stride_b1k,
133
+ stride_b1n,
134
+ stride_b2k,
135
+ stride_b2n,
136
+ stride_outm,
137
+ stride_outn,
138
+ alpha: tl.constexpr,
139
+ limit,
140
+ NTokens,
141
+ BLOCK_M: tl.constexpr,
142
+ BLOCK_N: tl.constexpr,
143
+ BLOCK_K: tl.constexpr,
144
+ GROUP_SIZE_M: tl.constexpr,
145
+ EVEN_K: tl.constexpr,
146
+ EVEN_M: tl.constexpr,
147
+ EVEN_N: tl.constexpr,
148
+ ):
149
+ if NTokens is not None:
150
+ M = tl.load(NTokens)
151
+ M_BLOCKS = tl.cdiv(M, BLOCK_M)
152
+ N_BLOCKS = tl.cdiv(N, BLOCK_N)
153
+ num_tiles = M_BLOCKS * N_BLOCKS
154
+
155
+ # Persistent kernel: each program handles multiple tiles
156
+ grid_size = tl.num_programs(0)
157
+ for pid in range(tl.program_id(0), num_tiles, grid_size):
158
+ pid_m = pid // N_BLOCKS
159
+ pid_n = pid % N_BLOCKS
160
+
161
+ offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
162
+ offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
163
+ offs_k = tl.arange(0, BLOCK_K)
164
+
165
+ a_ptrs = A + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
166
+ b1_ptrs = B1 + (offs_k[:, None] * stride_b1k + offs_n[None, :] * stride_b1n)
167
+ b2_ptrs = B2 + (offs_k[:, None] * stride_b2k + offs_n[None, :] * stride_b2n)
168
+
169
+ acc1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
170
+ acc2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
171
+ m_mask = offs_m[:, None] < M
172
+ n_mask = offs_n[None, :] < N
173
+
174
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
175
+ if EVEN_K:
176
+ a = tl.load(a_ptrs, mask=m_mask, other=0.0) if not EVEN_M else tl.load(a_ptrs)
177
+ b1 = tl.load(b1_ptrs, mask=n_mask, other=0.0) if not EVEN_N else tl.load(b1_ptrs)
178
+ b2 = tl.load(b2_ptrs, mask=n_mask, other=0.0) if not EVEN_N else tl.load(b2_ptrs)
179
+ else:
180
+ k_rem = K - k * BLOCK_K
181
+ k_mask_m = offs_k[None, :] < k_rem
182
+ k_mask_n = offs_k[:, None] < k_rem
183
+ a = tl.load(a_ptrs, mask=m_mask & k_mask_m, other=0.0)
184
+ b1 = tl.load(b1_ptrs, mask=k_mask_n & n_mask, other=0.0)
185
+ b2 = tl.load(b2_ptrs, mask=k_mask_n & n_mask, other=0.0)
186
+
187
+ tl.multiple_of(a_ptrs, [16, 16])
188
+ tl.multiple_of(b1_ptrs, [16, 16])
189
+ tl.multiple_of(b2_ptrs, [16, 16])
190
+ acc1 += tl.dot(a, b1)
191
+ acc2 += tl.dot(a, b2)
192
+
193
+ a_ptrs += BLOCK_K * stride_ak
194
+ b1_ptrs += BLOCK_K * stride_b1k
195
+ b2_ptrs += BLOCK_K * stride_b2k
196
+
197
+ out = compute_swiglu(acc1, acc2, 1.0, alpha, limit)
198
+ out = out.to(tl.float16)
199
+
200
+ out_ptrs = Out + (offs_m[:, None] * stride_outm + offs_n[None, :] * stride_outn)
201
+ c_mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
202
+ tl.store(out_ptrs, out, mask=c_mask)
203
+
204
+
205
+ # -----------------------------------------------------------------------------
206
+ # Autotuned wrapper (backward compatible, uses simpler kernel for reliability)
207
+ # -----------------------------------------------------------------------------
208
+ @triton.autotune(
209
+ configs=[
210
+ triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 8}, num_warps=4, num_stages=3),
211
+ triton.Config({"BLOCK_M": 128, "BLOCK_N": 256, "BLOCK_K": 32, "GROUP_SIZE_M": 8}, num_warps=8, num_stages=3),
212
+ triton.Config({"BLOCK_M": 256, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_SIZE_M": 8}, num_warps=8, num_stages=2),
213
+ triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 64, "GROUP_SIZE_M": 8}, num_warps=8, num_stages=2),
214
+ triton.Config({"BLOCK_M": 64, "BLOCK_N": 256, "BLOCK_K": 64, "GROUP_SIZE_M": 4}, num_warps=4, num_stages=3),
215
+ triton.Config({"BLOCK_M": 256, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_SIZE_M": 4}, num_warps=4, num_stages=3),
216
+ ],
217
+ key=["M", "N", "K"],
218
+ )
219
+ @triton.heuristics(
220
+ {
221
+ "EVEN_K": lambda args: args["K"] % args["BLOCK_K"] == 0,
222
+ "EVEN_M": lambda args: args["M"] % args["BLOCK_M"] == 0,
223
+ "EVEN_N": lambda args: args["N"] % args["BLOCK_N"] == 0,
224
+ }
225
+ )
226
+ @triton.jit
227
+ def _dual_gemm_swiglu_autotuned(
228
+ a_ptr, b1_ptr, b2_ptr, c_ptr,
229
+ M, N, K,
230
+ stride_am, stride_ak, stride_b1k, stride_b1n, stride_b2k, stride_b2n, stride_cm, stride_cn,
231
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr,
232
+ EVEN_K: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr,
233
+ alpha: tl.constexpr,
234
+ ):
235
+ pid = tl.program_id(axis=0)
236
+ num_pid_m = tl.cdiv(M, BLOCK_M)
237
+ num_pid_n = tl.cdiv(N, BLOCK_N)
238
+ num_pid_in_group = GROUP_SIZE_M * num_pid_n
239
+ group_id = pid // num_pid_in_group
240
+ first_pid_m = group_id * GROUP_SIZE_M
241
+ group_size_m = tl.minimum(num_pid_m - first_pid_m, GROUP_SIZE_M)
242
+ pid_in_group = pid % num_pid_in_group
243
+ pid_m = first_pid_m + (pid_in_group % group_size_m)
244
+ pid_n = pid_in_group // group_size_m
245
+
246
+ offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
247
+ offs_n = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
248
+ offs_k = tl.arange(0, BLOCK_K)
249
+
250
+ a_ptrs = a_ptr + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
251
+ b1_ptrs = b1_ptr + (offs_k[:, None] * stride_b1k + offs_n[None, :] * stride_b1n)
252
+ b2_ptrs = b2_ptr + (offs_k[:, None] * stride_b2k + offs_n[None, :] * stride_b2n)
253
+
254
+ acc1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
255
+ acc2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
256
+ m_mask = offs_m[:, None] < M
257
+ n_mask = offs_n[None, :] < N
258
+
259
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
260
+ if EVEN_K:
261
+ a = tl.load(a_ptrs) if EVEN_M else tl.load(a_ptrs, mask=m_mask, other=0.0)
262
+ b1 = tl.load(b1_ptrs) if EVEN_N else tl.load(b1_ptrs, mask=n_mask, other=0.0)
263
+ b2 = tl.load(b2_ptrs) if EVEN_N else tl.load(b2_ptrs, mask=n_mask, other=0.0)
264
+ else:
265
+ k_rem = K - k * BLOCK_K
266
+ k_mask_m = offs_k[None, :] < k_rem
267
+ k_mask_n = offs_k[:, None] < k_rem
268
+ a = tl.load(a_ptrs, mask=m_mask & k_mask_m, other=0.0)
269
+ b1 = tl.load(b1_ptrs, mask=k_mask_n & n_mask, other=0.0)
270
+ b2 = tl.load(b2_ptrs, mask=k_mask_n & n_mask, other=0.0)
271
+
272
+ tl.multiple_of(a_ptrs, [16, 16])
273
+ tl.multiple_of(b1_ptrs, [16, 16])
274
+ tl.multiple_of(b2_ptrs, [16, 16])
275
+ acc1 += tl.dot(a, b1)
276
+ acc2 += tl.dot(a, b2)
277
+ a_ptrs += BLOCK_K * stride_ak
278
+ b1_ptrs += BLOCK_K * stride_b1k
279
+ b2_ptrs += BLOCK_K * stride_b2k
280
+
281
+ out = compute_swiglu(acc1, acc2, 1.0, alpha, 0.0) # 0.0 = no clip
282
+ c_ptrs = c_ptr + (offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn)
283
+ c_mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
284
+ tl.store(c_ptrs, out.to(tl.float16), mask=c_mask)
285
+
286
+
287
+ def dual_gemm_swiglu(
288
+ a: torch.Tensor,
289
+ b1: torch.Tensor,
290
+ b2: torch.Tensor,
291
+ n_tokens: Optional[torch.Tensor] = None,
292
+ alpha: float = 1.0,
293
+ limit: Optional[float] = None,
294
+ ) -> torch.Tensor:
295
+ """Fused Dual GEMM + SwiGLU: silu(A @ B1) * (A @ B2)."""
296
+ if a.ndim != 2 or b1.ndim != 2 or b2.ndim != 2:
297
+ raise ValueError("Expected 2D tensors: a[M,K], b1[K,N], b2[K,N].")
298
+ if a.shape[1] != b1.shape[0] or a.shape[1] != b2.shape[0]:
299
+ raise ValueError("Incompatible shapes for dual GEMM.")
300
+ if b1.shape[1] != b2.shape[1]:
301
+ raise ValueError("b1 and b2 must have same N dimension.")
302
+ if not (a.is_cuda and b1.is_cuda and b2.is_cuda):
303
+ raise ValueError("All tensors must be on a CUDA/ROCm device.")
304
+ if a.dtype != torch.float16 or b1.dtype != torch.float16 or b2.dtype != torch.float16:
305
+ raise ValueError("This kernel currently expects float16 inputs.")
306
+
307
+ a = a.contiguous()
308
+ b1 = b1.contiguous()
309
+ b2 = b2.contiguous()
310
+
311
+ M, K = a.shape
312
+ _, N = b1.shape
313
+ c = torch.empty((M, N), device=a.device, dtype=torch.float16)
314
+
315
+ grid = lambda META: (triton.cdiv(M, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),)
316
+
317
+ _dual_gemm_swiglu_autotuned[grid](
318
+ a, b1, b2, c,
319
+ M=M, N=N, K=K,
320
+ stride_am=a.stride(0), stride_ak=a.stride(1),
321
+ stride_b1k=b1.stride(0), stride_b1n=b1.stride(1),
322
+ stride_b2k=b2.stride(0), stride_b2n=b2.stride(1),
323
+ stride_cm=c.stride(0), stride_cn=c.stride(1),
324
+ alpha=alpha,
325
+ )
326
+ return c
327
+
328
+
329
+ def reference_dual_gemm_swiglu(a: torch.Tensor, b1: torch.Tensor, b2: torch.Tensor) -> torch.Tensor:
330
+ x1 = a @ b1
331
+ x2 = a @ b2
332
+ return torch.nn.functional.silu(x1) * x2
333
+
334
+
335
+ def test_correctness(device: str = "cuda", maxtol: float = 2e-2, rmstol: float = 4e-3) -> bool:
336
+ from testing import assert_close
337
+
338
+ torch.manual_seed(42)
339
+ shapes = [
340
+ (128, 64, 128),
341
+ (256, 256, 512),
342
+ (1024, 512, 1024),
343
+ (4096, 3648, 8192),
344
+ (7, 13, 17),
345
+ (100, 200, 150),
346
+ ]
347
+ input_scale = 0.125
348
+ all_pass = True
349
+ for m, n, k in shapes:
350
+ a = torch.randn((m, k), device=device, dtype=torch.float16) * input_scale
351
+ b1 = torch.randn((k, n), device=device, dtype=torch.float16) * input_scale
352
+ b2 = torch.randn((k, n), device=device, dtype=torch.float16) * input_scale
353
+ with torch.no_grad():
354
+ ref = reference_dual_gemm_swiglu(a.float(), b1.float(), b2.float()).to(torch.float16)
355
+ out = dual_gemm_swiglu(a, b1, b2)
356
+ desc = f"[shape ({m},{n},{k})]"
357
+ try:
358
+ assert_close(ref, out, maxtol=maxtol, rmstol=rmstol, description=desc, verbose=True)
359
+ print(f" {desc} PASS")
360
+ except AssertionError as e:
361
+ print(f" {desc} FAIL: {e}")
362
+ all_pass = False
363
+ return all_pass
364
+
365
+
366
+ def benchmark(m: int, n: int, k: int, warmup: int, iters: int, input_scale: float) -> None:
367
+ device = "cuda"
368
+ a = torch.randn((m, k), device=device, dtype=torch.float16) * input_scale
369
+ b1 = torch.randn((k, n), device=device, dtype=torch.float16) * input_scale
370
+ b2 = torch.randn((k, n), device=device, dtype=torch.float16) * input_scale
371
+ for _ in range(warmup):
372
+ _ = dual_gemm_swiglu(a, b1, b2)
373
+ torch.cuda.synchronize()
374
+ start = torch.cuda.Event(enable_timing=True)
375
+ end = torch.cuda.Event(enable_timing=True)
376
+ start.record()
377
+ for _ in range(iters):
378
+ _ = dual_gemm_swiglu(a, b1, b2)
379
+ end.record()
380
+ torch.cuda.synchronize()
381
+ avg_ms = start.elapsed_time(end) / iters
382
+ total_flops = 4 * m * n * k
383
+ tflops = (total_flops / (avg_ms * 1e-3)) / 1e12
384
+ print(f"[kernel] shape=({m}, {n}, {k}) avg={avg_ms:.3f} ms, ~{tflops:.2f} TFLOP/s")
385
+
386
+
387
+ def main() -> None:
388
+ parser = argparse.ArgumentParser(description="Dual GEMM + SwiGLU (swiglu.py build pattern)")
389
+ parser.add_argument("--m", type=int, default=4096)
390
+ parser.add_argument("--n", type=int, default=3648)
391
+ parser.add_argument("--k", type=int, default=8192)
392
+ parser.add_argument("--warmup", type=int, default=10)
393
+ parser.add_argument("--iters", type=int, default=50)
394
+ parser.add_argument("--input-scale", type=float, default=0.125)
395
+ parser.add_argument("--test-only", action="store_true")
396
+ parser.add_argument("--bench-only", action="store_true")
397
+ args = parser.parse_args()
398
+
399
+ if not torch.cuda.is_available():
400
+ print("ERROR: No CUDA/ROCm GPU detected.")
401
+ raise SystemExit(1)
402
+
403
+ if not args.bench_only:
404
+ print("Running correctness tests...")
405
+ t0 = time.time()
406
+ ok = test_correctness()
407
+ print(f"Correctness: {'PASS' if ok else 'FAIL'} ({time.time()-t0:.2f}s)")
408
+
409
+ if not args.test_only:
410
+ print("\nRunning benchmark...")
411
+ t0 = time.time()
412
+ benchmark(args.m, args.n, args.k, args.warmup, args.iters, args.input_scale)
413
+ print(f"[done] elapsed={time.time()-t0:.2f}s")
414
+
415
+
416
+ if __name__ == "__main__":
417
+ main()
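A short sketch of the standalone entry point above; note that only alpha reaches the autotuned kernel in this wrapper (n_tokens and limit are reserved for the persistent _dual_gemm_swiglu variant):

import torch
from dual_gemm_swiglu_full import dual_gemm_swiglu, reference_dual_gemm_swiglu

a  = torch.randn(1024, 1024, device="cuda", dtype=torch.float16) * 0.125
b1 = torch.randn(1024, 512,  device="cuda", dtype=torch.float16) * 0.125
b2 = torch.randn(1024, 512,  device="cuda", dtype=torch.float16) * 0.125

out = dual_gemm_swiglu(a, b1, b2, alpha=1.0)  # alpha=1.0 gives standard SiLU gating
ref = reference_dual_gemm_swiglu(a.float(), b1.float(), b2.float()).to(torch.float16)
print((out - ref).abs().max())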
dual_gemm_swiglu_mxfp.py ADDED
@@ -0,0 +1,119 @@
1
+ """
2
+ Option 1.5 / 2: Dual GEMM + SwiGLU with MXFP4 weights.
3
+
4
+ - Option 1 (pre-dequant): use dual_gemm_swiglu from amd_dual_gemm_swiglu with (mx_tensor, mx_scale)
5
+ - Option 1.5 (tiled pre-dequant): upcast B in K-blocks, never materialize full fp16 B. Saves memory.
6
+ - Option 2 (fused): would decode MXFP in-kernel; blocked by ROCm Triton limitations (tl.cat, indexing).
7
+ Currently falls back to option 1.5.
8
+
9
+ Usage:
10
+ from dual_gemm_swiglu_mxfp import dual_gemm_swiglu_mxfp_tiled
11
+ out = dual_gemm_swiglu_mxfp_tiled(a, (b1_mx, b1_scale), (b2_mx, b2_scale))
12
+ """
13
+
14
+ import os
15
+ import sys
16
+
17
+ _script_dir = os.path.dirname(os.path.abspath(__file__))
18
+ if _script_dir not in sys.path:
19
+ sys.path.insert(0, _script_dir)
20
+
21
+ import torch
22
+
23
+ try:
24
+ from numerics_details.mxfp_details import upcast_mxfp4_to_fp16
25
+ from amd_dual_gemm_swiglu import reference_dual_gemm_swiglu
26
+ _HAS_DEPS = True
27
+ except ImportError:
28
+ _HAS_DEPS = False
29
+
30
+ MXFP_BLOCK = 32 # N must be multiple of 32 for scale
31
+
32
+
33
+ def _upcast_slice(mx_tensor: torch.Tensor, mx_scale: torch.Tensor, k_start: int, k_end: int) -> torch.Tensor:
34
+ """Upcast MXFP4 slice [k_start:k_end, :] to fp16 [k_end-k_start, N]."""
35
+ return upcast_mxfp4_to_fp16(
36
+ mx_tensor[k_start:k_end, :],
37
+ mx_scale[k_start:k_end, :],
38
+ block_m=k_end - k_start,
39
+ block_k=64, # N must be divisible by block_k
40
+ verbose=False,
41
+ )
42
+
43
+
44
+ def dual_gemm_swiglu_mxfp_tiled(
45
+ a: torch.Tensor,
46
+ b1_mx: torch.Tensor,
47
+ b1_scale: torch.Tensor,
48
+ b2_mx: torch.Tensor,
49
+ b2_scale: torch.Tensor,
50
+ block_k: int = 64,
51
+ ) -> torch.Tensor:
52
+ """
53
+ Dual GEMM + SwiGLU with MXFP4 B1, B2 using tiled pre-dequant (Option 1.5).
54
+ Upcasts B in K-blocks; never materializes full fp16 B. Saves memory vs full pre-dequant.
55
+ """
56
+ if not _HAS_DEPS:
57
+ raise ImportError("Requires numerics_details and amd_dual_gemm_swiglu")
58
+ M, K = a.shape
59
+ N = b1_mx.shape[1] * 2
60
+ assert b1_mx.shape == (K, N // 2) and b1_scale.shape == (K, N // 32)
61
+ assert b2_mx.shape == (K, N // 2) and b2_scale.shape == (K, N // 32)
62
+ assert K % block_k == 0 and N % MXFP_BLOCK == 0
63
+ assert block_k % MXFP_BLOCK == 0
64
+
65
+ a = a.contiguous()
66
+ c = torch.zeros((M, N), device=a.device, dtype=torch.float16)
67
+ # Option 1.5: Accumulate acc1 = A@B1 and acc2 = A@B2 in K-blocks, then out = silu(acc1)*acc2.
68
+ # Never materialize full fp16 B - upcast slice by slice. Saves O(K*N) -> O(block_k*N) memory.
69
+ acc1 = torch.zeros((M, N), device=a.device, dtype=torch.float32)
70
+ acc2 = torch.zeros((M, N), device=a.device, dtype=torch.float32)
71
+ for k_start in range(0, K, block_k):
72
+ k_end = k_start + block_k
73
+ b1_slice = _upcast_slice(b1_mx, b1_scale, k_start, k_end)
74
+ b2_slice = _upcast_slice(b2_mx, b2_scale, k_start, k_end)
75
+ # Partial GEMM: acc1 += A[:, k_start:k_end] @ b1_slice
76
+ # Plain PyTorch matmuls per K-block for now; a dedicated Triton group-GEMM kernel could fuse these later.
77
+ # Accumulate in fp32 so the tiled path stays numerically close to the full pre-dequant path.
78
+ acc1 += (a[:, k_start:k_end].float() @ b1_slice.float())
79
+ acc2 += (a[:, k_start:k_end].float() @ b2_slice.float())
80
+ # SwiGLU
81
+ silu = torch.nn.functional.silu(acc1.to(torch.float16))
82
+ out = (silu * acc2.to(torch.float16)).to(torch.float16)
83
+ return out
84
+
85
+
86
+ def dual_gemm_swiglu_mxfp_predequant(a, b1_mx, b1_scale, b2_mx, b2_scale):
87
+ """Option 1: full pre-dequant, then standard dual GEMM."""
88
+ from amd_dual_gemm_swiglu import dual_gemm_swiglu
89
+ b1 = upcast_mxfp4_to_fp16(b1_mx, b1_scale, verbose=False)
90
+ b2 = upcast_mxfp4_to_fp16(b2_mx, b2_scale, verbose=False)
91
+ return dual_gemm_swiglu(a, b1, b2)
92
+
93
+
94
+ if __name__ == "__main__":
95
+ if not _HAS_DEPS:
96
+ print("Missing deps")
97
+ sys.exit(1)
98
+ if not torch.cuda.is_available():
99
+ print("No GPU")
100
+ sys.exit(1)
101
+ device = "cuda"
102
+ M, N, K = 256, 128, 512
103
+ torch.manual_seed(42)
104
+ a = torch.randn((M, K), device=device, dtype=torch.float16) * 0.1
105
+ from example_dual_gemm_mxfp import downcast_fp16_to_mxfp4
106
+ b1_fp = torch.randn((K, N), device=device, dtype=torch.float16) * 0.1
107
+ b2_fp = torch.randn((K, N), device=device, dtype=torch.float16) * 0.1
108
+ b1_mx, b1_scale = downcast_fp16_to_mxfp4(b1_fp, block_k=64)
109
+ b2_mx, b2_scale = downcast_fp16_to_mxfp4(b2_fp, block_k=64)
110
+ print("Option 1 (pre-dequant):")
111
+ out1 = dual_gemm_swiglu_mxfp_predequant(a, b1_mx, b1_scale, b2_mx, b2_scale)
112
+ print("Option 1.5 (tiled pre-dequant):")
113
+ out15 = dual_gemm_swiglu_mxfp_tiled(a, b1_mx, b1_scale, b2_mx, b2_scale)
114
+ ref = reference_dual_gemm_swiglu(a.float(), b1_fp.float(), b2_fp.float()).to(torch.float16)
115
+ err1 = (out1.float() - ref.float()).abs().max().item()
116
+ err15 = (out15.float() - ref.float()).abs().max().item()
117
+ print(f" Option 1 err: {err1:.2e}")
118
+ print(f" Option 1.5 err: {err15:.2e}")
119
+ print(f" Option 1 vs 1.5 diff: {(out1.float() - out15.float()).abs().max().item():.2e}")
example_dual_gemm_mxfp.py ADDED
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Example: Dual GEMM + SwiGLU with MXFP4 weights (pre-dequant option 1).
4
+ Quantizes B1, B2 to MXFP4, then upcasts and runs the fused GEMM.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
10
+
11
+ import torch
12
+ from amd_dual_gemm_swiglu import dual_gemm_swiglu, reference_dual_gemm_swiglu
13
+ from numerics_details.mxfp_details._downcast_to_mxfp import _downcast_to_mxfp
14
+ from numerics_details.mxfp_details import upcast_mxfp4_to_fp16
15
+
16
+ MXFP_BLOCK_SIZE_PY = 32
17
+
18
+
19
+ def downcast_fp16_to_mxfp4(src: torch.Tensor, block_m: int = 128, block_k: int = 64):
20
+ """fp16 [M,K] -> (mx_tensor [M,K//2], mx_scale [M,K//32])."""
21
+ assert block_k % MXFP_BLOCK_SIZE_PY == 0
22
+ M, K = src.shape
23
+ mx_tensor = torch.empty((M, K // 2), device=src.device, dtype=torch.uint8)
24
+ mx_scale = torch.empty((M, K // 32), device=src.device, dtype=torch.uint8)
25
+ grid = ((M + block_m - 1) // block_m, (K + block_k - 1) // block_k)
26
+ _downcast_to_mxfp[grid](
27
+ mx_tensor, mx_tensor.stride(0), 1,
28
+ mx_scale, mx_scale.stride(0), mx_scale.stride(1),
29
+ src, src.stride(0), src.stride(1),
30
+ M, K,
31
+ BLOCK_SIZE_OUT_DIM=block_m,
32
+ BLOCK_SIZE_QUANT_DIM=block_k,
33
+ DEQUANT_SCALE_ROUNDING_MODE=0,
34
+ )
35
+ return mx_tensor, mx_scale
36
+
37
+
38
+ def main():
39
+ if not torch.cuda.is_available():
40
+ print("No CUDA/ROCm device.")
41
+ return
42
+ device = "cuda"
43
+ print("Device:", torch.cuda.get_device_name(0))
44
+
45
+ M, N, K = 256, 128, 512
46
+ torch.manual_seed(42)
47
+ a = torch.randn((M, K), device=device, dtype=torch.float16) * 0.1
48
+ b1_fp16 = torch.randn((K, N), device=device, dtype=torch.float16) * 0.1
49
+ b2_fp16 = torch.randn((K, N), device=device, dtype=torch.float16) * 0.1
50
+
51
+ # Quantize B1, B2 to MXFP4 (need K, N multiples of 32 for block_k)
52
+ block_k = 64
53
+ b1_mx, b1_scale = downcast_fp16_to_mxfp4(b1_fp16, block_k=block_k)
54
+ b2_mx, b2_scale = downcast_fp16_to_mxfp4(b2_fp16, block_k=block_k)
55
+ print(f"Quantized B1: {b1_mx.shape}, {b1_scale.shape}")
56
+
57
+ # Run dual GEMM with MXFP4 weights (pre-dequant)
58
+ out_mxfp = dual_gemm_swiglu(a, (b1_mx, b1_scale), (b2_mx, b2_scale))
59
+ print(f"Output (MXFP4 path): {out_mxfp.shape}")
60
+
61
+ # Reference with fp16
62
+ out_ref = reference_dual_gemm_swiglu(a.float(), b1_fp16.float(), b2_fp16.float()).to(torch.float16)
63
+ err = (out_mxfp.float() - out_ref.float()).abs().max().item()
64
+ rel = err / (out_ref.float().abs().max().item() + 1e-6)
65
+ print(f"vs fp16 ref: max abs err={err:.2e}, rel={rel:.2e}")
66
+ print("Done.")
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
example_moe_compaction_gemm.py ADDED
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Example: Mask compaction + Dual GEMM integration (MoE-style).
4
+ Before dual GEMM: compact (Yv, Yi) per row based on BitMask.
5
+ Then use compacted tensors for routing into expert weights.
6
+
7
+ ROCm note: tl.store with dynamic write_indx may fail on ROCm Triton.
8
+ If so, use the PyTorch fallback in mask_compaction.py.
9
+ """
10
+
11
+ import os
12
+ import sys
13
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
14
+
15
+ import torch
16
+
17
+ from mask_compaction import masked_compaction, masked_compaction_torch_fallback
18
+ from amd_dual_gemm_swiglu import dual_gemm_swiglu
19
+
20
+
21
+ def example_integration():
22
+ """Sketch: compact routing outputs, then run dual GEMM on routed experts."""
23
+ device = "cuda"
24
+ if not torch.cuda.is_available():
25
+ print("No GPU")
26
+ return
27
+
28
+ M, K, N = 256, 64, 128 # tokens, hidden, expert dim
29
+ top_k = 8
30
+ num_experts = 4
31
+
32
+ # Simulate routing: Yv [M, K] values, Yi [M, K] expert indices (0..num_experts-1)
33
+ torch.manual_seed(42)
34
+ Yv = torch.randn(M, K, device=device, dtype=torch.float16) * 0.1
35
+ Yi = torch.randint(0, num_experts, (M, K), device=device, dtype=torch.int32)
36
+
37
+ # BitMask [M, ceil(K/32)]: 1 = use, 0 = discard (e.g. from load balance)
38
+ BitMask = torch.ones(M, (K + 31) // 32, device=device, dtype=torch.int32)
39
+ BitMask[:, 0] = 0x55555555 # example: alternating bits
40
+
41
+ # 1) Compact (Yv, Yi) per row based on BitMask
42
+ try:
43
+ RetYv, RetYi = masked_compaction(Yv, Yi, BitMask, sentinel=float("nan"))
44
+ print("Compaction: Triton kernel OK")
45
+ except Exception as e:
46
+ print(f"Compaction: Triton failed ({e}), using PyTorch fallback")
47
+ RetYv, RetYi = masked_compaction_torch_fallback(Yv, Yi, BitMask, sentinel=float("nan"))
48
+
49
+ # 2) Use compacted indices for routing into expert weights
50
+ # Expert weights: B1[E,K,N], B2[E,K,N] or similar. For simplicity, flat GEMM:
51
+ # A = routed activations [M, K], B1/B2 = expert weights [K, N]
52
+ # This is a simplified sketch; real MoE has per-expert B.
53
+ B1 = torch.randn(K, N, device=device, dtype=torch.float16) * 0.1
54
+ B2 = torch.randn(K, N, device=device, dtype=torch.float16) * 0.1
55
+
56
+ # Use RetYv as activations (compacted); pad/truncate to [M, K] if needed
57
+ A = RetYv[:, :K].contiguous()
58
+ if A.shape[1] < K:
59
+ A = torch.nn.functional.pad(A, (0, K - A.shape[1]), value=0)
60
+
61
+ # 3) Dual GEMM
62
+ out = dual_gemm_swiglu(A, B1, B2)
63
+ print(f"Dual GEMM output: {out.shape}")
64
+ print("Done.")
65
+
66
+
67
+ if __name__ == "__main__":
68
+ example_integration()
example_mxfp_roundtrip.py ADDED
@@ -0,0 +1,72 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Example: use _downcast_to_mxfp (fp16 -> MXFP4) and _upcast_from_mxfp (MXFP4 -> fp16)
4
+ for a round-trip on the remote server.
5
+
6
+ Usage on remote:
7
+ cd /root/kernels
8
+ python example_mxfp_roundtrip.py
9
+ """
10
+
11
+ import os
12
+ import sys
13
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
14
+
15
+ import torch
16
+ from numerics_details.mxfp_details._downcast_to_mxfp import _downcast_to_mxfp
17
+ from numerics_details.mxfp_details import upcast_mxfp4_to_fp16
18
+
19
+ MXFP_BLOCK_SIZE_PY = 32 # Python int for checks (tl.constexpr in kernels)
20
+
21
+
22
+ def downcast_fp16_to_mxfp4(src: torch.Tensor, block_m: int = 128, block_k: int = 64):
23
+ """Convert fp16 tensor [M, K] to MXFP4 (uint8 mx_tensor + uint8 mx_scale)."""
24
+ assert src.dim() == 2 and src.dtype in (torch.float16, torch.bfloat16)
25
+ assert block_k % MXFP_BLOCK_SIZE_PY == 0, f"block_k must be multiple of {MXFP_BLOCK_SIZE_PY}"
26
+ M, K = src.shape
27
+
28
+ # Outputs: mx_tensor [M, K//2] uint8, mx_scale [M, K//32] uint8
29
+ mx_tensor = torch.empty((M, K // 2), device=src.device, dtype=torch.uint8)
30
+ mx_scale = torch.empty((M, K // 32), device=src.device, dtype=torch.uint8)
31
+
32
+ grid = ((M + block_m - 1) // block_m, (K + block_k - 1) // block_k)
33
+ _downcast_to_mxfp[grid](
34
+ mx_tensor, mx_tensor.stride(0), 1,
35
+ mx_scale, mx_scale.stride(0), mx_scale.stride(1),
36
+ src, src.stride(0), src.stride(1),
37
+ M, K,
38
+ BLOCK_SIZE_OUT_DIM=block_m,
39
+ BLOCK_SIZE_QUANT_DIM=block_k,
40
+ DEQUANT_SCALE_ROUNDING_MODE=0,
41
+ )
42
+ return mx_tensor, mx_scale
43
+
44
+
45
+ def main():
46
+ if not torch.cuda.is_available():
47
+ print("No CUDA/ROCm device.")
48
+ return
49
+ device = "cuda"
50
+ print("Device:", torch.cuda.get_device_name(0))
51
+
52
+ # Create random fp16 tensor
53
+ M, K = 256, 128
54
+ src = torch.randn(M, K, device=device, dtype=torch.float16) * 0.1
55
+
56
+ # Downcast fp16 -> MXFP4
57
+ mx_tensor, mx_scale = downcast_fp16_to_mxfp4(src)
58
+ print(f"Downcast OK: mx_tensor {mx_tensor.shape}, mx_scale {mx_scale.shape}")
59
+
60
+ # Upcast MXFP4 -> fp16
61
+ recovered = upcast_mxfp4_to_fp16(mx_tensor, mx_scale)
62
+ print(f"Upcast OK: recovered {recovered.shape}")
63
+
64
+ # Compare
65
+ err = (src.float() - recovered.float()).abs().max().item()
66
+ rel = err / (src.float().abs().max().item() + 1e-6)
67
+ print(f"Round-trip max abs err: {err:.2e}, rel: {rel:.2e}")
68
+ print("Done.")
69
+
70
+
71
+ if __name__ == "__main__":
72
+ main()
hardware_submission.py ADDED
@@ -0,0 +1,121 @@
1
+ import torch
2
+ import triton
3
+ import triton.language as tl
4
+ import os
5
+
6
+ # 1. HARDWARE DIAGNOSTICS & OS PREP
7
+ def check_environment():
8
+ cuda_avail = torch.cuda.is_available()
9
+ if cuda_avail:
10
+ # Optimization: Force kernel arguments to device to save PCIe latency
11
+ os.environ["HIP_FORCE_DEV_KERNARG"] = "1"
12
+ # Disable compiler cache for benchmarking clean runs
13
+ os.environ["TRITON_CACHE_DIR"] = ""
14
+
15
+ check_environment()
16
+
17
+ # 2. THE SOL KERNEL
18
+ @triton.autotune(
19
+ configs=[
20
+ triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=8),
21
+ triton.Config({'BLOCK_M': 64, 'BLOCK_N': 256, 'BLOCK_K': 64, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=8),
22
+ ],
23
+ key=['M', 'N', 'K'],
24
+ )
25
+ @triton.jit
26
+ def dual_gemm_hardware_kernel(
27
+ a_ptr, b1_ptr, b2_ptr, c_ptr,
28
+ sfa_ptr, sfb1_ptr, sfb2_ptr,
29
+ M, N, K, L,
30
+ stride_am, stride_ak, stride_al,
31
+ stride_bn, stride_bk, stride_bl,
32
+ stride_cm, stride_cn, stride_cl,
33
+ BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
34
+ GROUP_SIZE_M: tl.constexpr,
35
+ ):
36
+ # Persistent Grid logic
37
+ pid = tl.program_id(0)
38
+ num_pid_m = tl.cdiv(M, BLOCK_M)
39
+ num_pid_n = tl.cdiv(N, BLOCK_N)
40
+ total_tiles = num_pid_m * num_pid_n * L
41
+
42
+ for tile_idx in tl.range(pid, total_tiles, tl.num_programs(0)):
43
+ l_idx = tile_idx // (num_pid_m * num_pid_n)
44
+ tile_rem = tile_idx % (num_pid_m * num_pid_n)
45
+
46
+ # Swizzle for L2 Locality
47
+ pid_m = tile_rem // num_pid_n
+ pid_n = tile_rem % num_pid_n
+ pid_m, pid_n = tl.swizzle2d(pid_m, pid_n, num_pid_m, num_pid_n, GROUP_SIZE_M)
48
+
49
+ # Base offsets
50
+ rm = pid_m * BLOCK_M
51
+ rn = pid_n * BLOCK_N
52
+
53
+ # Ranges
54
+ offs_m = rm + tl.arange(0, BLOCK_M)
55
+ offs_n = rn + tl.arange(0, BLOCK_N)
56
+ offs_k = tl.arange(0, BLOCK_K)
57
+
58
+ # Memory Pointers
59
+ a_ptrs = a_ptr + l_idx * stride_al + (offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak)
60
+ b1_ptrs = b1_ptr + l_idx * stride_bl + (offs_n[None, :] * stride_bn + offs_k[:, None] * stride_bk)
61
+ b2_ptrs = b2_ptr + l_idx * stride_bl + (offs_n[None, :] * stride_bn + offs_k[:, None] * stride_bk)
62
+
63
+ # Accumulators
64
+ acc1 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
65
+ acc2 = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
66
+
67
+ # Scale Factor Load Offsets (Assuming OCP 1 scale per 16 elements)
68
+ # sfa: (M, K/16, L), sfb: (N, K/16, L)
69
+ sfa_base = sfa_ptr + l_idx * (M * (K // 16)) + (offs_m[:, None] * (K // 16))
70
+ sfb1_base = sfb1_ptr + l_idx * (N * (K // 16)) + (offs_n[None, :] * (K // 16))
71
+ sfb2_base = sfb2_ptr + l_idx * (N * (K // 16)) + (offs_n[None, :] * (K // 16))
72
+
73
+ for k in range(0, tl.cdiv(K, BLOCK_K)):
74
+ # 1. Load Data
75
+ a = tl.load(a_ptrs, mask=(offs_m[:, None] < M) & (offs_k[None, :] < K - k * BLOCK_K), other=0.0)
76
+ b1 = tl.load(b1_ptrs, mask=(offs_n[None, :] < N) & (offs_k[:, None] < K - k * BLOCK_K), other=0.0)
77
+ b2 = tl.load(b2_ptrs, mask=(offs_n[None, :] < N) & (offs_k[:, None] < K - k * BLOCK_K), other=0.0)
78
+
79
+ # 2. Load Scales for current K-block
80
+ # Blackwell uses a 32x4 atom, but for pointers, we load the K-slice
81
+ curr_sfa = tl.load(sfa_base + (k * (BLOCK_K // 16)), mask=(offs_m[:, None] < M), other=1.0)
82
+ curr_sfb1 = tl.load(sfb1_base + (k * (BLOCK_K // 16)), mask=(offs_n[None, :] < N), other=1.0)
83
+ curr_sfb2 = tl.load(sfb2_base + (k * (BLOCK_K // 16)), mask=(offs_n[None, :] < N), other=1.0)
84
+
85
+ # 3. Hardware DOT Scaled
86
+ acc1 = tl.dot_scaled(a, curr_sfa, "e2m1", b1, curr_sfb1, "e2m1", acc1)
87
+ acc2 = tl.dot_scaled(a, curr_sfa, "e2m1", b2, curr_sfb2, "e2m1", acc2)
88
+
89
+ # Advance K
90
+ a_ptrs += BLOCK_K * stride_ak
91
+ b1_ptrs += BLOCK_K * stride_bk
92
+ b2_ptrs += BLOCK_K * stride_bk
93
+
94
+ # 4. Epilogue (Fused SiLU + Gating)
95
+ res1 = acc1.to(tl.float16)
96
+ activated = res1 * tl.sigmoid(res1)
97
+ final_out = activated * acc2.to(tl.float16)
98
+
99
+ # 5. Masked Store
100
+ c_ptrs = c_ptr + l_idx * stride_cl + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn
101
+ tl.store(c_ptrs, final_out, mask=(offs_m[:, None] < M) & (offs_n[None, :] < N))
102
+
103
+ # 3. SUBMISSION INTERFACE
104
+ def dual_gemm_submission(data):
105
+ a, b1, b2, sfa, sfb1, sfb2, c = data
106
+ M, K_packed, L = a.shape
107
+ N = b1.shape[0]
108
+ K = K_packed * 2 # FP4: two 4-bit values packed per byte
109
+
110
+ # Saturate Device (148 for B200, 304 for MI300X)
111
+ num_sms = torch.cuda.get_device_properties(0).multi_processor_count
112
+ grid = (num_sms,)
113
+
114
+ dual_gemm_hardware_kernel[grid](
115
+ a, b1, b2, c, sfa, sfb1, sfb2,
116
+ M, N, K, L,
117
+ a.stride(0), a.stride(1), a.stride(2),
118
+ b1.stride(0), b1.stride(1), b1.stride(2),
119
+ c.stride(0), c.stride(1), c.stride(2)
120
+ )
121
+ return c
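As a sanity check on the persistent launch (grid = (num_CUs,)), a small sketch of how many tiles each CU handles for the benchmark shape used elsewhere in this commit; the CU count is the MI300X figure quoted in the comment above:

import math

M, N, L = 4096, 3648, 1
BLOCK_M, BLOCK_N = 128, 128          # first autotune config
num_cus = 304                        # MI300X
tiles = math.ceil(M / BLOCK_M) * math.ceil(N / BLOCK_N) * L
print(tiles, math.ceil(tiles / num_cus))  # 928 tiles -> at most 4 tiles per CU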
mask_compaction.py ADDED
@@ -0,0 +1,75 @@
1
+ """
2
+ Masked compaction kernel: compact (Yv, Yi) per row based on BitMask.
3
+ Active elements (bit=1) move to front, inactive (bit=0) move to back with sentinel.
4
+ For MoE: use before dual GEMM to get dense top-k for routing into expert weights.
5
+
6
+ ROCm note: tl.store with dynamic write_indx may have limitations. If it fails,
7
+ fall back to PyTorch compaction.
8
+ """
9
+
10
+ import torch
11
+ import triton
12
+ import triton.language as tl
13
+
14
+
15
+ @triton.jit
16
+ def _masked_compaction(
17
+ Yv, Yi, BitMask, stride_bm, stride_bn,
18
+ RetYv, RetYi, sentinel, K: tl.constexpr
19
+ ):
20
+ pid_m = tl.program_id(0)
21
+ yv = tl.load(Yv + pid_m * K + tl.arange(0, K))
22
+ yi = tl.load(Yi + pid_m * K + tl.arange(0, K))
23
+ div = yi // 32
24
+ rem = yi % 32
25
+ active_bits = (tl.load(BitMask + pid_m * stride_bm + div * stride_bn) >> rem) & 1
26
+ exc_cumsum = tl.cumsum(active_bits, 0) - active_bits
27
+ active_flags = active_bits.to(tl.int1)
28
+ rev_arange = tl.where(active_flags, 0, K - 1 - tl.arange(0, K))
29
+ write_indx = exc_cumsum + rev_arange
30
+ yv = tl.where(active_flags, yv, sentinel)
31
+ yi = tl.where(active_flags, yi, sentinel)
32
+ tl.store(RetYv + pid_m * K + write_indx, yv)
33
+ tl.store(RetYi + pid_m * K + write_indx, yi)
34
+
35
+
36
+ def masked_compaction(
37
+ Yv: torch.Tensor, # [M, K] values
38
+ Yi: torch.Tensor, # [M, K] indices (int32)
39
+ BitMask: torch.Tensor, # [M, ceil(K/32)] or similar - 1 bit per position
40
+ sentinel: float = float("nan"),
41
+ ) -> tuple[torch.Tensor, torch.Tensor]:
42
+ """
43
+ Compact Yv, Yi per row: active (BitMask=1) to front, inactive to back with sentinel.
44
+ Returns (RetYv, RetYi) same shape as (Yv, Yi).
45
+ """
46
+ M, K = Yv.shape
47
+ assert Yi.shape == (M, K)
48
+ RetYv = torch.empty_like(Yv)
49
+ RetYi = torch.empty_like(Yi)
50
+ grid = (M,)
51
+ _masked_compaction[grid](
52
+ Yv, Yi, BitMask,
53
+ BitMask.stride(0), BitMask.stride(1),
54
+ RetYv, RetYi, sentinel, K=K,
55
+ )
56
+ return RetYv, RetYi
57
+
58
+
59
+ def masked_compaction_torch_fallback(Yv, Yi, BitMask, sentinel=float("nan")):
60
+ """PyTorch fallback if Triton kernel fails on ROCm."""
61
+ M, K = Yv.shape
62
+ RetYv = torch.full_like(Yv, sentinel)
63
+ RetYi = torch.full_like(Yi, -1)
64
+ for m in range(M):
65
+ # Bit per position k: div=k//32, rem=k%32
66
+ div = torch.arange(K, device=Yv.device) // 32
67
+ rem = torch.arange(K, device=Yv.device) % 32
68
+ active = ((BitMask[m, div] >> rem) & 1).bool()
69
+ n_active = active.sum().item()
70
+ RetYv[m, :n_active] = Yv[m, active]
71
+ RetYi[m, :n_active] = Yi[m, active]
72
+ return RetYv, RetYi
73
+
74
+
75
+ masked_compaction_pytorch = masked_compaction_torch_fallback # alias for import
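A tiny worked example of the compaction semantics (the bit checked for each slot is bit Yi[m, k] of the row's mask, as in the Triton kernel above); the values are made up:

import torch
from mask_compaction import masked_compaction

Yv = torch.tensor([[0.1, 0.2, 0.3, 0.4]], device="cuda", dtype=torch.float16)
Yi = torch.tensor([[5, 2, 7, 0]], device="cuda", dtype=torch.int32)
BitMask = torch.tensor([[0b10100001]], device="cuda", dtype=torch.int32)  # bits 0, 5, 7 set; bit 2 clear

RetYv, RetYi = masked_compaction(Yv, Yi, BitMask, sentinel=float("nan"))
# The slot with Yi == 2 is masked out, so the active entries move to the front:
# RetYv ≈ [0.1, 0.3, 0.4, nan], RetYi = [5, 7, 0, <sentinel>]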
numerics_details/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # numerics_details: MXFP only (use mxfp_details)
2
+ from .mxfp_details import upcast_mxfp4_to_fp16
3
+
4
+ __all__ = ["upcast_mxfp4_to_fp16"]
numerics_details/mxfp_details/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # mxfp_details: MXFP quantize/dequantize kernels
2
+ from .upcast_mxfp4 import upcast_mxfp4_to_fp16
3
+
4
+ __all__ = ["upcast_mxfp4_to_fp16"]
numerics_details/mxfp_details/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (240 Bytes).
 
numerics_details/mxfp_details/__pycache__/_downcast_to_mxfp.cpython-312.pyc ADDED
Binary file (8.91 kB).
 
numerics_details/mxfp_details/__pycache__/_upcast_from_mxfp.cpython-312.pyc ADDED
Binary file (7.88 kB).
 
numerics_details/mxfp_details/__pycache__/upcast_mxfp4.cpython-312.pyc ADDED
Binary file (5.19 kB). View file
 
numerics_details/mxfp_details/_downcast_to_mxfp.py ADDED
@@ -0,0 +1,163 @@
+ # From https://huggingface.co/kernels-community/triton-kernels/blob/main/build/torch-cuda/numerics_details/mxfp_details/_downcast_to_mxfp.py
+
+ import triton
+ import triton.language as tl
+
+ # fmt: off
+
+
+ MXFP_BLOCK_SIZE = tl.constexpr(32)
+
+
+ @triton.jit
+ def _get_max_quant_val(dtype: tl.constexpr):
+     if dtype == tl.uint8:
+         return 6.0
+     elif dtype == tl.float8e5:
+         return 57344.0
+     elif dtype == tl.float8e4nv:
+         return 448.0
+     else:
+         tl.static_assert(False, f"Invalid {dtype=}")
+
+
+ @triton.jit
+ def _compute_quant_and_scale(src_tensor, valid_src_mask, mx_tensor_dtype: tl.constexpr,
+                              DEQUANT_SCALE_ROUNDING_MODE: tl.constexpr = 0):
+     is_fp8: tl.constexpr = mx_tensor_dtype == tl.float8e4nv or mx_tensor_dtype == tl.float8e5
+     BLOCK_SIZE_OUT_DIM: tl.constexpr = src_tensor.shape[0]
+     BLOCK_SIZE_QUANT_DIM: tl.constexpr = src_tensor.shape[1]
+     BLOCK_SIZE_QUANT_MX_SCALE: tl.constexpr = src_tensor.shape[1] // MXFP_BLOCK_SIZE
+
+     # Explicit cast to fp32 since most ops are not supported on bfloat16. We avoid needless conversions to and from bf16
+     f32_tensor = src_tensor.to(tl.float32)
+     abs_tensor = tl.abs(f32_tensor)
+     abs_tensor = tl.where(valid_src_mask, abs_tensor, -1.0)  # Don't consider padding tensors in scale computation
+     abs_tensor = tl.reshape(abs_tensor, [BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, MXFP_BLOCK_SIZE])
+     max_val = tl.max(abs_tensor, axis=2, keep_dims=True)
+     dequant_scale = max_val / _get_max_quant_val(mx_tensor_dtype)
+     if DEQUANT_SCALE_ROUNDING_MODE == 0:
+         # DequantScaleRoundingMode.ROUND_UP
+         # compute 2 ** ceil(log2(dequant_scale))
+         # Adding 0x007FFFFF adds exponent by 1 unless mantissa is all zeros
+         # A corner case: exponent is 0xFF that will overflow but that's already
+         # NaN so assume we don't care.
+         dequant_scale_exponent = (dequant_scale.to(tl.uint32, bitcast=True) + 0x007FFFFF) & 0x7F800000
+     else:
+         # DequantScaleRoundingMode.ROUND_DOWN
+         # compute 2 ** floor(log2(dequant_scale))
+         assert DEQUANT_SCALE_ROUNDING_MODE == 1
+         dequant_scale_exponent = dequant_scale.to(tl.uint32, bitcast=True) & 0x7F800000
+     dequant_scale_rounded = dequant_scale_exponent.to(tl.float32, bitcast=True)
+     quant_scale = tl.where(dequant_scale_rounded == 0, 0, 1.0 / dequant_scale_rounded)
+
+     f32_tensor = tl.reshape(f32_tensor, [BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, MXFP_BLOCK_SIZE])
+     quant_tensor = f32_tensor * quant_scale
+
+     # Reshape the tensors after scaling
+     quant_tensor = quant_tensor.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_DIM])
+     # Set the invalid portions of the tensor to 0. This will ensure that any padding tensors are 0 in the mx format.
+     quant_tensor = tl.where(valid_src_mask, quant_tensor, 0)
+     dequant_scale_exponent = dequant_scale_exponent.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE])
+
+     # First, we simply extract the exponent part of the scales and store the result
+     dequant_scale_exponent = (dequant_scale_exponent >> 23).to(tl.uint8)
+     # Now we must convert the tensors to the mx format.
+     if is_fp8:
+         out_tensor = quant_tensor.to(mx_tensor_dtype)
+     else:
+         quant_tensor = quant_tensor.to(tl.uint32, bitcast=True)
+         signs = quant_tensor & 0x80000000
+         exponents = (quant_tensor >> 23) & 0xFF
+         mantissas = (quant_tensor & 0x7FFFFF)
+
+         # 0.25 <= x < 0.75 maps to 0.5, a denormal number
+         E8_BIAS = 127
+         E2_BIAS = 1
+         # Move implicit bit 1 at the beginning to mantissa for denormals
+         # tl.core.sub not available in Triton ROCm; use plain subtraction
+         adjusted_exponents = E8_BIAS - (exponents + 1)
+         mantissas = tl.where(exponents < E8_BIAS, (0x400000 | (mantissas >> 1)) >> adjusted_exponents, mantissas)
+
+         # For normal numbers, we change the bias from 127 to 1, and for subnormals, we keep exponent as 0.
+         exponents = tl.maximum(exponents, E8_BIAS - E2_BIAS) - (E8_BIAS - E2_BIAS)
+
+         # Combine sign, exponent, and mantissa, while saturating
+         # rounding nearest with tie breaking up by adding +1 to one bit right of the LSB, then shift right
+         e2m1_tmp = tl.minimum((((exponents << 2) | (mantissas >> 21)) + 1) >> 1, 0x7)
+         e2m1_value = ((signs >> 28) | e2m1_tmp).to(tl.uint8)
+
+         e2m1_value = tl.reshape(e2m1_value, [BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_DIM // 2, 2])
+         evens, odds = tl.split(e2m1_value)
+         out_tensor = evens | (odds << 4)
+
+     return out_tensor, dequant_scale_exponent
+
+
+ @triton.jit
+ def _downcast_to_mxfp(mx_tensor_ptr, stride_mxt_outer, stride_mxt_quant: tl.constexpr,
+                       mx_scale_ptr, stride_mx_scale_outer, stride_mx_scale_quant,
+                       src_ptr, stride_src_outer, stride_src_quant,
+                       outer_dim, quant_dim,
+                       BLOCK_SIZE_OUT_DIM: tl.constexpr, BLOCK_SIZE_QUANT_DIM: tl.constexpr,
+                       DEQUANT_SCALE_ROUNDING_MODE: tl.constexpr):
+
+     tl.static_assert(stride_mxt_quant == 1, f"Output stride, {stride_mxt_quant=} must be 1.")
+     tl.static_assert(BLOCK_SIZE_QUANT_DIM % MXFP_BLOCK_SIZE == 0, f"{BLOCK_SIZE_QUANT_DIM=} must be a multiple of 32")
+
+     # uint8 signifies two fp4 e2m1 values packed into a single byte
+     mx_tensor_dtype: tl.constexpr = mx_tensor_ptr.dtype.element_ty
+     tl.static_assert(mx_tensor_dtype == tl.uint8 or (mx_tensor_dtype == tl.float8e4nv or mx_tensor_dtype == tl.float8e5),
+                      f"Invalid {mx_tensor_dtype=}. Must be uint8 or float8.")
+
+     src_dtype: tl.constexpr = src_ptr.dtype.element_ty
+     tl.static_assert(mx_scale_ptr.dtype.element_ty == tl.uint8, f"{mx_scale_ptr.dtype.element_ty=} must be uint8")
+     tl.static_assert((src_dtype == tl.bfloat16) or (src_dtype == tl.float16), f"{src_dtype=} must be bfloat16 or float16")
+     is_fp4: tl.constexpr = mx_tensor_dtype == tl.uint8
+
+     outer_block = tl.program_id(0).to(tl.int64)
+     quant_block = tl.program_id(1).to(tl.int64)
+
+     K_DIVISOR: tl.constexpr = 2 if is_fp4 else 1
+     BLOCK_SIZE_QUANT_MX_SCALE: tl.constexpr = BLOCK_SIZE_QUANT_DIM // MXFP_BLOCK_SIZE
+     BLOCK_SIZE_QUANT_MX_TENSOR: tl.constexpr = BLOCK_SIZE_QUANT_DIM // K_DIVISOR
+
+     start_src_quant = quant_block * BLOCK_SIZE_QUANT_DIM
+     start_mx_scale_quant = quant_block * BLOCK_SIZE_QUANT_MX_SCALE
+     start_mx_quant = quant_block * BLOCK_SIZE_QUANT_MX_TENSOR
+     start_out = outer_block * BLOCK_SIZE_OUT_DIM
+
+     src_ptr += start_src_quant * stride_src_quant + start_out * stride_src_outer
+     mx_scale_ptr += start_mx_scale_quant * stride_mx_scale_quant + start_out * stride_mx_scale_outer
+     mx_tensor_ptr += start_mx_quant * stride_mxt_quant + start_out * stride_mxt_outer
+
+     offs_src_quant = tl.arange(0, BLOCK_SIZE_QUANT_DIM)[None, :].to(tl.int64)
+     offs_mxt_quant = tl.arange(0, BLOCK_SIZE_QUANT_MX_TENSOR)[None, :].to(tl.int64)
+     offs_scale_quant = tl.arange(0, BLOCK_SIZE_QUANT_MX_SCALE)[None, :].to(tl.int64)
+     offs_outer = tl.arange(0, BLOCK_SIZE_OUT_DIM)[:, None].to(tl.int64)
+
+     mask_src_quant = start_src_quant + offs_src_quant < quant_dim
+     mask_n = start_out + offs_outer < outer_dim
+     full_mask_src = mask_src_quant & mask_n
+
+     mask_mxt_quant = start_mx_quant + offs_mxt_quant < tl.cdiv(quant_dim, K_DIVISOR)
+     full_mask_mxt = mask_mxt_quant & mask_n
+
+     scale_mask_k = start_mx_scale_quant + offs_scale_quant < tl.cdiv(quant_dim, MXFP_BLOCK_SIZE)
+     full_scale_mask = scale_mask_k & mask_n
+
+     src_tensor_offsets = offs_src_quant * stride_src_quant + offs_outer * stride_src_outer
+     mx_scale_offsets = offs_scale_quant * stride_mx_scale_quant + offs_outer * stride_mx_scale_outer
+     mx_tensor_offsets = offs_mxt_quant * stride_mxt_quant + offs_outer * stride_mxt_outer
+     src_tensor = tl.load(src_ptr + src_tensor_offsets, mask=full_mask_src)
+
+     out_tensor, scale_tensor = _compute_quant_and_scale(src_tensor, full_mask_src, mx_tensor_dtype,
+                                                         DEQUANT_SCALE_ROUNDING_MODE)
+
+     tl.store(mx_scale_ptr + mx_scale_offsets, scale_tensor, mask=full_scale_mask)
+     tl.store(mx_tensor_ptr + mx_tensor_offsets, out_tensor, mask=full_mask_mxt)
+
+
+ @triton.jit(repr=lambda _: "_dequantize_mxfp8")
+ def _dequantize_mxfp8_fn(input, mask, pid=None):
+     return _compute_quant_and_scale(input, mask, tl.float8e4nv)
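
The ROUND_UP branch computes 2 ** ceil(log2(dequant_scale)) purely with integer bit tricks on the fp32 representation: adding 0x007FFFFF carries into the exponent unless the mantissa is already zero, and the mantissa is then masked off. A small host-side sketch of the same rounding (torch is used only for the bit reinterpretation):

import math
import torch

def round_up_pow2_bits(x: torch.Tensor) -> torch.Tensor:
    # Same trick as the kernel: add 0x007FFFFF, then keep only the exponent bits.
    bits = x.to(torch.float32).view(torch.int32)
    rounded = (bits + 0x007FFFFF) & 0x7F800000
    return rounded.view(torch.float32)

x = torch.tensor([0.75, 1.0, 1.1, 3.0, 4.0])
print(round_up_pow2_bits(x))                                  # tensor([1., 1., 2., 4., 4.])
print([2.0 ** math.ceil(math.log2(v)) for v in x.tolist()])   # [1.0, 1.0, 2.0, 4.0, 4.0]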
numerics_details/mxfp_details/_upcast_from_mxfp.py ADDED
@@ -0,0 +1,126 @@
+ import triton
+ import triton.language as tl
+ from ._downcast_to_mxfp import MXFP_BLOCK_SIZE
+
+
+ # fmt: off
+ @triton.jit
+ def _upcast_from_mxfp(out_ptr, stride_o_outer, stride_o_quant: tl.constexpr, mx_scale_ptr, stride_scale_outer,
+                       stride_scale_quant, mx_tensor_ptr, stride_tensor_outer, stride_tensor_quant: tl.constexpr,
+                       outer_dim, quant_dim, BLOCK_SIZE_OUT_DIM: tl.constexpr, BLOCK_SIZE_QUANT_DIM: tl.constexpr):
+
+     tl.static_assert(stride_o_quant == 1, "the weight must be contiguous in the k dimension for mx")
+     tl.static_assert(BLOCK_SIZE_QUANT_DIM % MXFP_BLOCK_SIZE == 0, "BLOCK_SIZE_K must be a multiple of 32")
+     # uint8 signifies two fp4 e2m1 values packed into a single byte
+     mx_tensor_dtype: tl.constexpr = mx_tensor_ptr.dtype.element_ty
+     dst_dtype: tl.constexpr = out_ptr.dtype.element_ty
+     tl.static_assert(dst_dtype == tl.float16 or dst_dtype == tl.bfloat16)
+     tl.static_assert(
+         mx_tensor_dtype == tl.uint8
+         or ((mx_tensor_dtype == tl.float8e4nv or mx_tensor_dtype == tl.float8e5) or mx_tensor_dtype == dst_dtype),
+         "mx_tensor_ptr must be uint8 or float8 or dst_dtype")
+     tl.static_assert(mx_scale_ptr.dtype.element_ty == tl.uint8, "mx_scale_ptr must be uint8")
+
+     # Determine if we are dealing with fp8 types.
+     is_fp4: tl.constexpr = mx_tensor_dtype == tl.uint8
+     is_fp8: tl.constexpr = mx_tensor_dtype == tl.float8e4nv or mx_tensor_dtype == tl.float8e5
+     K_DIVISOR: tl.constexpr = 2 if is_fp4 else 1
+     BLOCK_SIZE_QUANT_MX_SCALE: tl.constexpr = BLOCK_SIZE_QUANT_DIM // MXFP_BLOCK_SIZE
+     BLOCK_SIZE_QUANT_MX_TENSOR: tl.constexpr = BLOCK_SIZE_QUANT_DIM // K_DIVISOR
+
+     # Compute starting indices for the quantized (packed) dimension and the outer dimension.
+     outer_block = tl.program_id(0).to(tl.int64)
+     quant_block = tl.program_id(1).to(tl.int64)
+
+     start_mxt_quant = quant_block * BLOCK_SIZE_QUANT_MX_TENSOR
+     start_out_quant = quant_block * BLOCK_SIZE_QUANT_DIM
+     start_mx_scale_quant = quant_block * BLOCK_SIZE_QUANT_MX_SCALE
+     start_out = outer_block * BLOCK_SIZE_OUT_DIM
+
+     mx_tensor_ptr += start_mxt_quant * stride_tensor_quant + start_out * stride_tensor_outer
+     mx_scale_ptr += start_mx_scale_quant * stride_scale_quant + start_out * stride_scale_outer
+     out_ptr += start_out * stride_o_outer + start_out_quant * stride_o_quant
+
+     # Compute offsets and masks.
+     offs_src_quant = tl.arange(0, BLOCK_SIZE_QUANT_MX_TENSOR)[None, :].to(tl.int64)
+     offs_out_quant = tl.arange(0, BLOCK_SIZE_QUANT_DIM)[None, :].to(tl.int64)
+     offs_outer = tl.arange(0, BLOCK_SIZE_OUT_DIM)[:, None].to(tl.int64)
+     offs_scale = tl.arange(0, BLOCK_SIZE_QUANT_MX_SCALE)[None, :].to(tl.int64)
+
+     mask_outer = start_out + offs_outer < outer_dim
+     mask_out_quant = start_out_quant + offs_out_quant < quant_dim
+     full_mask_out = mask_out_quant & mask_outer
+
+     mask_src_quant = start_mxt_quant + offs_src_quant < tl.cdiv(quant_dim, K_DIVISOR)
+     full_mask_src = mask_src_quant & mask_outer
+
+     mask_scale = start_mx_scale_quant + offs_scale < tl.cdiv(quant_dim, MXFP_BLOCK_SIZE)
+     full_scale_mask = mask_scale & mask_outer
+
+     tensor_offsets = offs_src_quant * stride_tensor_quant + offs_outer * stride_tensor_outer
+     scale_offsets = offs_scale * stride_scale_quant + offs_outer * stride_scale_outer
+     out_offsets = offs_out_quant * stride_o_quant + offs_outer * stride_o_outer
+
+     # Load the packed tensor and scale.
+     tensor = tl.load(mx_tensor_ptr + tensor_offsets, mask=full_mask_src)
+     scale = tl.load(mx_scale_ptr + scale_offsets, mask=full_scale_mask)
+
+     # Upcast the scale to the destination type.
+     if dst_dtype == tl.bfloat16:
+         dst_scale = (scale.to(tl.uint16) << 7).to(dst_dtype, bitcast=True)
+     else:
+         tl.static_assert(dst_dtype == tl.float16)
+         dst_scale = (scale.to(tl.uint32) << 23).to(tl.float32, bitcast=True)
+         dst_scale = dst_scale.to(tl.float16)
+
+     # Now upcast the tensor.
+     if is_fp8:
+         dst_tensor = tensor.to(dst_dtype)
+         if mx_tensor_dtype == tl.float8e5:
+             from_e_bits: tl.constexpr = 5
+             from_m_bits: tl.constexpr = 2
+             to_e_bits: tl.constexpr = 8 if dst_dtype == tl.bfloat16 else 5
+             to_m_bits: tl.constexpr = 7 if dst_dtype == tl.bfloat16 else 10
+
+             # Preserve infs and nans. FIXME Fp8E5M2_to_Bf16 doesn't preserve them!
+             non_finite_mask_src: tl.constexpr = ((1 << from_e_bits) - 1) << from_m_bits
+             non_finite_mask_dst: tl.constexpr = ((1 << to_e_bits) - 1) << to_m_bits
+             dst_tensor = tl.where(
+                 (tensor.to(tl.uint8, bitcast=True) & non_finite_mask_src) == non_finite_mask_src,
+                 (dst_tensor.to(tl.uint16, bitcast=True) | non_finite_mask_dst).to(dst_dtype, bitcast=True),
+                 dst_tensor,
+             )
+     else:
+         tl.static_assert(is_fp4)
+         dst_bias: tl.constexpr = 127 if dst_dtype == tl.bfloat16 else 15
+         dst_0p5: tl.constexpr = 16128 if dst_dtype == tl.bfloat16 else 0x3800
+         dst_m_bits: tl.constexpr = 7 if dst_dtype == tl.bfloat16 else 10
+         # e2m1
+         em0 = tensor & 0x07
+         em1 = tensor & 0x70
+         x0 = (em0.to(tl.uint16) << (dst_m_bits - 1)) | ((tensor & 0x08).to(tl.uint16) << 12)
+         x1 = (em1.to(tl.uint16) << (dst_m_bits - 5)) | ((tensor & 0x80).to(tl.uint16) << 8)
+         # Three cases:
+         # 1) x is normal and non-zero: Correct bias
+         x0 = tl.where((em0 & 0x06) != 0, x0 + ((dst_bias - 1) << dst_m_bits), x0)
+         x1 = tl.where((em1 & 0x60) != 0, x1 + ((dst_bias - 1) << dst_m_bits), x1)
+         # 2) x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in the dst type
+         x0 = tl.where(em0 == 0x01, dst_0p5 | (x0 & 0x8000), x0)
+         x1 = tl.where(em1 == 0x10, dst_0p5 | (x1 & 0x8000), x1)
+         # 3) x is zero, do nothing
+         # Interleave x0,x1: use tl.where (ROCm tl.cat only supports 1D)
+         idx_k = tl.arange(0, BLOCK_SIZE_QUANT_DIM)
+         is_even = (idx_k % 2) == 0
+         val_x0 = x0[:, idx_k // 2]
+         val_x1 = x1[:, idx_k // 2]
+         dst_tensor = tl.where(is_even[None, :], val_x0, val_x1).to(dst_dtype, bitcast=True)
+
+     # Group dst_tensor into 32-wide MX blocks so it lines up with the per-block scales.
+     dst_tensor = dst_tensor.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, MXFP_BLOCK_SIZE])
+     dst_scale = dst_scale.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_MX_SCALE, 1])
+     scale = scale.reshape(dst_scale.shape)
+
+     out_tensor = dst_tensor * dst_scale
+     # Correct any NaNs encoded via the scale.
+     out_tensor = tl.where(scale == 0xFF, float("nan"), out_tensor)
+     out_tensor = out_tensor.reshape([BLOCK_SIZE_OUT_DIM, BLOCK_SIZE_QUANT_DIM])
+     tl.store(out_ptr + out_offsets, out_tensor, mask=full_mask_out)
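
For cross-checking the bit-level E2M1 decode above, a table-based reference is handy: E2M1 can only represent the magnitudes {0, 0.5, 1, 1.5, 2, 3, 4, 6} with a sign bit, and each scale byte is a plain E8M0 exponent, i.e. 2 ** (byte - 127). A minimal PyTorch sketch that ignores the 0xFF NaN-scale encoding:

import torch

# The 8 non-negative E2M1 magnitudes, indexed by the 3 low bits (exp << 1 | mantissa).
E2M1_VALUES = torch.tensor([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def upcast_mxfp4_reference(mx_tensor: torch.Tensor, mx_scale: torch.Tensor) -> torch.Tensor:
    """Table-based reference: two e2m1 nibbles per byte, one power-of-two scale per 32 values."""
    lo, hi = (mx_tensor & 0x0F).long(), (mx_tensor >> 4).long()

    def decode(nibble):
        sign = torch.where((nibble & 0x8) != 0, -1.0, 1.0)
        return sign * E2M1_VALUES.to(mx_tensor.device)[nibble & 0x7]

    M, K_half = mx_tensor.shape
    vals = torch.stack([decode(lo), decode(hi)], dim=-1).reshape(M, K_half * 2)
    scale = torch.pow(2.0, mx_scale.float() - 127.0)   # E8M0 scale: 2 ** (byte - 127)
    return vals * scale.repeat_interleave(32, dim=1)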
numerics_details/mxfp_details/upcast_mxfp4.py ADDED
@@ -0,0 +1,88 @@
+ """
+ Reusable MXFP4 upcast: MXFP4 (uint8 mx_tensor + uint8 mx_scale) -> fp16/bf16.
+ Uses Triton kernel when available; falls back to PyTorch on ROCm (tl.cat limitation).
+ """
+
+ import torch
+
+ from ._upcast_from_mxfp import _upcast_from_mxfp
+
+ try:
+     from triton.compiler.errors import CompilationError
+ except ImportError:
+     CompilationError = Exception
+
+ MXFP_BLOCK_SIZE_PY = 32
+
+
+ def _upcast_mxfp4_to_fp16_pytorch(
+     mx_tensor: torch.Tensor, mx_scale: torch.Tensor, dtype: torch.dtype = torch.float16
+ ) -> torch.Tensor:
+     """PyTorch fallback (used when Triton kernel fails on ROCm)."""
+     M, K_half = mx_tensor.shape
+     K = K_half * 2
+     dst_bias = 15
+     dst_0p5 = 0x3800
+     dst_m_bits = 10
+
+     tensor = mx_tensor.to(torch.int32)
+     em0 = tensor & 0x07
+     em1 = tensor & 0x70
+     x0 = (em0 << (dst_m_bits - 1)) | ((tensor & 0x08) << 12)
+     x1 = (em1 << (dst_m_bits - 5)) | ((tensor & 0x80) << 8)
+
+     x0 = torch.where((em0 & 0x06) != 0, x0 + ((dst_bias - 1) << dst_m_bits), x0)
+     x1 = torch.where((em1 & 0x60) != 0, x1 + ((dst_bias - 1) << dst_m_bits), x1)
+     x0 = torch.where(em0 == 0x01, torch.full_like(x0, dst_0p5) | (x0 & 0x8000), x0)
+     x1 = torch.where(em1 == 0x10, torch.full_like(x1, dst_0p5) | (x1 & 0x8000), x1)
+
+     out_u16 = torch.empty((M, K), device=mx_tensor.device, dtype=torch.uint16)
+     out_u16[:, 0::2] = (x0 & 0xFFFF).to(torch.uint16)
+     out_u16[:, 1::2] = (x1 & 0xFFFF).to(torch.uint16)
+     dst_tensor = out_u16.view(dtype)
+
+     scale_u32 = mx_scale.to(torch.int32) << 23
+     dst_scale = scale_u32.view(torch.float32).to(dtype)
+     dst_scale = dst_scale.unsqueeze(-1).repeat(1, 1, 32).reshape(M, K)
+
+     out_tensor = dst_tensor * dst_scale
+     out_tensor = torch.where(
+         mx_scale.unsqueeze(-1).expand(-1, -1, 32).reshape(M, K) == 0xFF,
+         float("nan"),
+         out_tensor,
+     )
+     return out_tensor
+
+
+ def upcast_mxfp4_to_fp16(
+     mx_tensor: torch.Tensor,
+     mx_scale: torch.Tensor,
+     block_m: int = 128,
+     block_k: int = 64,
+     dtype: torch.dtype = torch.float16,
+     verbose: bool = False,
+ ) -> torch.Tensor:
+     """Convert MXFP4 [M,K/2]+[M,K/32] -> fp16/bf16 [M,K]. Falls back to PyTorch if Triton fails."""
+     assert mx_tensor.dim() == 2 and mx_tensor.dtype == torch.uint8
+     assert mx_scale.dim() == 2 and mx_scale.dtype == torch.uint8
+     M = mx_tensor.shape[0]
+     K = mx_tensor.shape[1] * 2
+     assert mx_scale.shape == (M, K // 32)
+     assert block_k % MXFP_BLOCK_SIZE_PY == 0
+
+     try:
+         out = torch.empty((M, K), device=mx_tensor.device, dtype=dtype)
+         grid = ((M + block_m - 1) // block_m, (K + block_k - 1) // block_k)
+         _upcast_from_mxfp[grid](
+             out, out.stride(0), 1,
+             mx_scale, mx_scale.stride(0), mx_scale.stride(1),
+             mx_tensor, mx_tensor.stride(0), 1,
+             M, K,
+             BLOCK_SIZE_OUT_DIM=block_m,
+             BLOCK_SIZE_QUANT_DIM=block_k,
+         )
+         return out
+     except CompilationError:
+         if verbose:
+             print("Triton upcast failed (e.g. ROCm), using PyTorch fallback.")
+         return _upcast_mxfp4_to_fp16_pytorch(mx_tensor, mx_scale, dtype)
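
A minimal call sketch for the wrapper above, assuming the package is importable from the repo root (as in test_mxfp.py) and using an arbitrary packed payload with a unit scale:

import torch
from numerics_details import upcast_mxfp4_to_fp16

M, K = 128, 256                                    # K must be a multiple of 32
mx_tensor = torch.randint(0, 256, (M, K // 2), device="cuda", dtype=torch.uint8)
mx_scale = torch.full((M, K // 32), 127, device="cuda", dtype=torch.uint8)   # scale byte 127 -> 2**0

x = upcast_mxfp4_to_fp16(mx_tensor, mx_scale, dtype=torch.float16, verbose=True)
print(x.shape, x.dtype)                            # torch.Size([128, 256]) torch.float16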
submission.py ADDED
@@ -0,0 +1,37 @@
+ import torch
+ import triton
+ import triton.language as tl
+
+ # 1. HARDWARE DIAGNOSTICS
+ def check_environment():
+     print("--- Environment Check ---")
+     cuda_avail = torch.cuda.is_available()
+     print(f"Is CUDA/ROCm available? {cuda_avail}")
+
+     if cuda_avail:
+         device_name = torch.cuda.get_device_name(0)
+         print(f"GPU Detected: {device_name}")
+
+         # Check for Blackwell (SM 10.0) or MI300X (gfx942)
+         prop = torch.cuda.get_device_properties(0)
+         if hasattr(prop, 'major'):
+             print(f"Compute Capability: {prop.major}.{prop.minor}")
+     else:
+         print("No NVIDIA/AMD GPU detected. Triton kernels will not run on this hardware.")
+     print("-------------------------\n")
+
+ # Call diagnostic immediately on import
+ check_environment()
+
+ # 2. PLACEHOLDER KERNEL (Logic from previous steps)
+ @triton.jit
+ def dual_gemm_kernel(a_ptr, b1_ptr, b2_ptr, c_ptr, M, N, K, **meta):
+     # Kernel code here...
+     pass
+
+ # 3. HARNESS INTERFACE
+ def dual_gemm_submission(data):
+     # This is what the leaderboard/benchmark calls
+     a, b1, b2, sfa, sfb1, sfb2, c = data
+     # ... launch logic ...
+     return c
test_mxfp.py ADDED
@@ -0,0 +1,87 @@
+ #!/usr/bin/env python3
+ """
+ Minimal test for MXFP _downcast_to_mxfp on MI300X (ROCm).
+ Tests fp16 -> uint8 (fp4 packed) path; float8 path may not work on ROCm yet.
+ Run on remote: cd /root/kernels && python test_mxfp.py
+ """
+
+ import sys
+ import os
+
+ # Allow imports from /root/kernels
+ _script_dir = os.path.dirname(os.path.abspath(__file__))
+ if _script_dir not in sys.path:
+     sys.path.insert(0, _script_dir)
+
+ def main():
+     print("=== MXFP Import Test ===")
+     try:
+         from numerics_details.mxfp_details._downcast_to_mxfp import (
+             _downcast_to_mxfp,
+             _compute_quant_and_scale,
+             MXFP_BLOCK_SIZE,
+         )
+         print(" Import OK: _downcast_to_mxfp, _compute_quant_and_scale, MXFP_BLOCK_SIZE")
+     except Exception as e:
+         print(f" Import FAILED: {e}")
+         return 1
+
+     print("\n=== Triton + CUDA/ROCm Check ===")
+     import torch
+     if not torch.cuda.is_available():
+         print(" No GPU available. Skipping kernel test.")
+         return 0
+     print(f" Device: {torch.cuda.get_device_name(0)}")
+
+     import triton
+     import triton.language as tl
+
+     print("\n=== MXFP Downcast Test (fp16 -> fp4 uint8) ===")
+     # Use fp4 path (uint8 output) - avoids float8 dtypes which may lack ROCm support
+     BLOCK_SIZE_OUT_DIM = 64
+     BLOCK_SIZE_QUANT_DIM = 64  # must be multiple of 32
+     outer_dim = 128
+     quant_dim = 128
+     DEQUANT_SCALE_ROUNDING_MODE = 0
+
+     device = "cuda"
+     src = torch.randn(outer_dim, quant_dim, device=device, dtype=torch.float16) * 0.1
+
+     # Output shapes for fp4 (uint8): mx_tensor [outer, quant//2], mx_scale [outer, quant//32]
+     mx_tensor = torch.empty(outer_dim, quant_dim // 2, device=device, dtype=torch.uint8)
+     mx_scale = torch.empty(outer_dim, quant_dim // 32, device=device, dtype=torch.uint8)
+
+     num_outer_blocks = (outer_dim + BLOCK_SIZE_OUT_DIM - 1) // BLOCK_SIZE_OUT_DIM
+     num_quant_blocks = (quant_dim + BLOCK_SIZE_QUANT_DIM - 1) // BLOCK_SIZE_QUANT_DIM
+     grid = (num_outer_blocks, num_quant_blocks)
+
+     try:
+         _downcast_to_mxfp[grid](
+             mx_tensor,
+             mx_tensor.stride(0), 1,
+             mx_scale,
+             mx_scale.stride(0), mx_scale.stride(1),
+             src,
+             src.stride(0), src.stride(1),
+             outer_dim, quant_dim,
+             BLOCK_SIZE_OUT_DIM=BLOCK_SIZE_OUT_DIM,
+             BLOCK_SIZE_QUANT_DIM=BLOCK_SIZE_QUANT_DIM,
+             DEQUANT_SCALE_ROUNDING_MODE=DEQUANT_SCALE_ROUNDING_MODE,
+         )
+         torch.cuda.synchronize()
+         print(" Kernel launch OK")
+         print(f" mx_tensor shape: {mx_tensor.shape}, dtype: {mx_tensor.dtype}")
+         print(f" mx_scale shape: {mx_scale.shape}")
+         print(f" mx_tensor sample (first row): {mx_tensor[0, :8].tolist()}")
+     except Exception as e:
+         print(f" Kernel FAILED: {e}")
+         import traceback
+         traceback.print_exc()
+         return 1
+
+     print("\n=== Done ===")
+     return 0
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
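
A natural follow-up to the launch check is a round-trip comparison: upcast the freshly quantized tensors back to fp16 and compare against src with the tolerance helper from testing.py. A sketch of the extra lines to run after the kernel launch in main(); MXFP4 is coarse, so the tolerances here are deliberately loose and purely illustrative:

from numerics_details import upcast_mxfp4_to_fp16
from testing import assert_close

recon = upcast_mxfp4_to_fp16(mx_tensor, mx_scale, dtype=torch.float16)
# fp4 with a shared power-of-two scale per 32 values quantizes very coarsely.
assert_close(src, recon, maxtol=0.5, rmstol=0.25, description="mxfp4 round-trip")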
testing.py ADDED
@@ -0,0 +1,206 @@
+ """
+ Testing utilities matching triton-kernels/build/torch-cuda/testing.py.
+ https://huggingface.co/kernels-community/triton-kernels/blob/main/build/torch-cuda/testing.py
+ """
+
+ import enum
+ import functools
+ import os
+ import subprocess
+ import sys
+
+ import torch
+
+ # Numerics constants - use triton_kernels.numerics when available
+ try:
+     from .numerics import MAX_FINITE_FLOAT8E4B8, MAX_FINITE_FLOAT8E4NV, MAX_FINITE_FLOAT8E5
+ except ImportError:
+     # Standalone fallback: standard float8 max finite values
+     MAX_FINITE_FLOAT8E5 = 57344.0    # float8 e5m2
+     MAX_FINITE_FLOAT8E4NV = 448.0    # float8 e4m3fn
+     MAX_FINITE_FLOAT8E4B8 = 448.0    # float8 e4m3fnuz
+
+
+ def assert_equal(ref, tri):
+     if isinstance(ref, torch.Tensor):
+         assert torch.all(ref == tri)
+     else:
+         assert ref == tri
+
+
+ def assert_close(ref, tri, maxtol=None, rmstol=None, description="--", verbose=True):
+     """
+     Compare reference values against obtained values.
+     """
+     if tri.dtype.itemsize == 1:
+         ref_as_type = ref.to(tri.dtype)
+         if ref.dtype == tri.dtype:
+             assert torch.all(ref_as_type == tri)
+             return
+         ref = ref_as_type
+
+     if maxtol is None:
+         maxtol = 2e-2
+     if rmstol is None:
+         rmstol = 4e-3
+
+     # cast to float32:
+     ref = ref.to(torch.float32).detach()
+     tri = tri.to(torch.float32).detach()
+     assert ref.shape == tri.shape, f"Tensors must have same size {ref.shape=} {tri.shape=}"
+
+     # deal with infinite elements:
+     inf_mask_ref = torch.isinf(ref)
+     inf_mask_tri = torch.isinf(tri)
+     assert torch.equal(inf_mask_ref, inf_mask_tri), "Tensor must have same infinite elements"
+     refn = torch.where(inf_mask_ref, 0, ref)
+     trin = torch.where(inf_mask_tri, 0, tri)
+
+     # normalise so that RMS calculation doesn't overflow:
+     eps = 1.0e-30
+     multiplier = 1.0 / (torch.max(torch.abs(refn)) + eps)
+     refn *= multiplier
+     trin *= multiplier
+
+     ref_rms = torch.sqrt(torch.square(refn).mean()) + eps
+
+     rel_err = torch.abs(refn - trin) / torch.maximum(ref_rms, torch.abs(refn))
+     max_err = torch.max(rel_err).item()
+     rms_err = torch.sqrt(torch.square(rel_err).mean()).item()
+
+     if verbose:
+         print("%s maximum relative error = %s (threshold = %s)" % (description, max_err, maxtol))
+         print("%s RMS relative error = %s (threshold = %s)" % (description, rms_err, rmstol))
+
+     if max_err > maxtol:
+         bad_idxs = torch.nonzero(rel_err > maxtol)
+         num_nonzero = bad_idxs.size(0)
+         bad_idxs = bad_idxs[:1000]
+         print("%d / %d mismatched elements (shape = %s) at coords %s" %
+               (num_nonzero, rel_err.numel(), tuple(rel_err.shape), bad_idxs.tolist()))
+
+         bad_idxs = bad_idxs.unbind(-1)
+         print("ref values: ", ref[tuple(bad_idxs)].cpu())
+         print("tri values: ", tri[tuple(bad_idxs)].cpu())
+
+     assert max_err <= maxtol
+     assert rms_err <= rmstol
+
+
+ class ComputeSanitizerTool(enum.Enum):
+     MEMCHECK = "memcheck"
+     RACECHECK = "racecheck"
+     SYNCCHECK = "synccheck"
+     INITCHECK = "initcheck"
+
+
+ def compute_sanitizer(**target_kwargs):
+     """
+     Decorator to run a test with compute sanitizer enabled and pytorch caching allocator disabled,
+     to expose potential memory access errors.
+     This decorator requires the `request` fixture to be present.
+     If `run_sanitizer` argument is present and set to False, the sanitizer is not run.
+     Running tests under compute sanitizer requires launching subprocess and is slow,
+     so use sparingly
+     """
+
+     def decorator(test_fn):
+
+         @functools.wraps(test_fn)
+         def wrapper(*args, **kwargs):
+             if os.environ.get("SKIP_COMPUTE_SANITIZER") == "1":
+                 test_fn(*args, **kwargs)
+                 return
+
+             import psutil
+
+             if target_kwargs.pop("clear_torch_cache", False):
+                 # If we don't pop clear_torch_cache, it won't pass
+                 # target_kwargs.items() <= kwargs.items() condition below.
+                 torch.cuda.empty_cache()
+             tools_to_check = target_kwargs.pop("tools_to_check", [ComputeSanitizerTool.MEMCHECK])
+             assert isinstance(tools_to_check, list), f"{tools_to_check=}"
+             assert all(tool in ComputeSanitizerTool for tool in tools_to_check), (
+                 f"{(tool for tool in tools_to_check if tool not in ComputeSanitizerTool)=}")
+
+             ppid_name = psutil.Process(os.getppid()).exe()
+             run_compute_sanitizer = target_kwargs.items() <= kwargs.items()
+             if "run_sanitizer" in kwargs:
+                 run_compute_sanitizer &= kwargs["run_sanitizer"]
+             if run_compute_sanitizer and "compute-sanitizer" not in ppid_name:
+                 for tool in tools_to_check:
+                     path = os.path.realpath(test_fn.__globals__["__file__"])
+                     # get path of current file
+                     env = {
+                         "PATH": os.environ["PATH"],
+                         "PYTORCH_NO_CUDA_MEMORY_CACHING": "1",
+                         "TORCH_SHOW_CPP_STACKTRACES": "1",
+                         "CUDA_LAUNCH_BLOCKING": "1",
+                     }
+                     if "CUDA_VISIBLE_DEVICES" in os.environ:
+                         env["CUDA_VISIBLE_DEVICES"] = os.environ["CUDA_VISIBLE_DEVICES"]
+                     assert "request_fixture" in kwargs, (
+                         "memcheck'ed test must have a (possibly unused) `request` fixture")
+                     test_id = kwargs["request_fixture"].node.callspec.id
+                     cmd = f"{path}::{test_fn.__name__}[{test_id}]"
+                     cmd = [
+                         "compute-sanitizer",
+                         "--target-processes=application-only",
+                         "--destroy-on-device-error=context",
+                         f"--tool={tool.value}",
+                         sys.executable,
+                         "-m",
+                         "pytest",
+                         "-vsx",
+                         cmd,
+                     ]
+                     for opt in ["--update_checksum", "--ignore_checksum_error"]:
+                         if opt in sys.argv:
+                             cmd.append(opt)
+                     out = subprocess.run(
+                         cmd,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.STDOUT,
+                         env=env,
+                     )
+                     sanitizer_ok = "ERROR SUMMARY: 0 errors" in str(
+                         out.stdout) or "RACECHECK SUMMARY: 0 hazards displayed" in str(out.stdout)
+                     test_output = out.stdout
+                     if type(test_output) is bytes:
+                         test_output = test_output.decode()
+
+                     fail = False
+                     if not sanitizer_ok:
+                         print("compute-sanitizer returned an error")
+                         fail = True
+                     elif out.returncode != 0:
+                         print(
+                             "The test failed due to some other reason: consider running without compute-sanitizer to verify."
+                         )
+                         print(f"{out.returncode=}")
+                         fail = True
+
+                     if fail:
+                         print("*****************************************************")
+                         print("******************** TEST OUTPUT ********************")
+                         print("*****************************************************")
+                         print(test_output)
+                         print("*****************************************************")
+                         print("****************** TEST OUTPUT END ******************")
+                         print("*****************************************************")
+                         assert None
+             else:
+                 test_fn(*args, **kwargs)
+
+         return wrapper
+
+     return decorator
+
+
+ def compute_actual_scale(x, dtype):
+     max_finite = {
+         torch.float8_e5m2: MAX_FINITE_FLOAT8E5,
+         torch.float8_e4m3fn: MAX_FINITE_FLOAT8E4NV,
+         torch.float8_e4m3fnuz: MAX_FINITE_FLOAT8E4B8,
+     }[dtype]
+     return x.abs().max() / max_finite
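
A short usage sketch for the two helpers most relevant here: compute_actual_scale picks a per-tensor scale so the data fits the finite range of the target float8 format, and assert_close compares a dequantized result against the reference with max / RMS relative-error thresholds. The tolerances below are illustrative, not prescribed values:

import torch
from testing import assert_close, compute_actual_scale

x = torch.randn(256, 256, device="cuda", dtype=torch.float32)

# Scale x so its largest magnitude maps to the largest finite float8 e4m3 value.
scale = compute_actual_scale(x, torch.float8_e4m3fn)
x_fp8 = (x / scale).to(torch.float8_e4m3fn)

# Dequantize and compare against the original with loose fp8 tolerances.
x_deq = x_fp8.to(torch.float32) * scale
assert_close(x, x_deq, maxtol=1e-1, rmstol=5e-2, description="fp8 round-trip")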