nologik committed on May 27

Commit

7347d97

verified ·

1 Parent(s): 77310b5

Uploaded using `kernel-builder`.

Browse files

Files changed (20) hide show

build/torch210-cxx11-cu130-aarch64-linux/__init__.py +132 -0
build/torch210-cxx11-cu130-aarch64-linux/_nvfp4_moe_cuda_61b571c.abi3.so +3 -0
build/torch210-cxx11-cu130-aarch64-linux/_ops.py +9 -0
build/torch210-cxx11-cu130-aarch64-linux/metadata.json +10 -0
build/torch210-cxx11-cu130-aarch64-linux/nvfp4_moe/__init__.py +26 -0
build/torch211-cxx11-cu130-aarch64-linux/__init__.py +132 -0
build/torch211-cxx11-cu130-aarch64-linux/_nvfp4_moe_cuda_61b571c.abi3.so +3 -0
build/torch211-cxx11-cu130-aarch64-linux/_ops.py +9 -0
build/torch211-cxx11-cu130-aarch64-linux/metadata.json +10 -0
build/torch211-cxx11-cu130-aarch64-linux/nvfp4_moe/__init__.py +26 -0
build/torch212-cxx11-cu130-aarch64-linux/__init__.py +132 -0
build/torch212-cxx11-cu130-aarch64-linux/_nvfp4_moe_cuda_61b571c.abi3.so +3 -0
build/torch212-cxx11-cu130-aarch64-linux/_ops.py +9 -0
build/torch212-cxx11-cu130-aarch64-linux/metadata.json +10 -0
build/torch212-cxx11-cu130-aarch64-linux/nvfp4_moe/__init__.py +26 -0
build/torch212-cxx11-cu132-aarch64-linux/__init__.py +132 -0
build/torch212-cxx11-cu132-aarch64-linux/_nvfp4_moe_cuda_61b571c.abi3.so +3 -0
build/torch212-cxx11-cu132-aarch64-linux/_ops.py +9 -0
build/torch212-cxx11-cu132-aarch64-linux/metadata.json +10 -0
build/torch212-cxx11-cu132-aarch64-linux/nvfp4_moe/__init__.py +26 -0

build/torch210-cxx11-cu130-aarch64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,132 @@

+# SPDX-License-Identifier: AGPL-3.0-only
+"""Atlas NVFP4 MoE kernels for Qwen3.6-35B-A3B sparse on GB10 (SM121).
+Centerpiece is ``moe_w4a16_fused_gate_up_t_k64`` — the fused gate/up
+NVFP4 grouped GEMM that amortizes one activation (``A``) read across both
+projections. The K=64 tile suffix targets Qwen3.6's
+hidden_dim=2048 / moe_intermediate=512 layout.
+All ops use Atlas's software E2M1 conversion (SM121 lacks the
+``cvt.rn.satfinite.e2m1x2.f32`` PTX instruction).
+"""
+import torch
+from ._ops import ops
+__all__ = [
+    "quantize_bf16_to_nvfp4",
+    "moe_gate_topk_fused",
+    "moe_topk_softmax",
+    "moe_topk_sigmoid",
+    "moe_permute_tokens",
+    "moe_silu_mul",
+    "moe_w4a16_ptrtable_t_k64",
+    "moe_w4a16_fused_gate_up_t_k64",
+]
+def quantize_bf16_to_nvfp4(
+    input: torch.Tensor,
+    out_packed: torch.Tensor,
+    out_scales: torch.Tensor,
+    per_tensor_scale: float,
+) -> None:
+    # Thin wrapper: forwards all four arguments, unchanged and in order, to
+    # the compiled ``quantize_bf16_to_nvfp4`` op and returns nothing.
+    # No shape, dtype or device validation happens at this level.
+    # NOTE(review): presumably ``out_packed`` / ``out_scales`` are
+    # caller-allocated buffers that the kernel fills in place -- confirm
+    # against the kernel source.
+    ops.quantize_bf16_to_nvfp4(input, out_packed, out_scales, per_tensor_scale)
+def moe_gate_topk_fused(
+    activation: torch.Tensor,
+    gate_weight_packed: torch.Tensor,
+    gate_weight_scales: torch.Tensor,
+    scale2: float,
+    expert_indices: torch.Tensor,
+    expert_weights: torch.Tensor,
+    top_k: int,
+    normalize: bool = True,
+) -> None:
+    # Thin wrapper around the compiled ``moe_gate_topk_fused`` op; every
+    # argument is passed positionally in declaration order and nothing is
+    # returned.
+    # ``normalize`` is converted with ``int()`` so the op receives 0 or 1
+    # instead of a Python bool.
+    # NOTE(review): presumably ``expert_indices`` / ``expert_weights`` are
+    # preallocated outputs written by the kernel -- confirm against the
+    # kernel source.
+    ops.moe_gate_topk_fused(
+        activation, gate_weight_packed, gate_weight_scales, scale2,
+        expert_indices, expert_weights, top_k, int(normalize))
+def moe_topk_softmax(
+    gate_logits: torch.Tensor,
+    expert_indices: torch.Tensor,
+    expert_weights: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    normalize: bool = True,
+) -> None:
+    # Thin wrapper around the compiled ``moe_topk_softmax`` op; arguments are
+    # forwarded positionally in declaration order and nothing is returned.
+    # ``normalize`` is converted with ``int()`` so the op receives 0 or 1.
+    # NOTE(review): presumably ``expert_indices`` / ``expert_weights`` are
+    # preallocated outputs written by the kernel -- confirm against the
+    # kernel source.
+    ops.moe_topk_softmax(gate_logits, expert_indices, expert_weights,
+                         num_experts, top_k, int(normalize))
+def moe_topk_sigmoid(
+    gate_logits: torch.Tensor,
+    bias: torch.Tensor,
+    expert_indices: torch.Tensor,
+    expert_weights: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    normalize: bool = True,
+    scaling_factor: float = 1.0,
+) -> None:
+    """Sigmoid-scored top-k. Bias is added before scoring (Nemotron-H style)."""
+    # Thin wrapper around the compiled ``moe_topk_sigmoid`` op; arguments are
+    # forwarded positionally in declaration order and nothing is returned.
+    # ``normalize`` is converted with ``int()`` so the op receives 0 or 1;
+    # ``scaling_factor`` is passed through unchanged (default 1.0).
+    # NOTE(review): presumably ``expert_indices`` / ``expert_weights`` are
+    # preallocated outputs written by the kernel -- confirm against the
+    # kernel source.
+    ops.moe_topk_sigmoid(gate_logits, bias, expert_indices, expert_weights,
+                         num_experts, top_k, int(normalize), scaling_factor)
+def moe_permute_tokens(
+    hidden_states: torch.Tensor,
+    permuted: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+) -> None:
+    # Thin wrapper: forwards the three tensors, unchanged and in order, to the
+    # compiled ``moe_permute_tokens`` op and returns nothing.
+    # NOTE(review): presumably ``permuted`` is the caller-allocated
+    # destination and ``sorted_token_ids`` selects the source rows -- confirm
+    # against the kernel source.
+    ops.moe_permute_tokens(hidden_states, permuted, sorted_token_ids)
+def moe_silu_mul(gate: torch.Tensor, up: torch.Tensor, output: torch.Tensor) -> None:
+    # Thin wrapper: forwards ``gate``, ``up`` and ``output`` unchanged to the
+    # compiled ``moe_silu_mul`` op and returns nothing.
+    # NOTE(review): presumably ``output`` receives silu(gate) * up in place --
+    # inferred from the name only; confirm against the kernel source.
+    ops.moe_silu_mul(gate, up, output)
+def moe_w4a16_ptrtable_t_k64(
+    A: torch.Tensor,
+    B_packed_ptrs: torch.Tensor,
+    B_scale_ptrs: torch.Tensor,
+    scale2_vals: torch.Tensor,
+    C: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    N: int, K: int, max_m_tiles: int,
+) -> None:
+    """``max_m_tiles`` is the largest expert's row count rounded up to
+    the M-tile (= ceil(max_expert_rows / 64)). It bounds the grid Y
+    dimension; the kernel early-exits per-expert when offsets are met.
+    """
+    # Thin wrapper: all ten arguments are forwarded positionally, in
+    # declaration order, to the compiled ``moe_w4a16_ptrtable_t_k64`` op;
+    # nothing is returned and no validation happens at this level.
+    # NOTE(review): presumably the ``*_ptrs`` tensors hold per-expert device
+    # addresses and ``C`` is the caller-allocated output -- confirm against
+    # the kernel source.
+    ops.moe_w4a16_ptrtable_t_k64(
+        A, B_packed_ptrs, B_scale_ptrs, scale2_vals, C,
+        expert_offsets, sorted_token_ids, N, K, max_m_tiles)
+def moe_w4a16_fused_gate_up_t_k64(
+    A: torch.Tensor,
+    gate_packed_ptrs: torch.Tensor,
+    gate_scale_ptrs: torch.Tensor,
+    gate_scale2_vals: torch.Tensor,
+    up_packed_ptrs: torch.Tensor,
+    up_scale_ptrs: torch.Tensor,
+    up_scale2_vals: torch.Tensor,
+    C_gate: torch.Tensor,
+    C_up: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    N: int, K: int, max_m_tiles: int,
+) -> None:
+    """Fused gate+up grouped GEMM. Grid X is ``ceil(2*N/128)`` —
+    interleaves gate and up tiles to amortize the A read."""
+    # Thin wrapper: all fourteen arguments are forwarded positionally, in
+    # declaration order, to the compiled ``moe_w4a16_fused_gate_up_t_k64``
+    # op; nothing is returned and no validation happens at this level.
+    # The gate and up projections each bring their own packed-weight,
+    # scale and scale2 tensors; only ``A`` is shared between them.
+    # NOTE(review): presumably ``C_gate`` / ``C_up`` are caller-allocated
+    # outputs and the ``*_ptrs`` tensors hold per-expert device addresses --
+    # confirm against the kernel source.
+    ops.moe_w4a16_fused_gate_up_t_k64(
+        A,
+        gate_packed_ptrs, gate_scale_ptrs, gate_scale2_vals,
+        up_packed_ptrs, up_scale_ptrs, up_scale2_vals,
+        C_gate, C_up,
+        expert_offsets, sorted_token_ids, N, K, max_m_tiles)

build/torch210-cxx11-cu130-aarch64-linux/_nvfp4_moe_cuda_61b571c.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4be5149a122590d8ddbd52c686cc867b6cba47e689f9bbdd9f58e70fc4dc6bcf
+size 2865576

build/torch210-cxx11-cu130-aarch64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _nvfp4_moe_cuda_61b571c
+ops = torch.ops._nvfp4_moe_cuda_61b571c
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    # Returns the string "<namespace>::<op_name>". The namespace literal is
+    # the same identifier this file uses for its ``torch.ops`` lookup, so the
+    # result names an op inside this build's extension.
+    return f"_nvfp4_moe_cuda_61b571c::{op_name}"

build/torch210-cxx11-cu130-aarch64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "name": "nvfp4-moe",
+  "id": "_nvfp4_moe_cuda_61b571c",
+  "version": 0,
+  "license": "AGPL-3.0-only",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda"
+  }
+}

build/torch210-cxx11-cu130-aarch64-linux/nvfp4_moe/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Load the Python file at ``file_path`` as a module under a synthetic,
+    # path-derived name, register it in ``sys.modules``, execute it and
+    # return it. Raises ImportError when no import spec can be built.
+    #
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    # ``hash()`` can be negative; wrapping it in ``c_size_t`` reinterprets the
+    # value as unsigned so the hex string never carries a minus sign.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # NOTE(review): ``module_from_spec`` is documented to return a module or
+    # raise, so this guard looks unreachable -- confirm before relying on it.
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    # Register before executing, the same order used by the importlib
+    # "importing a source file directly" recipe.
+    sys.modules[module_name] = module
+    # NOTE(review): ``spec.loader`` is typed as optional; the ``type: ignore``
+    # below silences that rather than checking for None.
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch211-cxx11-cu130-aarch64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,132 @@

+# SPDX-License-Identifier: AGPL-3.0-only
+"""Atlas NVFP4 MoE kernels for Qwen3.6-35B-A3B sparse on GB10 (SM121).
+Centerpiece is ``moe_w4a16_fused_gate_up_t_k64`` — the fused gate/up
+NVFP4 grouped GEMM that amortizes one activation (``A``) read across both
+projections. The K=64 tile suffix targets Qwen3.6's
+hidden_dim=2048 / moe_intermediate=512 layout.
+All ops use Atlas's software E2M1 conversion (SM121 lacks the
+``cvt.rn.satfinite.e2m1x2.f32`` PTX instruction).
+"""
+import torch
+from ._ops import ops
+__all__ = [
+    "quantize_bf16_to_nvfp4",
+    "moe_gate_topk_fused",
+    "moe_topk_softmax",
+    "moe_topk_sigmoid",
+    "moe_permute_tokens",
+    "moe_silu_mul",
+    "moe_w4a16_ptrtable_t_k64",
+    "moe_w4a16_fused_gate_up_t_k64",
+]
+def quantize_bf16_to_nvfp4(
+    input: torch.Tensor,
+    out_packed: torch.Tensor,
+    out_scales: torch.Tensor,
+    per_tensor_scale: float,
+) -> None:
+    # Thin wrapper: forwards all four arguments, unchanged and in order, to
+    # the compiled ``quantize_bf16_to_nvfp4`` op and returns nothing.
+    # No shape, dtype or device validation happens at this level.
+    # NOTE(review): presumably ``out_packed`` / ``out_scales`` are
+    # caller-allocated buffers that the kernel fills in place -- confirm
+    # against the kernel source.
+    ops.quantize_bf16_to_nvfp4(input, out_packed, out_scales, per_tensor_scale)
+def moe_gate_topk_fused(
+    activation: torch.Tensor,
+    gate_weight_packed: torch.Tensor,
+    gate_weight_scales: torch.Tensor,
+    scale2: float,
+    expert_indices: torch.Tensor,
+    expert_weights: torch.Tensor,
+    top_k: int,
+    normalize: bool = True,
+) -> None:
+    # Thin wrapper around the compiled ``moe_gate_topk_fused`` op; every
+    # argument is passed positionally in declaration order and nothing is
+    # returned.
+    # ``normalize`` is converted with ``int()`` so the op receives 0 or 1
+    # instead of a Python bool.
+    # NOTE(review): presumably ``expert_indices`` / ``expert_weights`` are
+    # preallocated outputs written by the kernel -- confirm against the
+    # kernel source.
+    ops.moe_gate_topk_fused(
+        activation, gate_weight_packed, gate_weight_scales, scale2,
+        expert_indices, expert_weights, top_k, int(normalize))
+def moe_topk_softmax(
+    gate_logits: torch.Tensor,
+    expert_indices: torch.Tensor,
+    expert_weights: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    normalize: bool = True,
+) -> None:
+    # Thin wrapper around the compiled ``moe_topk_softmax`` op; arguments are
+    # forwarded positionally in declaration order and nothing is returned.
+    # ``normalize`` is converted with ``int()`` so the op receives 0 or 1.
+    # NOTE(review): presumably ``expert_indices`` / ``expert_weights`` are
+    # preallocated outputs written by the kernel -- confirm against the
+    # kernel source.
+    ops.moe_topk_softmax(gate_logits, expert_indices, expert_weights,
+                         num_experts, top_k, int(normalize))
+def moe_topk_sigmoid(
+    gate_logits: torch.Tensor,
+    bias: torch.Tensor,
+    expert_indices: torch.Tensor,
+    expert_weights: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    normalize: bool = True,
+    scaling_factor: float = 1.0,
+) -> None:
+    """Sigmoid-scored top-k. Bias is added before scoring (Nemotron-H style)."""
+    # Thin wrapper around the compiled ``moe_topk_sigmoid`` op; arguments are
+    # forwarded positionally in declaration order and nothing is returned.
+    # ``normalize`` is converted with ``int()`` so the op receives 0 or 1;
+    # ``scaling_factor`` is passed through unchanged (default 1.0).
+    # NOTE(review): presumably ``expert_indices`` / ``expert_weights`` are
+    # preallocated outputs written by the kernel -- confirm against the
+    # kernel source.
+    ops.moe_topk_sigmoid(gate_logits, bias, expert_indices, expert_weights,
+                         num_experts, top_k, int(normalize), scaling_factor)
+def moe_permute_tokens(
+    hidden_states: torch.Tensor,
+    permuted: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+) -> None:
+    # Thin wrapper: forwards the three tensors, unchanged and in order, to the
+    # compiled ``moe_permute_tokens`` op and returns nothing.
+    # NOTE(review): presumably ``permuted`` is the caller-allocated
+    # destination and ``sorted_token_ids`` selects the source rows -- confirm
+    # against the kernel source.
+    ops.moe_permute_tokens(hidden_states, permuted, sorted_token_ids)
+def moe_silu_mul(gate: torch.Tensor, up: torch.Tensor, output: torch.Tensor) -> None:
+    # Thin wrapper: forwards ``gate``, ``up`` and ``output`` unchanged to the
+    # compiled ``moe_silu_mul`` op and returns nothing.
+    # NOTE(review): presumably ``output`` receives silu(gate) * up in place --
+    # inferred from the name only; confirm against the kernel source.
+    ops.moe_silu_mul(gate, up, output)
+def moe_w4a16_ptrtable_t_k64(
+    A: torch.Tensor,
+    B_packed_ptrs: torch.Tensor,
+    B_scale_ptrs: torch.Tensor,
+    scale2_vals: torch.Tensor,
+    C: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    N: int, K: int, max_m_tiles: int,
+) -> None:
+    """``max_m_tiles`` is the largest expert's row count rounded up to
+    the M-tile (= ceil(max_expert_rows / 64)). It bounds the grid Y
+    dimension; the kernel early-exits per-expert when offsets are met.
+    """
+    # Thin wrapper: all ten arguments are forwarded positionally, in
+    # declaration order, to the compiled ``moe_w4a16_ptrtable_t_k64`` op;
+    # nothing is returned and no validation happens at this level.
+    # NOTE(review): presumably the ``*_ptrs`` tensors hold per-expert device
+    # addresses and ``C`` is the caller-allocated output -- confirm against
+    # the kernel source.
+    ops.moe_w4a16_ptrtable_t_k64(
+        A, B_packed_ptrs, B_scale_ptrs, scale2_vals, C,
+        expert_offsets, sorted_token_ids, N, K, max_m_tiles)
+def moe_w4a16_fused_gate_up_t_k64(
+    A: torch.Tensor,
+    gate_packed_ptrs: torch.Tensor,
+    gate_scale_ptrs: torch.Tensor,
+    gate_scale2_vals: torch.Tensor,
+    up_packed_ptrs: torch.Tensor,
+    up_scale_ptrs: torch.Tensor,
+    up_scale2_vals: torch.Tensor,
+    C_gate: torch.Tensor,
+    C_up: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    N: int, K: int, max_m_tiles: int,
+) -> None:
+    """Fused gate+up grouped GEMM. Grid X is ``ceil(2*N/128)`` —
+    interleaves gate and up tiles to amortize the A read."""
+    # Thin wrapper: all fourteen arguments are forwarded positionally, in
+    # declaration order, to the compiled ``moe_w4a16_fused_gate_up_t_k64``
+    # op; nothing is returned and no validation happens at this level.
+    # The gate and up projections each bring their own packed-weight,
+    # scale and scale2 tensors; only ``A`` is shared between them.
+    # NOTE(review): presumably ``C_gate`` / ``C_up`` are caller-allocated
+    # outputs and the ``*_ptrs`` tensors hold per-expert device addresses --
+    # confirm against the kernel source.
+    ops.moe_w4a16_fused_gate_up_t_k64(
+        A,
+        gate_packed_ptrs, gate_scale_ptrs, gate_scale2_vals,
+        up_packed_ptrs, up_scale_ptrs, up_scale2_vals,
+        C_gate, C_up,
+        expert_offsets, sorted_token_ids, N, K, max_m_tiles)

build/torch211-cxx11-cu130-aarch64-linux/_nvfp4_moe_cuda_61b571c.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc96b2bfeec43e532e535fcfd2ffc3332894c1f972981dfea514b1c4412e7d8c
+size 2865576

build/torch211-cxx11-cu130-aarch64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _nvfp4_moe_cuda_61b571c
+ops = torch.ops._nvfp4_moe_cuda_61b571c
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    # Returns the string "<namespace>::<op_name>". The namespace literal is
+    # the same identifier this file uses for its ``torch.ops`` lookup, so the
+    # result names an op inside this build's extension.
+    return f"_nvfp4_moe_cuda_61b571c::{op_name}"

build/torch211-cxx11-cu130-aarch64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "name": "nvfp4-moe",
+  "id": "_nvfp4_moe_cuda_61b571c",
+  "version": 0,
+  "license": "AGPL-3.0-only",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda"
+  }
+}

build/torch211-cxx11-cu130-aarch64-linux/nvfp4_moe/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Load the Python file at ``file_path`` as a module under a synthetic,
+    # path-derived name, register it in ``sys.modules``, execute it and
+    # return it. Raises ImportError when no import spec can be built.
+    #
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    # ``hash()`` can be negative; wrapping it in ``c_size_t`` reinterprets the
+    # value as unsigned so the hex string never carries a minus sign.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # NOTE(review): ``module_from_spec`` is documented to return a module or
+    # raise, so this guard looks unreachable -- confirm before relying on it.
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    # Register before executing, the same order used by the importlib
+    # "importing a source file directly" recipe.
+    sys.modules[module_name] = module
+    # NOTE(review): ``spec.loader`` is typed as optional; the ``type: ignore``
+    # below silences that rather than checking for None.
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch212-cxx11-cu130-aarch64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,132 @@

+# SPDX-License-Identifier: AGPL-3.0-only
+"""Atlas NVFP4 MoE kernels for Qwen3.6-35B-A3B sparse on GB10 (SM121).
+Centerpiece is ``moe_w4a16_fused_gate_up_t_k64`` — the fused gate/up
+NVFP4 grouped GEMM that amortizes one activation (``A``) read across both
+projections. The K=64 tile suffix targets Qwen3.6's
+hidden_dim=2048 / moe_intermediate=512 layout.
+All ops use Atlas's software E2M1 conversion (SM121 lacks the
+``cvt.rn.satfinite.e2m1x2.f32`` PTX instruction).
+"""
+import torch
+from ._ops import ops
+__all__ = [
+    "quantize_bf16_to_nvfp4",
+    "moe_gate_topk_fused",
+    "moe_topk_softmax",
+    "moe_topk_sigmoid",
+    "moe_permute_tokens",
+    "moe_silu_mul",
+    "moe_w4a16_ptrtable_t_k64",
+    "moe_w4a16_fused_gate_up_t_k64",
+]
+def quantize_bf16_to_nvfp4(
+    input: torch.Tensor,
+    out_packed: torch.Tensor,
+    out_scales: torch.Tensor,
+    per_tensor_scale: float,
+) -> None:
+    # Thin wrapper: forwards all four arguments, unchanged and in order, to
+    # the compiled ``quantize_bf16_to_nvfp4`` op and returns nothing.
+    # No shape, dtype or device validation happens at this level.
+    # NOTE(review): presumably ``out_packed`` / ``out_scales`` are
+    # caller-allocated buffers that the kernel fills in place -- confirm
+    # against the kernel source.
+    ops.quantize_bf16_to_nvfp4(input, out_packed, out_scales, per_tensor_scale)
+def moe_gate_topk_fused(
+    activation: torch.Tensor,
+    gate_weight_packed: torch.Tensor,
+    gate_weight_scales: torch.Tensor,
+    scale2: float,
+    expert_indices: torch.Tensor,
+    expert_weights: torch.Tensor,
+    top_k: int,
+    normalize: bool = True,
+) -> None:
+    # Thin wrapper around the compiled ``moe_gate_topk_fused`` op; every
+    # argument is passed positionally in declaration order and nothing is
+    # returned.
+    # ``normalize`` is converted with ``int()`` so the op receives 0 or 1
+    # instead of a Python bool.
+    # NOTE(review): presumably ``expert_indices`` / ``expert_weights`` are
+    # preallocated outputs written by the kernel -- confirm against the
+    # kernel source.
+    ops.moe_gate_topk_fused(
+        activation, gate_weight_packed, gate_weight_scales, scale2,
+        expert_indices, expert_weights, top_k, int(normalize))
+def moe_topk_softmax(
+    gate_logits: torch.Tensor,
+    expert_indices: torch.Tensor,
+    expert_weights: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    normalize: bool = True,
+) -> None:
+    # Thin wrapper around the compiled ``moe_topk_softmax`` op; arguments are
+    # forwarded positionally in declaration order and nothing is returned.
+    # ``normalize`` is converted with ``int()`` so the op receives 0 or 1.
+    # NOTE(review): presumably ``expert_indices`` / ``expert_weights`` are
+    # preallocated outputs written by the kernel -- confirm against the
+    # kernel source.
+    ops.moe_topk_softmax(gate_logits, expert_indices, expert_weights,
+                         num_experts, top_k, int(normalize))
+def moe_topk_sigmoid(
+    gate_logits: torch.Tensor,
+    bias: torch.Tensor,
+    expert_indices: torch.Tensor,
+    expert_weights: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    normalize: bool = True,
+    scaling_factor: float = 1.0,
+) -> None:
+    """Sigmoid-scored top-k. Bias is added before scoring (Nemotron-H style)."""
+    # Thin wrapper around the compiled ``moe_topk_sigmoid`` op; arguments are
+    # forwarded positionally in declaration order and nothing is returned.
+    # ``normalize`` is converted with ``int()`` so the op receives 0 or 1;
+    # ``scaling_factor`` is passed through unchanged (default 1.0).
+    # NOTE(review): presumably ``expert_indices`` / ``expert_weights`` are
+    # preallocated outputs written by the kernel -- confirm against the
+    # kernel source.
+    ops.moe_topk_sigmoid(gate_logits, bias, expert_indices, expert_weights,
+                         num_experts, top_k, int(normalize), scaling_factor)
+def moe_permute_tokens(
+    hidden_states: torch.Tensor,
+    permuted: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+) -> None:
+    # Thin wrapper: forwards the three tensors, unchanged and in order, to the
+    # compiled ``moe_permute_tokens`` op and returns nothing.
+    # NOTE(review): presumably ``permuted`` is the caller-allocated
+    # destination and ``sorted_token_ids`` selects the source rows -- confirm
+    # against the kernel source.
+    ops.moe_permute_tokens(hidden_states, permuted, sorted_token_ids)
+def moe_silu_mul(gate: torch.Tensor, up: torch.Tensor, output: torch.Tensor) -> None:
+    # Thin wrapper: forwards ``gate``, ``up`` and ``output`` unchanged to the
+    # compiled ``moe_silu_mul`` op and returns nothing.
+    # NOTE(review): presumably ``output`` receives silu(gate) * up in place --
+    # inferred from the name only; confirm against the kernel source.
+    ops.moe_silu_mul(gate, up, output)
+def moe_w4a16_ptrtable_t_k64(
+    A: torch.Tensor,
+    B_packed_ptrs: torch.Tensor,
+    B_scale_ptrs: torch.Tensor,
+    scale2_vals: torch.Tensor,
+    C: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    N: int, K: int, max_m_tiles: int,
+) -> None:
+    """``max_m_tiles`` is the largest expert's row count rounded up to
+    the M-tile (= ceil(max_expert_rows / 64)). It bounds the grid Y
+    dimension; the kernel early-exits per-expert when offsets are met.
+    """
+    # Thin wrapper: all ten arguments are forwarded positionally, in
+    # declaration order, to the compiled ``moe_w4a16_ptrtable_t_k64`` op;
+    # nothing is returned and no validation happens at this level.
+    # NOTE(review): presumably the ``*_ptrs`` tensors hold per-expert device
+    # addresses and ``C`` is the caller-allocated output -- confirm against
+    # the kernel source.
+    ops.moe_w4a16_ptrtable_t_k64(
+        A, B_packed_ptrs, B_scale_ptrs, scale2_vals, C,
+        expert_offsets, sorted_token_ids, N, K, max_m_tiles)
+def moe_w4a16_fused_gate_up_t_k64(
+    A: torch.Tensor,
+    gate_packed_ptrs: torch.Tensor,
+    gate_scale_ptrs: torch.Tensor,
+    gate_scale2_vals: torch.Tensor,
+    up_packed_ptrs: torch.Tensor,
+    up_scale_ptrs: torch.Tensor,
+    up_scale2_vals: torch.Tensor,
+    C_gate: torch.Tensor,
+    C_up: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    N: int, K: int, max_m_tiles: int,
+) -> None:
+    """Fused gate+up grouped GEMM. Grid X is ``ceil(2*N/128)`` —
+    interleaves gate and up tiles to amortize the A read."""
+    # Thin wrapper: all fourteen arguments are forwarded positionally, in
+    # declaration order, to the compiled ``moe_w4a16_fused_gate_up_t_k64``
+    # op; nothing is returned and no validation happens at this level.
+    # The gate and up projections each bring their own packed-weight,
+    # scale and scale2 tensors; only ``A`` is shared between them.
+    # NOTE(review): presumably ``C_gate`` / ``C_up`` are caller-allocated
+    # outputs and the ``*_ptrs`` tensors hold per-expert device addresses --
+    # confirm against the kernel source.
+    ops.moe_w4a16_fused_gate_up_t_k64(
+        A,
+        gate_packed_ptrs, gate_scale_ptrs, gate_scale2_vals,
+        up_packed_ptrs, up_scale_ptrs, up_scale2_vals,
+        C_gate, C_up,
+        expert_offsets, sorted_token_ids, N, K, max_m_tiles)

build/torch212-cxx11-cu130-aarch64-linux/_nvfp4_moe_cuda_61b571c.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0c8e1ab3a370805356cb8f32fb06e7c4755d7fb3389238390b009c125ea09f93
+size 2865600

build/torch212-cxx11-cu130-aarch64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _nvfp4_moe_cuda_61b571c
+ops = torch.ops._nvfp4_moe_cuda_61b571c
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    # Returns the string "<namespace>::<op_name>". The namespace literal is
+    # the same identifier this file uses for its ``torch.ops`` lookup, so the
+    # result names an op inside this build's extension.
+    return f"_nvfp4_moe_cuda_61b571c::{op_name}"

build/torch212-cxx11-cu130-aarch64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "name": "nvfp4-moe",
+  "id": "_nvfp4_moe_cuda_61b571c",
+  "version": 0,
+  "license": "AGPL-3.0-only",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda"
+  }
+}

build/torch212-cxx11-cu130-aarch64-linux/nvfp4_moe/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Load the Python file at ``file_path`` as a module under a synthetic,
+    # path-derived name, register it in ``sys.modules``, execute it and
+    # return it. Raises ImportError when no import spec can be built.
+    #
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    # ``hash()`` can be negative; wrapping it in ``c_size_t`` reinterprets the
+    # value as unsigned so the hex string never carries a minus sign.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # NOTE(review): ``module_from_spec`` is documented to return a module or
+    # raise, so this guard looks unreachable -- confirm before relying on it.
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    # Register before executing, the same order used by the importlib
+    # "importing a source file directly" recipe.
+    sys.modules[module_name] = module
+    # NOTE(review): ``spec.loader`` is typed as optional; the ``type: ignore``
+    # below silences that rather than checking for None.
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch212-cxx11-cu132-aarch64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,132 @@

+# SPDX-License-Identifier: AGPL-3.0-only
+"""Atlas NVFP4 MoE kernels for Qwen3.6-35B-A3B sparse on GB10 (SM121).
+Centerpiece is ``moe_w4a16_fused_gate_up_t_k64`` — the fused gate/up
+NVFP4 grouped GEMM that amortizes one activation (``A``) read across both
+projections. The K=64 tile suffix targets Qwen3.6's
+hidden_dim=2048 / moe_intermediate=512 layout.
+All ops use Atlas's software E2M1 conversion (SM121 lacks the
+``cvt.rn.satfinite.e2m1x2.f32`` PTX instruction).
+"""
+import torch
+from ._ops import ops
+__all__ = [
+    "quantize_bf16_to_nvfp4",
+    "moe_gate_topk_fused",
+    "moe_topk_softmax",
+    "moe_topk_sigmoid",
+    "moe_permute_tokens",
+    "moe_silu_mul",
+    "moe_w4a16_ptrtable_t_k64",
+    "moe_w4a16_fused_gate_up_t_k64",
+]
+def quantize_bf16_to_nvfp4(
+    input: torch.Tensor,
+    out_packed: torch.Tensor,
+    out_scales: torch.Tensor,
+    per_tensor_scale: float,
+) -> None:
+    # Thin wrapper: forwards all four arguments, unchanged and in order, to
+    # the compiled ``quantize_bf16_to_nvfp4`` op and returns nothing.
+    # No shape, dtype or device validation happens at this level.
+    # NOTE(review): presumably ``out_packed`` / ``out_scales`` are
+    # caller-allocated buffers that the kernel fills in place -- confirm
+    # against the kernel source.
+    ops.quantize_bf16_to_nvfp4(input, out_packed, out_scales, per_tensor_scale)
+def moe_gate_topk_fused(
+    activation: torch.Tensor,
+    gate_weight_packed: torch.Tensor,
+    gate_weight_scales: torch.Tensor,
+    scale2: float,
+    expert_indices: torch.Tensor,
+    expert_weights: torch.Tensor,
+    top_k: int,
+    normalize: bool = True,
+) -> None:
+    # Thin wrapper around the compiled ``moe_gate_topk_fused`` op; every
+    # argument is passed positionally in declaration order and nothing is
+    # returned.
+    # ``normalize`` is converted with ``int()`` so the op receives 0 or 1
+    # instead of a Python bool.
+    # NOTE(review): presumably ``expert_indices`` / ``expert_weights`` are
+    # preallocated outputs written by the kernel -- confirm against the
+    # kernel source.
+    ops.moe_gate_topk_fused(
+        activation, gate_weight_packed, gate_weight_scales, scale2,
+        expert_indices, expert_weights, top_k, int(normalize))
+def moe_topk_softmax(
+    gate_logits: torch.Tensor,
+    expert_indices: torch.Tensor,
+    expert_weights: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    normalize: bool = True,
+) -> None:
+    # Thin wrapper around the compiled ``moe_topk_softmax`` op; arguments are
+    # forwarded positionally in declaration order and nothing is returned.
+    # ``normalize`` is converted with ``int()`` so the op receives 0 or 1.
+    # NOTE(review): presumably ``expert_indices`` / ``expert_weights`` are
+    # preallocated outputs written by the kernel -- confirm against the
+    # kernel source.
+    ops.moe_topk_softmax(gate_logits, expert_indices, expert_weights,
+                         num_experts, top_k, int(normalize))
+def moe_topk_sigmoid(
+    gate_logits: torch.Tensor,
+    bias: torch.Tensor,
+    expert_indices: torch.Tensor,
+    expert_weights: torch.Tensor,
+    num_experts: int,
+    top_k: int,
+    normalize: bool = True,
+    scaling_factor: float = 1.0,
+) -> None:
+    """Sigmoid-scored top-k. Bias is added before scoring (Nemotron-H style)."""
+    # Thin wrapper around the compiled ``moe_topk_sigmoid`` op; arguments are
+    # forwarded positionally in declaration order and nothing is returned.
+    # ``normalize`` is converted with ``int()`` so the op receives 0 or 1;
+    # ``scaling_factor`` is passed through unchanged (default 1.0).
+    # NOTE(review): presumably ``expert_indices`` / ``expert_weights`` are
+    # preallocated outputs written by the kernel -- confirm against the
+    # kernel source.
+    ops.moe_topk_sigmoid(gate_logits, bias, expert_indices, expert_weights,
+                         num_experts, top_k, int(normalize), scaling_factor)
+def moe_permute_tokens(
+    hidden_states: torch.Tensor,
+    permuted: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+) -> None:
+    # Thin wrapper: forwards the three tensors, unchanged and in order, to the
+    # compiled ``moe_permute_tokens`` op and returns nothing.
+    # NOTE(review): presumably ``permuted`` is the caller-allocated
+    # destination and ``sorted_token_ids`` selects the source rows -- confirm
+    # against the kernel source.
+    ops.moe_permute_tokens(hidden_states, permuted, sorted_token_ids)
+def moe_silu_mul(gate: torch.Tensor, up: torch.Tensor, output: torch.Tensor) -> None:
+    # Thin wrapper: forwards ``gate``, ``up`` and ``output`` unchanged to the
+    # compiled ``moe_silu_mul`` op and returns nothing.
+    # NOTE(review): presumably ``output`` receives silu(gate) * up in place --
+    # inferred from the name only; confirm against the kernel source.
+    ops.moe_silu_mul(gate, up, output)
+def moe_w4a16_ptrtable_t_k64(
+    A: torch.Tensor,
+    B_packed_ptrs: torch.Tensor,
+    B_scale_ptrs: torch.Tensor,
+    scale2_vals: torch.Tensor,
+    C: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    N: int, K: int, max_m_tiles: int,
+) -> None:
+    """``max_m_tiles`` is the largest expert's row count rounded up to
+    the M-tile (= ceil(max_expert_rows / 64)). It bounds the grid Y
+    dimension; the kernel early-exits per-expert when offsets are met.
+    """
+    # Thin wrapper: all ten arguments are forwarded positionally, in
+    # declaration order, to the compiled ``moe_w4a16_ptrtable_t_k64`` op;
+    # nothing is returned and no validation happens at this level.
+    # NOTE(review): presumably the ``*_ptrs`` tensors hold per-expert device
+    # addresses and ``C`` is the caller-allocated output -- confirm against
+    # the kernel source.
+    ops.moe_w4a16_ptrtable_t_k64(
+        A, B_packed_ptrs, B_scale_ptrs, scale2_vals, C,
+        expert_offsets, sorted_token_ids, N, K, max_m_tiles)
+def moe_w4a16_fused_gate_up_t_k64(
+    A: torch.Tensor,
+    gate_packed_ptrs: torch.Tensor,
+    gate_scale_ptrs: torch.Tensor,
+    gate_scale2_vals: torch.Tensor,
+    up_packed_ptrs: torch.Tensor,
+    up_scale_ptrs: torch.Tensor,
+    up_scale2_vals: torch.Tensor,
+    C_gate: torch.Tensor,
+    C_up: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    sorted_token_ids: torch.Tensor,
+    N: int, K: int, max_m_tiles: int,
+) -> None:
+    """Fused gate+up grouped GEMM. Grid X is ``ceil(2*N/128)`` —
+    interleaves gate and up tiles to amortize the A read."""
+    # Thin wrapper: all fourteen arguments are forwarded positionally, in
+    # declaration order, to the compiled ``moe_w4a16_fused_gate_up_t_k64``
+    # op; nothing is returned and no validation happens at this level.
+    # The gate and up projections each bring their own packed-weight,
+    # scale and scale2 tensors; only ``A`` is shared between them.
+    # NOTE(review): presumably ``C_gate`` / ``C_up`` are caller-allocated
+    # outputs and the ``*_ptrs`` tensors hold per-expert device addresses --
+    # confirm against the kernel source.
+    ops.moe_w4a16_fused_gate_up_t_k64(
+        A,
+        gate_packed_ptrs, gate_scale_ptrs, gate_scale2_vals,
+        up_packed_ptrs, up_scale_ptrs, up_scale2_vals,
+        C_gate, C_up,
+        expert_offsets, sorted_token_ids, N, K, max_m_tiles)

build/torch212-cxx11-cu132-aarch64-linux/_nvfp4_moe_cuda_61b571c.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91483795cdafc1b51ac51d3465f8cd32e98ef2b8834db4590a2e3d5f84a7bfc8
+size 2865600

build/torch212-cxx11-cu132-aarch64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _nvfp4_moe_cuda_61b571c
+ops = torch.ops._nvfp4_moe_cuda_61b571c
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    # Returns the string "<namespace>::<op_name>". The namespace literal is
+    # the same identifier this file uses for its ``torch.ops`` lookup, so the
+    # result names an op inside this build's extension.
+    return f"_nvfp4_moe_cuda_61b571c::{op_name}"

build/torch212-cxx11-cu132-aarch64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "name": "nvfp4-moe",
+  "id": "_nvfp4_moe_cuda_61b571c",
+  "version": 0,
+  "license": "AGPL-3.0-only",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda"
+  }
+}

build/torch212-cxx11-cu132-aarch64-linux/nvfp4_moe/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    # Load the Python file at ``file_path`` as a module under a synthetic,
+    # path-derived name, register it in ``sys.modules``, execute it and
+    # return it. Raises ImportError when no import spec can be built.
+    #
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    # ``hash()`` can be negative; wrapping it in ``c_size_t`` reinterprets the
+    # value as unsigned so the hex string never carries a minus sign.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # NOTE(review): ``module_from_spec`` is documented to return a module or
+    # raise, so this guard looks unreachable -- confirm before relying on it.
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    # Register before executing, the same order used by the importlib
+    # "importing a source file directly" recipe.
+    sys.modules[module_name] = module
+    # NOTE(review): ``spec.loader`` is typed as optional; the ``type: ignore``
+    # below silences that rather than checking for None.
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))