nologik committed on
Commit
7347d97
·
verified ·
1 Parent(s): 77310b5

Uploaded using `kernel-builder`.

Browse files
build/torch210-cxx11-cu130-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: AGPL-3.0-only
2
+ """Atlas NVFP4 MoE kernels for Qwen3.6-35B-A3B sparse on GB10 (SM121).
3
+
4
+ Centerpiece is ``moe_w4a16_fused_gate_up_t_k64`` — the fused gate/up
5
+ NVFP4 grouped GEMM that amortizes one activation (A) read across both
6
+ projections. The K=64 tile suffix targets Qwen3.6's
7
+ hidden_dim=2048 / moe_intermediate=512 layout.
8
+
9
+ All ops use Atlas's software E2M1 conversion (SM121 lacks the
10
+ ``cvt.rn.satfinite.e2m1x2.f32`` PTX instruction).
11
+ """
12
+
13
+ import torch
14
+
15
+ from ._ops import ops
16
+
17
+ __all__ = [
18
+ "quantize_bf16_to_nvfp4",
19
+ "moe_gate_topk_fused",
20
+ "moe_topk_softmax",
21
+ "moe_topk_sigmoid",
22
+ "moe_permute_tokens",
23
+ "moe_silu_mul",
24
+ "moe_w4a16_ptrtable_t_k64",
25
+ "moe_w4a16_fused_gate_up_t_k64",
26
+ ]
27
+
28
+
29
def quantize_bf16_to_nvfp4(
    input: torch.Tensor,
    out_packed: torch.Tensor,
    out_scales: torch.Tensor,
    per_tensor_scale: float,
) -> None:
    """Forward to the compiled ``quantize_bf16_to_nvfp4`` op.

    All four arguments are passed through positionally and unchanged; any
    value returned by the op is discarded.

    NOTE(review): ``out_packed`` / ``out_scales`` look like caller-allocated
    output buffers filled by the kernel -- confirm against the op schema.
    """
    kernel = ops.quantize_bf16_to_nvfp4
    kernel(input, out_packed, out_scales, per_tensor_scale)
36
+
37
+
38
def moe_gate_topk_fused(
    activation: torch.Tensor,
    gate_weight_packed: torch.Tensor,
    gate_weight_scales: torch.Tensor,
    scale2: float,
    expert_indices: torch.Tensor,
    expert_weights: torch.Tensor,
    top_k: int,
    normalize: bool = True,
) -> None:
    """Forward to the compiled ``moe_gate_topk_fused`` op.

    Arguments are passed through positionally in declaration order, except
    that ``normalize`` is converted with ``int()`` first. Any value returned
    by the op is discarded.

    NOTE(review): ``expert_indices`` / ``expert_weights`` are presumably
    output buffers written by the kernel -- confirm against the op schema.
    """
    kernel = ops.moe_gate_topk_fused
    # The op receives the boolean flag as an integer.
    normalize_flag = int(normalize)
    kernel(
        activation,
        gate_weight_packed,
        gate_weight_scales,
        scale2,
        expert_indices,
        expert_weights,
        top_k,
        normalize_flag,
    )
51
+
52
+
53
def moe_topk_softmax(
    gate_logits: torch.Tensor,
    expert_indices: torch.Tensor,
    expert_weights: torch.Tensor,
    num_experts: int,
    top_k: int,
    normalize: bool = True,
) -> None:
    """Forward to the compiled ``moe_topk_softmax`` op.

    Arguments are passed through positionally in declaration order, with
    ``normalize`` converted via ``int()``. Any value returned by the op is
    discarded.

    NOTE(review): ``expert_indices`` / ``expert_weights`` are presumably
    output buffers written by the kernel -- confirm against the op schema.
    """
    kernel = ops.moe_topk_softmax
    # The op receives the boolean flag as an integer.
    normalize_flag = int(normalize)
    kernel(
        gate_logits,
        expert_indices,
        expert_weights,
        num_experts,
        top_k,
        normalize_flag,
    )
63
+
64
+
65
def moe_topk_sigmoid(
    gate_logits: torch.Tensor,
    bias: torch.Tensor,
    expert_indices: torch.Tensor,
    expert_weights: torch.Tensor,
    num_experts: int,
    top_k: int,
    normalize: bool = True,
    scaling_factor: float = 1.0,
) -> None:
    """Sigmoid-scored top-k; bias is added before scoring (Nemotron-H style).

    Forwards every argument positionally to the compiled
    ``moe_topk_sigmoid`` op, converting ``normalize`` with ``int()`` on the
    way. Any value returned by the op is discarded.
    """
    kernel = ops.moe_topk_sigmoid
    # The op receives the boolean flag as an integer.
    normalize_flag = int(normalize)
    kernel(
        gate_logits,
        bias,
        expert_indices,
        expert_weights,
        num_experts,
        top_k,
        normalize_flag,
        scaling_factor,
    )
78
+
79
+
80
def moe_permute_tokens(
    hidden_states: torch.Tensor,
    permuted: torch.Tensor,
    sorted_token_ids: torch.Tensor,
) -> None:
    """Forward to the compiled ``moe_permute_tokens`` op.

    The three tensors are passed through positionally and unchanged; any
    value returned by the op is discarded.

    NOTE(review): ``permuted`` is presumably the destination buffer for the
    reordered rows -- confirm against the op schema.
    """
    kernel = ops.moe_permute_tokens
    kernel(hidden_states, permuted, sorted_token_ids)
86
+
87
+
88
def moe_silu_mul(
    gate: torch.Tensor,
    up: torch.Tensor,
    output: torch.Tensor,
) -> None:
    """Forward to the compiled ``moe_silu_mul`` op.

    The three tensors are passed through positionally and unchanged; any
    value returned by the op is discarded.

    NOTE(review): by its name the op likely computes ``silu(gate) * up``
    into ``output`` -- confirm against the kernel source.
    """
    kernel = ops.moe_silu_mul
    kernel(gate, up, output)
90
+
91
+
92
def moe_w4a16_ptrtable_t_k64(
    A: torch.Tensor,
    B_packed_ptrs: torch.Tensor,
    B_scale_ptrs: torch.Tensor,
    scale2_vals: torch.Tensor,
    C: torch.Tensor,
    expert_offsets: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    N: int,
    K: int,
    max_m_tiles: int,
) -> None:
    """Forward to the compiled ``moe_w4a16_ptrtable_t_k64`` op.

    ``max_m_tiles`` is the number of M-tiles needed by the largest expert,
    i.e. ``ceil(max_expert_rows / 64)``. It bounds the grid Y dimension;
    the kernel early-exits per-expert when offsets are met.

    Every argument is passed through positionally and unchanged; any value
    returned by the op is discarded.
    """
    kernel = ops.moe_w4a16_ptrtable_t_k64
    kernel(
        A,
        B_packed_ptrs,
        B_scale_ptrs,
        scale2_vals,
        C,
        expert_offsets,
        sorted_token_ids,
        N,
        K,
        max_m_tiles,
    )
109
+
110
+
111
def moe_w4a16_fused_gate_up_t_k64(
    A: torch.Tensor,
    gate_packed_ptrs: torch.Tensor,
    gate_scale_ptrs: torch.Tensor,
    gate_scale2_vals: torch.Tensor,
    up_packed_ptrs: torch.Tensor,
    up_scale_ptrs: torch.Tensor,
    up_scale2_vals: torch.Tensor,
    C_gate: torch.Tensor,
    C_up: torch.Tensor,
    expert_offsets: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    N: int,
    K: int,
    max_m_tiles: int,
) -> None:
    """Fused gate+up grouped GEMM.

    Grid X is ``ceil(2*N/128)``: gate and up tiles are interleaved so the
    read of ``A`` is amortized over both projections.

    Every argument is passed through positionally and unchanged to the
    compiled ``moe_w4a16_fused_gate_up_t_k64`` op; any value returned by the
    op is discarded.
    """
    kernel = ops.moe_w4a16_fused_gate_up_t_k64
    gate_args = (gate_packed_ptrs, gate_scale_ptrs, gate_scale2_vals)
    up_args = (up_packed_ptrs, up_scale_ptrs, up_scale2_vals)
    kernel(
        A,
        *gate_args,
        *up_args,
        C_gate,
        C_up,
        expert_offsets,
        sorted_token_ids,
        N,
        K,
        max_m_tiles,
    )
build/torch210-cxx11-cu130-aarch64-linux/_nvfp4_moe_cuda_61b571c.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4be5149a122590d8ddbd52c686cc867b6cba47e689f9bbdd9f58e70fc4dc6bcf
3
+ size 2865576
build/torch210-cxx11-cu130-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _nvfp4_moe_cuda_61b571c
3
+ ops = torch.ops._nvfp4_moe_cuda_61b571c
4
+
5
def add_op_namespace_prefix(op_name: str):
    """Return ``op_name`` qualified with this build's op namespace.

    The result has the form ``"_nvfp4_moe_cuda_61b571c::<op_name>"``.
    """
    namespace = "_nvfp4_moe_cuda_61b571c"
    return "{}::{}".format(namespace, op_name)
build/torch210-cxx11-cu130-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "nvfp4-moe",
3
+ "id": "_nvfp4_moe_cuda_61b571c",
4
+ "version": 0,
5
+ "license": "AGPL-3.0-only",
6
+ "python-depends": [],
7
+ "backend": {
8
+ "type": "cuda"
9
+ }
10
+ }
build/torch210-cxx11-cu130-aarch64-linux/nvfp4_moe/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Import the Python file at ``file_path`` under a path-derived module name.

    The module is registered in ``sys.modules`` before it is executed so that
    relative imports inside the loaded file can resolve against it.

    Args:
        file_path: Location of the source file to load.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec (or no loader) can be built for
            ``file_path``.
        Exception: Anything raised while executing the module body is
            propagated unchanged, after the partially initialised module has
            been removed from ``sys.modules``.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # A spec may exist without a loader; fail with a clear ImportError rather
    # than an AttributeError on `None.exec_module`.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    # `module_from_spec` never returns None (it raises on failure), so no
    # extra None check is needed here.
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Mirror the regular import system: do not leave a half-initialised
        # module behind when its body fails to execute.
        sys.modules.pop(module_name, None)
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch211-cxx11-cu130-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: AGPL-3.0-only
2
+ """Atlas NVFP4 MoE kernels for Qwen3.6-35B-A3B sparse on GB10 (SM121).
3
+
4
+ Centerpiece is ``moe_w4a16_fused_gate_up_t_k64`` — the fused gate/up
5
+ NVFP4 grouped GEMM that amortizes one activation (A) read across both
6
+ projections. The K=64 tile suffix targets Qwen3.6's
7
+ hidden_dim=2048 / moe_intermediate=512 layout.
8
+
9
+ All ops use Atlas's software E2M1 conversion (SM121 lacks the
10
+ ``cvt.rn.satfinite.e2m1x2.f32`` PTX instruction).
11
+ """
12
+
13
+ import torch
14
+
15
+ from ._ops import ops
16
+
17
+ __all__ = [
18
+ "quantize_bf16_to_nvfp4",
19
+ "moe_gate_topk_fused",
20
+ "moe_topk_softmax",
21
+ "moe_topk_sigmoid",
22
+ "moe_permute_tokens",
23
+ "moe_silu_mul",
24
+ "moe_w4a16_ptrtable_t_k64",
25
+ "moe_w4a16_fused_gate_up_t_k64",
26
+ ]
27
+
28
+
29
def quantize_bf16_to_nvfp4(
    input: torch.Tensor,
    out_packed: torch.Tensor,
    out_scales: torch.Tensor,
    per_tensor_scale: float,
) -> None:
    """Forward to the compiled ``quantize_bf16_to_nvfp4`` op.

    All four arguments are passed through positionally and unchanged; any
    value returned by the op is discarded.

    NOTE(review): ``out_packed`` / ``out_scales`` look like caller-allocated
    output buffers filled by the kernel -- confirm against the op schema.
    """
    kernel = ops.quantize_bf16_to_nvfp4
    kernel(input, out_packed, out_scales, per_tensor_scale)
36
+
37
+
38
def moe_gate_topk_fused(
    activation: torch.Tensor,
    gate_weight_packed: torch.Tensor,
    gate_weight_scales: torch.Tensor,
    scale2: float,
    expert_indices: torch.Tensor,
    expert_weights: torch.Tensor,
    top_k: int,
    normalize: bool = True,
) -> None:
    """Forward to the compiled ``moe_gate_topk_fused`` op.

    Arguments are passed through positionally in declaration order, except
    that ``normalize`` is converted with ``int()`` first. Any value returned
    by the op is discarded.

    NOTE(review): ``expert_indices`` / ``expert_weights`` are presumably
    output buffers written by the kernel -- confirm against the op schema.
    """
    kernel = ops.moe_gate_topk_fused
    # The op receives the boolean flag as an integer.
    normalize_flag = int(normalize)
    kernel(
        activation,
        gate_weight_packed,
        gate_weight_scales,
        scale2,
        expert_indices,
        expert_weights,
        top_k,
        normalize_flag,
    )
51
+
52
+
53
def moe_topk_softmax(
    gate_logits: torch.Tensor,
    expert_indices: torch.Tensor,
    expert_weights: torch.Tensor,
    num_experts: int,
    top_k: int,
    normalize: bool = True,
) -> None:
    """Forward to the compiled ``moe_topk_softmax`` op.

    Arguments are passed through positionally in declaration order, with
    ``normalize`` converted via ``int()``. Any value returned by the op is
    discarded.

    NOTE(review): ``expert_indices`` / ``expert_weights`` are presumably
    output buffers written by the kernel -- confirm against the op schema.
    """
    kernel = ops.moe_topk_softmax
    # The op receives the boolean flag as an integer.
    normalize_flag = int(normalize)
    kernel(
        gate_logits,
        expert_indices,
        expert_weights,
        num_experts,
        top_k,
        normalize_flag,
    )
63
+
64
+
65
def moe_topk_sigmoid(
    gate_logits: torch.Tensor,
    bias: torch.Tensor,
    expert_indices: torch.Tensor,
    expert_weights: torch.Tensor,
    num_experts: int,
    top_k: int,
    normalize: bool = True,
    scaling_factor: float = 1.0,
) -> None:
    """Sigmoid-scored top-k; bias is added before scoring (Nemotron-H style).

    Forwards every argument positionally to the compiled
    ``moe_topk_sigmoid`` op, converting ``normalize`` with ``int()`` on the
    way. Any value returned by the op is discarded.
    """
    kernel = ops.moe_topk_sigmoid
    # The op receives the boolean flag as an integer.
    normalize_flag = int(normalize)
    kernel(
        gate_logits,
        bias,
        expert_indices,
        expert_weights,
        num_experts,
        top_k,
        normalize_flag,
        scaling_factor,
    )
78
+
79
+
80
def moe_permute_tokens(
    hidden_states: torch.Tensor,
    permuted: torch.Tensor,
    sorted_token_ids: torch.Tensor,
) -> None:
    """Forward to the compiled ``moe_permute_tokens`` op.

    The three tensors are passed through positionally and unchanged; any
    value returned by the op is discarded.

    NOTE(review): ``permuted`` is presumably the destination buffer for the
    reordered rows -- confirm against the op schema.
    """
    kernel = ops.moe_permute_tokens
    kernel(hidden_states, permuted, sorted_token_ids)
86
+
87
+
88
def moe_silu_mul(
    gate: torch.Tensor,
    up: torch.Tensor,
    output: torch.Tensor,
) -> None:
    """Forward to the compiled ``moe_silu_mul`` op.

    The three tensors are passed through positionally and unchanged; any
    value returned by the op is discarded.

    NOTE(review): by its name the op likely computes ``silu(gate) * up``
    into ``output`` -- confirm against the kernel source.
    """
    kernel = ops.moe_silu_mul
    kernel(gate, up, output)
90
+
91
+
92
def moe_w4a16_ptrtable_t_k64(
    A: torch.Tensor,
    B_packed_ptrs: torch.Tensor,
    B_scale_ptrs: torch.Tensor,
    scale2_vals: torch.Tensor,
    C: torch.Tensor,
    expert_offsets: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    N: int,
    K: int,
    max_m_tiles: int,
) -> None:
    """Forward to the compiled ``moe_w4a16_ptrtable_t_k64`` op.

    ``max_m_tiles`` is the number of M-tiles needed by the largest expert,
    i.e. ``ceil(max_expert_rows / 64)``. It bounds the grid Y dimension;
    the kernel early-exits per-expert when offsets are met.

    Every argument is passed through positionally and unchanged; any value
    returned by the op is discarded.
    """
    kernel = ops.moe_w4a16_ptrtable_t_k64
    kernel(
        A,
        B_packed_ptrs,
        B_scale_ptrs,
        scale2_vals,
        C,
        expert_offsets,
        sorted_token_ids,
        N,
        K,
        max_m_tiles,
    )
109
+
110
+
111
def moe_w4a16_fused_gate_up_t_k64(
    A: torch.Tensor,
    gate_packed_ptrs: torch.Tensor,
    gate_scale_ptrs: torch.Tensor,
    gate_scale2_vals: torch.Tensor,
    up_packed_ptrs: torch.Tensor,
    up_scale_ptrs: torch.Tensor,
    up_scale2_vals: torch.Tensor,
    C_gate: torch.Tensor,
    C_up: torch.Tensor,
    expert_offsets: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    N: int,
    K: int,
    max_m_tiles: int,
) -> None:
    """Fused gate+up grouped GEMM.

    Grid X is ``ceil(2*N/128)``: gate and up tiles are interleaved so the
    read of ``A`` is amortized over both projections.

    Every argument is passed through positionally and unchanged to the
    compiled ``moe_w4a16_fused_gate_up_t_k64`` op; any value returned by the
    op is discarded.
    """
    kernel = ops.moe_w4a16_fused_gate_up_t_k64
    gate_args = (gate_packed_ptrs, gate_scale_ptrs, gate_scale2_vals)
    up_args = (up_packed_ptrs, up_scale_ptrs, up_scale2_vals)
    kernel(
        A,
        *gate_args,
        *up_args,
        C_gate,
        C_up,
        expert_offsets,
        sorted_token_ids,
        N,
        K,
        max_m_tiles,
    )
build/torch211-cxx11-cu130-aarch64-linux/_nvfp4_moe_cuda_61b571c.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc96b2bfeec43e532e535fcfd2ffc3332894c1f972981dfea514b1c4412e7d8c
3
+ size 2865576
build/torch211-cxx11-cu130-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _nvfp4_moe_cuda_61b571c
3
+ ops = torch.ops._nvfp4_moe_cuda_61b571c
4
+
5
def add_op_namespace_prefix(op_name: str):
    """Return ``op_name`` qualified with this build's op namespace.

    The result has the form ``"_nvfp4_moe_cuda_61b571c::<op_name>"``.
    """
    namespace = "_nvfp4_moe_cuda_61b571c"
    return "{}::{}".format(namespace, op_name)
build/torch211-cxx11-cu130-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "nvfp4-moe",
3
+ "id": "_nvfp4_moe_cuda_61b571c",
4
+ "version": 0,
5
+ "license": "AGPL-3.0-only",
6
+ "python-depends": [],
7
+ "backend": {
8
+ "type": "cuda"
9
+ }
10
+ }
build/torch211-cxx11-cu130-aarch64-linux/nvfp4_moe/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Import the Python file at ``file_path`` under a path-derived module name.

    The module is registered in ``sys.modules`` before it is executed so that
    relative imports inside the loaded file can resolve against it.

    Args:
        file_path: Location of the source file to load.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec (or no loader) can be built for
            ``file_path``.
        Exception: Anything raised while executing the module body is
            propagated unchanged, after the partially initialised module has
            been removed from ``sys.modules``.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # A spec may exist without a loader; fail with a clear ImportError rather
    # than an AttributeError on `None.exec_module`.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    # `module_from_spec` never returns None (it raises on failure), so no
    # extra None check is needed here.
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Mirror the regular import system: do not leave a half-initialised
        # module behind when its body fails to execute.
        sys.modules.pop(module_name, None)
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch212-cxx11-cu130-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: AGPL-3.0-only
2
+ """Atlas NVFP4 MoE kernels for Qwen3.6-35B-A3B sparse on GB10 (SM121).
3
+
4
+ Centerpiece is ``moe_w4a16_fused_gate_up_t_k64`` — the fused gate/up
5
+ NVFP4 grouped GEMM that amortizes one activation (A) read across both
6
+ projections. The K=64 tile suffix targets Qwen3.6's
7
+ hidden_dim=2048 / moe_intermediate=512 layout.
8
+
9
+ All ops use Atlas's software E2M1 conversion (SM121 lacks the
10
+ ``cvt.rn.satfinite.e2m1x2.f32`` PTX instruction).
11
+ """
12
+
13
+ import torch
14
+
15
+ from ._ops import ops
16
+
17
+ __all__ = [
18
+ "quantize_bf16_to_nvfp4",
19
+ "moe_gate_topk_fused",
20
+ "moe_topk_softmax",
21
+ "moe_topk_sigmoid",
22
+ "moe_permute_tokens",
23
+ "moe_silu_mul",
24
+ "moe_w4a16_ptrtable_t_k64",
25
+ "moe_w4a16_fused_gate_up_t_k64",
26
+ ]
27
+
28
+
29
def quantize_bf16_to_nvfp4(
    input: torch.Tensor,
    out_packed: torch.Tensor,
    out_scales: torch.Tensor,
    per_tensor_scale: float,
) -> None:
    """Forward to the compiled ``quantize_bf16_to_nvfp4`` op.

    All four arguments are passed through positionally and unchanged; any
    value returned by the op is discarded.

    NOTE(review): ``out_packed`` / ``out_scales`` look like caller-allocated
    output buffers filled by the kernel -- confirm against the op schema.
    """
    kernel = ops.quantize_bf16_to_nvfp4
    kernel(input, out_packed, out_scales, per_tensor_scale)
36
+
37
+
38
def moe_gate_topk_fused(
    activation: torch.Tensor,
    gate_weight_packed: torch.Tensor,
    gate_weight_scales: torch.Tensor,
    scale2: float,
    expert_indices: torch.Tensor,
    expert_weights: torch.Tensor,
    top_k: int,
    normalize: bool = True,
) -> None:
    """Forward to the compiled ``moe_gate_topk_fused`` op.

    Arguments are passed through positionally in declaration order, except
    that ``normalize`` is converted with ``int()`` first. Any value returned
    by the op is discarded.

    NOTE(review): ``expert_indices`` / ``expert_weights`` are presumably
    output buffers written by the kernel -- confirm against the op schema.
    """
    kernel = ops.moe_gate_topk_fused
    # The op receives the boolean flag as an integer.
    normalize_flag = int(normalize)
    kernel(
        activation,
        gate_weight_packed,
        gate_weight_scales,
        scale2,
        expert_indices,
        expert_weights,
        top_k,
        normalize_flag,
    )
51
+
52
+
53
def moe_topk_softmax(
    gate_logits: torch.Tensor,
    expert_indices: torch.Tensor,
    expert_weights: torch.Tensor,
    num_experts: int,
    top_k: int,
    normalize: bool = True,
) -> None:
    """Forward to the compiled ``moe_topk_softmax`` op.

    Arguments are passed through positionally in declaration order, with
    ``normalize`` converted via ``int()``. Any value returned by the op is
    discarded.

    NOTE(review): ``expert_indices`` / ``expert_weights`` are presumably
    output buffers written by the kernel -- confirm against the op schema.
    """
    kernel = ops.moe_topk_softmax
    # The op receives the boolean flag as an integer.
    normalize_flag = int(normalize)
    kernel(
        gate_logits,
        expert_indices,
        expert_weights,
        num_experts,
        top_k,
        normalize_flag,
    )
63
+
64
+
65
def moe_topk_sigmoid(
    gate_logits: torch.Tensor,
    bias: torch.Tensor,
    expert_indices: torch.Tensor,
    expert_weights: torch.Tensor,
    num_experts: int,
    top_k: int,
    normalize: bool = True,
    scaling_factor: float = 1.0,
) -> None:
    """Sigmoid-scored top-k; bias is added before scoring (Nemotron-H style).

    Forwards every argument positionally to the compiled
    ``moe_topk_sigmoid`` op, converting ``normalize`` with ``int()`` on the
    way. Any value returned by the op is discarded.
    """
    kernel = ops.moe_topk_sigmoid
    # The op receives the boolean flag as an integer.
    normalize_flag = int(normalize)
    kernel(
        gate_logits,
        bias,
        expert_indices,
        expert_weights,
        num_experts,
        top_k,
        normalize_flag,
        scaling_factor,
    )
78
+
79
+
80
def moe_permute_tokens(
    hidden_states: torch.Tensor,
    permuted: torch.Tensor,
    sorted_token_ids: torch.Tensor,
) -> None:
    """Forward to the compiled ``moe_permute_tokens`` op.

    The three tensors are passed through positionally and unchanged; any
    value returned by the op is discarded.

    NOTE(review): ``permuted`` is presumably the destination buffer for the
    reordered rows -- confirm against the op schema.
    """
    kernel = ops.moe_permute_tokens
    kernel(hidden_states, permuted, sorted_token_ids)
86
+
87
+
88
def moe_silu_mul(
    gate: torch.Tensor,
    up: torch.Tensor,
    output: torch.Tensor,
) -> None:
    """Forward to the compiled ``moe_silu_mul`` op.

    The three tensors are passed through positionally and unchanged; any
    value returned by the op is discarded.

    NOTE(review): by its name the op likely computes ``silu(gate) * up``
    into ``output`` -- confirm against the kernel source.
    """
    kernel = ops.moe_silu_mul
    kernel(gate, up, output)
90
+
91
+
92
def moe_w4a16_ptrtable_t_k64(
    A: torch.Tensor,
    B_packed_ptrs: torch.Tensor,
    B_scale_ptrs: torch.Tensor,
    scale2_vals: torch.Tensor,
    C: torch.Tensor,
    expert_offsets: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    N: int,
    K: int,
    max_m_tiles: int,
) -> None:
    """Forward to the compiled ``moe_w4a16_ptrtable_t_k64`` op.

    ``max_m_tiles`` is the number of M-tiles needed by the largest expert,
    i.e. ``ceil(max_expert_rows / 64)``. It bounds the grid Y dimension;
    the kernel early-exits per-expert when offsets are met.

    Every argument is passed through positionally and unchanged; any value
    returned by the op is discarded.
    """
    kernel = ops.moe_w4a16_ptrtable_t_k64
    kernel(
        A,
        B_packed_ptrs,
        B_scale_ptrs,
        scale2_vals,
        C,
        expert_offsets,
        sorted_token_ids,
        N,
        K,
        max_m_tiles,
    )
109
+
110
+
111
def moe_w4a16_fused_gate_up_t_k64(
    A: torch.Tensor,
    gate_packed_ptrs: torch.Tensor,
    gate_scale_ptrs: torch.Tensor,
    gate_scale2_vals: torch.Tensor,
    up_packed_ptrs: torch.Tensor,
    up_scale_ptrs: torch.Tensor,
    up_scale2_vals: torch.Tensor,
    C_gate: torch.Tensor,
    C_up: torch.Tensor,
    expert_offsets: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    N: int,
    K: int,
    max_m_tiles: int,
) -> None:
    """Fused gate+up grouped GEMM.

    Grid X is ``ceil(2*N/128)``: gate and up tiles are interleaved so the
    read of ``A`` is amortized over both projections.

    Every argument is passed through positionally and unchanged to the
    compiled ``moe_w4a16_fused_gate_up_t_k64`` op; any value returned by the
    op is discarded.
    """
    kernel = ops.moe_w4a16_fused_gate_up_t_k64
    gate_args = (gate_packed_ptrs, gate_scale_ptrs, gate_scale2_vals)
    up_args = (up_packed_ptrs, up_scale_ptrs, up_scale2_vals)
    kernel(
        A,
        *gate_args,
        *up_args,
        C_gate,
        C_up,
        expert_offsets,
        sorted_token_ids,
        N,
        K,
        max_m_tiles,
    )
build/torch212-cxx11-cu130-aarch64-linux/_nvfp4_moe_cuda_61b571c.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c8e1ab3a370805356cb8f32fb06e7c4755d7fb3389238390b009c125ea09f93
3
+ size 2865600
build/torch212-cxx11-cu130-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _nvfp4_moe_cuda_61b571c
3
+ ops = torch.ops._nvfp4_moe_cuda_61b571c
4
+
5
def add_op_namespace_prefix(op_name: str):
    """Return ``op_name`` qualified with this build's op namespace.

    The result has the form ``"_nvfp4_moe_cuda_61b571c::<op_name>"``.
    """
    namespace = "_nvfp4_moe_cuda_61b571c"
    return "{}::{}".format(namespace, op_name)
build/torch212-cxx11-cu130-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "nvfp4-moe",
3
+ "id": "_nvfp4_moe_cuda_61b571c",
4
+ "version": 0,
5
+ "license": "AGPL-3.0-only",
6
+ "python-depends": [],
7
+ "backend": {
8
+ "type": "cuda"
9
+ }
10
+ }
build/torch212-cxx11-cu130-aarch64-linux/nvfp4_moe/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Import the Python file at ``file_path`` under a path-derived module name.

    The module is registered in ``sys.modules`` before it is executed so that
    relative imports inside the loaded file can resolve against it.

    Args:
        file_path: Location of the source file to load.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec (or no loader) can be built for
            ``file_path``.
        Exception: Anything raised while executing the module body is
            propagated unchanged, after the partially initialised module has
            been removed from ``sys.modules``.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # A spec may exist without a loader; fail with a clear ImportError rather
    # than an AttributeError on `None.exec_module`.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    # `module_from_spec` never returns None (it raises on failure), so no
    # extra None check is needed here.
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Mirror the regular import system: do not leave a half-initialised
        # module behind when its body fails to execute.
        sys.modules.pop(module_name, None)
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
build/torch212-cxx11-cu132-aarch64-linux/__init__.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: AGPL-3.0-only
2
+ """Atlas NVFP4 MoE kernels for Qwen3.6-35B-A3B sparse on GB10 (SM121).
3
+
4
+ Centerpiece is ``moe_w4a16_fused_gate_up_t_k64`` — the fused gate/up
5
+ NVFP4 grouped GEMM that amortizes one activation (A) read across both
6
+ projections. The K=64 tile suffix targets Qwen3.6's
7
+ hidden_dim=2048 / moe_intermediate=512 layout.
8
+
9
+ All ops use Atlas's software E2M1 conversion (SM121 lacks the
10
+ ``cvt.rn.satfinite.e2m1x2.f32`` PTX instruction).
11
+ """
12
+
13
+ import torch
14
+
15
+ from ._ops import ops
16
+
17
+ __all__ = [
18
+ "quantize_bf16_to_nvfp4",
19
+ "moe_gate_topk_fused",
20
+ "moe_topk_softmax",
21
+ "moe_topk_sigmoid",
22
+ "moe_permute_tokens",
23
+ "moe_silu_mul",
24
+ "moe_w4a16_ptrtable_t_k64",
25
+ "moe_w4a16_fused_gate_up_t_k64",
26
+ ]
27
+
28
+
29
def quantize_bf16_to_nvfp4(
    input: torch.Tensor,
    out_packed: torch.Tensor,
    out_scales: torch.Tensor,
    per_tensor_scale: float,
) -> None:
    """Forward to the compiled ``quantize_bf16_to_nvfp4`` op.

    All four arguments are passed through positionally and unchanged; any
    value returned by the op is discarded.

    NOTE(review): ``out_packed`` / ``out_scales`` look like caller-allocated
    output buffers filled by the kernel -- confirm against the op schema.
    """
    kernel = ops.quantize_bf16_to_nvfp4
    kernel(input, out_packed, out_scales, per_tensor_scale)
36
+
37
+
38
def moe_gate_topk_fused(
    activation: torch.Tensor,
    gate_weight_packed: torch.Tensor,
    gate_weight_scales: torch.Tensor,
    scale2: float,
    expert_indices: torch.Tensor,
    expert_weights: torch.Tensor,
    top_k: int,
    normalize: bool = True,
) -> None:
    """Forward to the compiled ``moe_gate_topk_fused`` op.

    Arguments are passed through positionally in declaration order, except
    that ``normalize`` is converted with ``int()`` first. Any value returned
    by the op is discarded.

    NOTE(review): ``expert_indices`` / ``expert_weights`` are presumably
    output buffers written by the kernel -- confirm against the op schema.
    """
    kernel = ops.moe_gate_topk_fused
    # The op receives the boolean flag as an integer.
    normalize_flag = int(normalize)
    kernel(
        activation,
        gate_weight_packed,
        gate_weight_scales,
        scale2,
        expert_indices,
        expert_weights,
        top_k,
        normalize_flag,
    )
51
+
52
+
53
def moe_topk_softmax(
    gate_logits: torch.Tensor,
    expert_indices: torch.Tensor,
    expert_weights: torch.Tensor,
    num_experts: int,
    top_k: int,
    normalize: bool = True,
) -> None:
    """Forward to the compiled ``moe_topk_softmax`` op.

    Arguments are passed through positionally in declaration order, with
    ``normalize`` converted via ``int()``. Any value returned by the op is
    discarded.

    NOTE(review): ``expert_indices`` / ``expert_weights`` are presumably
    output buffers written by the kernel -- confirm against the op schema.
    """
    kernel = ops.moe_topk_softmax
    # The op receives the boolean flag as an integer.
    normalize_flag = int(normalize)
    kernel(
        gate_logits,
        expert_indices,
        expert_weights,
        num_experts,
        top_k,
        normalize_flag,
    )
63
+
64
+
65
def moe_topk_sigmoid(
    gate_logits: torch.Tensor,
    bias: torch.Tensor,
    expert_indices: torch.Tensor,
    expert_weights: torch.Tensor,
    num_experts: int,
    top_k: int,
    normalize: bool = True,
    scaling_factor: float = 1.0,
) -> None:
    """Sigmoid-scored top-k; bias is added before scoring (Nemotron-H style).

    Forwards every argument positionally to the compiled
    ``moe_topk_sigmoid`` op, converting ``normalize`` with ``int()`` on the
    way. Any value returned by the op is discarded.
    """
    kernel = ops.moe_topk_sigmoid
    # The op receives the boolean flag as an integer.
    normalize_flag = int(normalize)
    kernel(
        gate_logits,
        bias,
        expert_indices,
        expert_weights,
        num_experts,
        top_k,
        normalize_flag,
        scaling_factor,
    )
78
+
79
+
80
def moe_permute_tokens(
    hidden_states: torch.Tensor,
    permuted: torch.Tensor,
    sorted_token_ids: torch.Tensor,
) -> None:
    """Forward to the compiled ``moe_permute_tokens`` op.

    The three tensors are passed through positionally and unchanged; any
    value returned by the op is discarded.

    NOTE(review): ``permuted`` is presumably the destination buffer for the
    reordered rows -- confirm against the op schema.
    """
    kernel = ops.moe_permute_tokens
    kernel(hidden_states, permuted, sorted_token_ids)
86
+
87
+
88
def moe_silu_mul(
    gate: torch.Tensor,
    up: torch.Tensor,
    output: torch.Tensor,
) -> None:
    """Forward to the compiled ``moe_silu_mul`` op.

    The three tensors are passed through positionally and unchanged; any
    value returned by the op is discarded.

    NOTE(review): by its name the op likely computes ``silu(gate) * up``
    into ``output`` -- confirm against the kernel source.
    """
    kernel = ops.moe_silu_mul
    kernel(gate, up, output)
90
+
91
+
92
def moe_w4a16_ptrtable_t_k64(
    A: torch.Tensor,
    B_packed_ptrs: torch.Tensor,
    B_scale_ptrs: torch.Tensor,
    scale2_vals: torch.Tensor,
    C: torch.Tensor,
    expert_offsets: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    N: int,
    K: int,
    max_m_tiles: int,
) -> None:
    """Forward to the compiled ``moe_w4a16_ptrtable_t_k64`` op.

    ``max_m_tiles`` is the number of M-tiles needed by the largest expert,
    i.e. ``ceil(max_expert_rows / 64)``. It bounds the grid Y dimension;
    the kernel early-exits per-expert when offsets are met.

    Every argument is passed through positionally and unchanged; any value
    returned by the op is discarded.
    """
    kernel = ops.moe_w4a16_ptrtable_t_k64
    kernel(
        A,
        B_packed_ptrs,
        B_scale_ptrs,
        scale2_vals,
        C,
        expert_offsets,
        sorted_token_ids,
        N,
        K,
        max_m_tiles,
    )
109
+
110
+
111
def moe_w4a16_fused_gate_up_t_k64(
    A: torch.Tensor,
    gate_packed_ptrs: torch.Tensor,
    gate_scale_ptrs: torch.Tensor,
    gate_scale2_vals: torch.Tensor,
    up_packed_ptrs: torch.Tensor,
    up_scale_ptrs: torch.Tensor,
    up_scale2_vals: torch.Tensor,
    C_gate: torch.Tensor,
    C_up: torch.Tensor,
    expert_offsets: torch.Tensor,
    sorted_token_ids: torch.Tensor,
    N: int,
    K: int,
    max_m_tiles: int,
) -> None:
    """Fused gate+up grouped GEMM.

    Grid X is ``ceil(2*N/128)``: gate and up tiles are interleaved so the
    read of ``A`` is amortized over both projections.

    Every argument is passed through positionally and unchanged to the
    compiled ``moe_w4a16_fused_gate_up_t_k64`` op; any value returned by the
    op is discarded.
    """
    kernel = ops.moe_w4a16_fused_gate_up_t_k64
    gate_args = (gate_packed_ptrs, gate_scale_ptrs, gate_scale2_vals)
    up_args = (up_packed_ptrs, up_scale_ptrs, up_scale2_vals)
    kernel(
        A,
        *gate_args,
        *up_args,
        C_gate,
        C_up,
        expert_offsets,
        sorted_token_ids,
        N,
        K,
        max_m_tiles,
    )
build/torch212-cxx11-cu132-aarch64-linux/_nvfp4_moe_cuda_61b571c.abi3.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91483795cdafc1b51ac51d3465f8cd32e98ef2b8834db4590a2e3d5f84a7bfc8
3
+ size 2865600
build/torch212-cxx11-cu132-aarch64-linux/_ops.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from . import _nvfp4_moe_cuda_61b571c
3
+ ops = torch.ops._nvfp4_moe_cuda_61b571c
4
+
5
def add_op_namespace_prefix(op_name: str):
    """Return ``op_name`` qualified with this build's op namespace.

    The result has the form ``"_nvfp4_moe_cuda_61b571c::<op_name>"``.
    """
    namespace = "_nvfp4_moe_cuda_61b571c"
    return "{}::{}".format(namespace, op_name)
build/torch212-cxx11-cu132-aarch64-linux/metadata.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "nvfp4-moe",
3
+ "id": "_nvfp4_moe_cuda_61b571c",
4
+ "version": 0,
5
+ "license": "AGPL-3.0-only",
6
+ "python-depends": [],
7
+ "backend": {
8
+ "type": "cuda"
9
+ }
10
+ }
build/torch212-cxx11-cu132-aarch64-linux/nvfp4_moe/__init__.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ctypes
2
+ import importlib.util
3
+ import sys
4
+ from pathlib import Path
5
+ from types import ModuleType
6
+
7
+
8
def _import_from_path(file_path: Path) -> ModuleType:
    """Import the Python file at ``file_path`` under a path-derived module name.

    The module is registered in ``sys.modules`` before it is executed so that
    relative imports inside the loaded file can resolve against it.

    Args:
        file_path: Location of the source file to load.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec (or no loader) can be built for
            ``file_path``.
        Exception: Anything raised while executing the module body is
            propagated unchanged, after the partially initialised module has
            been removed from ``sys.modules``.
    """
    # We cannot use the module name as-is, after adding it to `sys.modules`,
    # it would also be used for other imports. So, we make a module name that
    # depends on the path for it to be unique using the hex-encoded hash of
    # the path.
    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    module_name = path_hash
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    # A spec may exist without a loader; fail with a clear ImportError rather
    # than an AttributeError on `None.exec_module`.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
    # `module_from_spec` never returns None (it raises on failure), so no
    # extra None check is needed here.
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Mirror the regular import system: do not leave a half-initialised
        # module behind when its body fails to execute.
        sys.modules.pop(module_name, None)
        raise
    return module
24
+
25
+
26
+ globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))