nologik committed 3 days ago

Commit

64de0bd

verified ·

1 Parent(s): 057d8be

Uploaded using `kernel-builder`.

Browse files

Files changed (20) hide show

build/torch210-cxx11-cu130-aarch64-linux/__init__.py +167 -0
build/torch210-cxx11-cu130-aarch64-linux/_gdn_cuda_bcba8ad.abi3.so +3 -0
build/torch210-cxx11-cu130-aarch64-linux/_ops.py +9 -0
build/torch210-cxx11-cu130-aarch64-linux/gdn/__init__.py +26 -0
build/torch210-cxx11-cu130-aarch64-linux/metadata.json +10 -0
build/torch211-cxx11-cu130-aarch64-linux/__init__.py +167 -0
build/torch211-cxx11-cu130-aarch64-linux/_gdn_cuda_bcba8ad.abi3.so +3 -0
build/torch211-cxx11-cu130-aarch64-linux/_ops.py +9 -0
build/torch211-cxx11-cu130-aarch64-linux/gdn/__init__.py +26 -0
build/torch211-cxx11-cu130-aarch64-linux/metadata.json +10 -0
build/torch212-cxx11-cu130-aarch64-linux/__init__.py +167 -0
build/torch212-cxx11-cu130-aarch64-linux/_gdn_cuda_bcba8ad.abi3.so +3 -0
build/torch212-cxx11-cu130-aarch64-linux/_ops.py +9 -0
build/torch212-cxx11-cu130-aarch64-linux/gdn/__init__.py +26 -0
build/torch212-cxx11-cu130-aarch64-linux/metadata.json +10 -0
build/torch212-cxx11-cu132-aarch64-linux/__init__.py +167 -0
build/torch212-cxx11-cu132-aarch64-linux/_gdn_cuda_bcba8ad.abi3.so +3 -0
build/torch212-cxx11-cu132-aarch64-linux/_ops.py +9 -0
build/torch212-cxx11-cu132-aarch64-linux/gdn/__init__.py +26 -0
build/torch212-cxx11-cu132-aarch64-linux/metadata.json +10 -0

build/torch210-cxx11-cu130-aarch64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,167 @@

+# SPDX-License-Identifier: AGPL-3.0-only
+"""Atlas Gated DeltaNet kernels for NVIDIA GB10 (SM121).
+These kernels back the linear-attention path of Qwen3.6 hybrid models
+(27B dense and 35B-A3B sparse). They are hand-tuned for the unified
+LPDDR5X memory layout of the DGX Spark and pinned to compute
+capability 12.1 — they will not load on any other GPU.
+"""
+from typing import Optional
+import torch
+from ._ops import ops
+__all__ = [
+    "gdn_decode",
+    "gdn_prefill",
+    "gdn_chunk2",
+    "gdn_chunk3",
+    "gdn_wy2",
+    "gdn_wy3",
+    "gdn_wy4",
+    "causal_conv1d_fwd",
+    "causal_conv1d_update",
+]
+def gdn_decode(
+    h_state: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    gate: torch.Tensor,
+    beta: torch.Tensor,
+    output: torch.Tensor,
+) -> None:
+    """Single-token GDN decode (in-place update of ``h_state`` and ``output``).
+
+    Thin wrapper: every argument is forwarded positionally, unchanged, to
+    ``ops.gdn_decode``. Nothing is validated on the Python side, so the
+    layouts listed below are the contract the caller has to satisfy.
+
+    The recurrent path keeps Q/K/V in FP32 to avoid the precision drift
+    that BF16 inputs cause over long contexts in hybrid models.
+
+    Shapes
+    ------
+    h_state : (B, num_v_heads, k_dim, v_dim) float32, in-place updated
+    query   : (B, num_k_heads, k_dim)        float32
+    key     : (B, num_k_heads, k_dim)        float32
+    value   : (B, num_v_heads, v_dim)        float32
+    gate    : (B, num_v_heads)               float32  (exp(g_t) decay)
+    beta    : (B, num_v_heads)               float32  (sigmoid(b_t))
+    output  : (B, num_v_heads, v_dim)        bfloat16, in-place written
+
+    Returns
+    -------
+    None. Results are delivered through ``h_state`` and ``output``.
+    """
+    ops.gdn_decode(h_state, query, key, value, gate, beta, output)
+def gdn_prefill(
+    h_state: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    gate: torch.Tensor,
+    beta: torch.Tensor,
+    output: torch.Tensor,
+) -> None:
+    """Multi-token GDN prefill (one batch, one chunk).
+
+    Thin wrapper: all arguments are forwarded positionally, unchanged, to
+    ``ops.gdn_prefill``; nothing is validated or returned on the Python side.
+    NOTE(review): ``h_state`` and ``output`` are presumably written in place,
+    as in ``gdn_decode`` — confirm against the kernel.
+
+    Shapes
+    ------
+    h_state : (B, num_v_heads, k_dim, v_dim) float32
+    query   : (B, seq_len, num_k_heads, k_dim) bfloat16
+    key     : (B, seq_len, num_k_heads, k_dim) bfloat16
+    value   : (B, seq_len, num_v_heads, v_dim) bfloat16
+    gate    : (B, seq_len, num_v_heads)        float32
+    beta    : (B, seq_len, num_v_heads)        float32
+    output  : (B, seq_len, num_v_heads, v_dim) bfloat16
+    """
+    ops.gdn_prefill(h_state, query, key, value, gate, beta, output)
+def gdn_chunk2(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor, h_state_intermediate: torch.Tensor,
+) -> None:
+    """K=2 chunkwise verify (MTP draft length 1).
+
+    Writes the intermediate state after token 0 to ``h_state_intermediate``
+    so the caller can roll back when token 1 is rejected.
+
+    Thin wrapper: all eight tensors are forwarded positionally, unchanged,
+    to ``ops.gdn_chunk2``; nothing is validated or returned here.
+    TODO(review): tensor shapes and dtypes are not documented in this
+    wrapper — confirm them against the kernel source.
+    """
+    ops.gdn_chunk2(h_state, query, key, value, gate, beta, output, h_state_intermediate)
+def gdn_chunk3(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor,
+    h_state_inter0: torch.Tensor, h_state_inter1: torch.Tensor,
+) -> None:
+    """K=3 chunkwise verify (MTP draft length 2).
+
+    Thin wrapper: all nine tensors are forwarded positionally, unchanged,
+    to ``ops.gdn_chunk3``; nothing is validated or returned here.
+    NOTE(review): ``h_state_inter0`` / ``h_state_inter1`` presumably receive
+    the states after token 0 and token 1 for rollback, mirroring
+    ``h_state_intermediate`` in ``gdn_chunk2`` — confirm against the kernel.
+    """
+    ops.gdn_chunk3(h_state, query, key, value, gate, beta, output,
+                   h_state_inter0, h_state_inter1)
+def gdn_wy2(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor, h_state_intermediate: torch.Tensor,
+) -> None:
+    """2-pass WY-chunkwise K=2 verify (replaces chunk2 at higher acceptance).
+
+    Takes the same eight tensors, in the same order, as ``gdn_chunk2`` and
+    forwards them positionally, unchanged, to ``ops.gdn_wy2``; nothing is
+    validated or returned here.
+    TODO(review): tensor shapes and dtypes are not documented in this
+    wrapper — confirm them against the kernel source.
+    """
+    ops.gdn_wy2(h_state, query, key, value, gate, beta, output, h_state_intermediate)
+def gdn_wy3(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor, h_state_inter0: torch.Tensor, h_state_inter1: torch.Tensor,
+) -> None:
+    """2-pass WY-chunkwise K=3 verify.
+
+    Takes the same nine tensors, in the same order, as ``gdn_chunk3`` and
+    forwards them positionally, unchanged, to ``ops.gdn_wy3``; nothing is
+    validated or returned here.
+    NOTE(review): ``h_state_inter0`` / ``h_state_inter1`` presumably receive
+    the per-token intermediate states for rollback — confirm against the
+    kernel.
+    """
+    ops.gdn_wy3(h_state, query, key, value, gate, beta, output,
+                h_state_inter0, h_state_inter1)
+def gdn_wy4(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor,
+    h_state_inter0: torch.Tensor,
+    h_state_inter1: torch.Tensor,
+    h_state_inter2: torch.Tensor,
+) -> None:
+    """2-pass WY-chunkwise K=4 verify.
+
+    Thin wrapper: all ten tensors are forwarded positionally, unchanged,
+    to ``ops.gdn_wy4``; nothing is validated or returned here.
+    NOTE(review): ``h_state_inter0`` .. ``h_state_inter2`` presumably receive
+    the states after tokens 0, 1 and 2 for rollback — confirm against the
+    kernel.
+    """
+    ops.gdn_wy4(h_state, query, key, value, gate, beta, output,
+                h_state_inter0, h_state_inter1, h_state_inter2)
+def causal_conv1d_fwd(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    out: torch.Tensor,
+) -> None:
+    """Depthwise causal Conv1d forward (used by the SSM input projection).
+
+    Thin wrapper: the four arguments are forwarded positionally, unchanged
+    (including a ``None`` bias), to ``ops.causal_conv1d_fwd``. Nothing is
+    returned; ``out`` is presumably the destination buffer — TODO confirm.
+
+    x      : (B, D, L) bfloat16
+    weight : (D, d_conv) bfloat16
+    bias   : (D,) float32 or None
+    out    : (B, D, L) bfloat16
+    """
+    ops.causal_conv1d_fwd(x, weight, bias, out)
+def causal_conv1d_update(
+    conv_state: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    out: torch.Tensor,
+) -> None:
+    """Single-step causal Conv1d update (single-token decode path).
+
+    Thin wrapper: the five arguments are forwarded positionally, unchanged
+    (including a ``None`` bias), to ``ops.causal_conv1d_update``. Nothing is
+    returned; ``out`` is presumably the destination buffer — TODO confirm.
+
+    conv_state : (B, D, d_conv) float32, in-place updated (rolled left, last slot = x)
+    x          : (B, D) bfloat16  (new input)
+    weight     : (D, d_conv) bfloat16
+    bias       : (D,) float32 or None
+    out        : (B, D) bfloat16
+    """
+    ops.causal_conv1d_update(conv_state, x, weight, bias, out)

build/torch210-cxx11-cu130-aarch64-linux/_gdn_cuda_bcba8ad.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c28c8e0849b7ebff1a86d47b56b8036b08c654b94f7489f080d5d5b98ac3091
+size 3242624

build/torch210-cxx11-cu130-aarch64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _gdn_cuda_bcba8ad
+ops = torch.ops._gdn_cuda_bcba8ad
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_gdn_cuda_bcba8ad::<op_name>``.
+    """
+    qualified = "_gdn_cuda_bcba8ad::{op_name}".format(op_name=op_name)
+    return qualified

build/torch210-cxx11-cu130-aarch64-linux/gdn/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Load the Python file at ``file_path`` as a module and return it.
+
+    The module is registered in ``sys.modules`` under a name derived from
+    the hash of the absolute path, not under the file's own name, so it
+    cannot shadow or be shadowed by a regular import of the same name.
+
+    Raises
+    ------
+    ImportError
+        If no import spec, or no loader, can be built for ``file_path``.
+
+    Any exception raised while executing the module's code propagates
+    unchanged, after the partially initialised module has been removed
+    from ``sys.modules`` (mirroring what a regular ``import`` does).
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    # `hash()` can be negative; `c_size_t` wraps it to an unsigned value so
+    # the hex form carries no sign.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    # A spec without a loader cannot be executed; report it as an import
+    # failure instead of an AttributeError further down.
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Register before executing so the module can be found by name while
+    # its own code is running.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module behind for later lookups.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+# Re-export every module-level name of the parent directory's `__init__.py`
+# from this package by copying that module's namespace into ours.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch210-cxx11-cu130-aarch64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "name": "gdn",
+  "id": "_gdn_cuda_bcba8ad",
+  "version": 0,
+  "license": "AGPL-3.0-only",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda"
+  }
+}

build/torch211-cxx11-cu130-aarch64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,167 @@

+# SPDX-License-Identifier: AGPL-3.0-only
+"""Atlas Gated DeltaNet kernels for NVIDIA GB10 (SM121).
+These kernels back the linear-attention path of Qwen3.6 hybrid models
+(27B dense and 35B-A3B sparse). They are hand-tuned for the unified
+LPDDR5X memory layout of the DGX Spark and pinned to compute
+capability 12.1 — they will not load on any other GPU.
+"""
+from typing import Optional
+import torch
+from ._ops import ops
+__all__ = [
+    "gdn_decode",
+    "gdn_prefill",
+    "gdn_chunk2",
+    "gdn_chunk3",
+    "gdn_wy2",
+    "gdn_wy3",
+    "gdn_wy4",
+    "causal_conv1d_fwd",
+    "causal_conv1d_update",
+]
+def gdn_decode(
+    h_state: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    gate: torch.Tensor,
+    beta: torch.Tensor,
+    output: torch.Tensor,
+) -> None:
+    """Single-token GDN decode (in-place update of ``h_state`` and ``output``).
+
+    Thin wrapper: every argument is forwarded positionally, unchanged, to
+    ``ops.gdn_decode``. Nothing is validated on the Python side, so the
+    layouts listed below are the contract the caller has to satisfy.
+
+    The recurrent path keeps Q/K/V in FP32 to avoid the precision drift
+    that BF16 inputs cause over long contexts in hybrid models.
+
+    Shapes
+    ------
+    h_state : (B, num_v_heads, k_dim, v_dim) float32, in-place updated
+    query   : (B, num_k_heads, k_dim)        float32
+    key     : (B, num_k_heads, k_dim)        float32
+    value   : (B, num_v_heads, v_dim)        float32
+    gate    : (B, num_v_heads)               float32  (exp(g_t) decay)
+    beta    : (B, num_v_heads)               float32  (sigmoid(b_t))
+    output  : (B, num_v_heads, v_dim)        bfloat16, in-place written
+
+    Returns
+    -------
+    None. Results are delivered through ``h_state`` and ``output``.
+    """
+    ops.gdn_decode(h_state, query, key, value, gate, beta, output)
+def gdn_prefill(
+    h_state: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    gate: torch.Tensor,
+    beta: torch.Tensor,
+    output: torch.Tensor,
+) -> None:
+    """Multi-token GDN prefill (one batch, one chunk).
+
+    Thin wrapper: all arguments are forwarded positionally, unchanged, to
+    ``ops.gdn_prefill``; nothing is validated or returned on the Python side.
+    NOTE(review): ``h_state`` and ``output`` are presumably written in place,
+    as in ``gdn_decode`` — confirm against the kernel.
+
+    Shapes
+    ------
+    h_state : (B, num_v_heads, k_dim, v_dim) float32
+    query   : (B, seq_len, num_k_heads, k_dim) bfloat16
+    key     : (B, seq_len, num_k_heads, k_dim) bfloat16
+    value   : (B, seq_len, num_v_heads, v_dim) bfloat16
+    gate    : (B, seq_len, num_v_heads)        float32
+    beta    : (B, seq_len, num_v_heads)        float32
+    output  : (B, seq_len, num_v_heads, v_dim) bfloat16
+    """
+    ops.gdn_prefill(h_state, query, key, value, gate, beta, output)
+def gdn_chunk2(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor, h_state_intermediate: torch.Tensor,
+) -> None:
+    """K=2 chunkwise verify (MTP draft length 1).
+
+    Writes the intermediate state after token 0 to ``h_state_intermediate``
+    so the caller can roll back when token 1 is rejected.
+
+    Thin wrapper: all eight tensors are forwarded positionally, unchanged,
+    to ``ops.gdn_chunk2``; nothing is validated or returned here.
+    TODO(review): tensor shapes and dtypes are not documented in this
+    wrapper — confirm them against the kernel source.
+    """
+    ops.gdn_chunk2(h_state, query, key, value, gate, beta, output, h_state_intermediate)
+def gdn_chunk3(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor,
+    h_state_inter0: torch.Tensor, h_state_inter1: torch.Tensor,
+) -> None:
+    """K=3 chunkwise verify (MTP draft length 2).
+
+    Thin wrapper: all nine tensors are forwarded positionally, unchanged,
+    to ``ops.gdn_chunk3``; nothing is validated or returned here.
+    NOTE(review): ``h_state_inter0`` / ``h_state_inter1`` presumably receive
+    the states after token 0 and token 1 for rollback, mirroring
+    ``h_state_intermediate`` in ``gdn_chunk2`` — confirm against the kernel.
+    """
+    ops.gdn_chunk3(h_state, query, key, value, gate, beta, output,
+                   h_state_inter0, h_state_inter1)
+def gdn_wy2(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor, h_state_intermediate: torch.Tensor,
+) -> None:
+    """2-pass WY-chunkwise K=2 verify (replaces chunk2 at higher acceptance).
+
+    Takes the same eight tensors, in the same order, as ``gdn_chunk2`` and
+    forwards them positionally, unchanged, to ``ops.gdn_wy2``; nothing is
+    validated or returned here.
+    TODO(review): tensor shapes and dtypes are not documented in this
+    wrapper — confirm them against the kernel source.
+    """
+    ops.gdn_wy2(h_state, query, key, value, gate, beta, output, h_state_intermediate)
+def gdn_wy3(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor, h_state_inter0: torch.Tensor, h_state_inter1: torch.Tensor,
+) -> None:
+    """2-pass WY-chunkwise K=3 verify.
+
+    Takes the same nine tensors, in the same order, as ``gdn_chunk3`` and
+    forwards them positionally, unchanged, to ``ops.gdn_wy3``; nothing is
+    validated or returned here.
+    NOTE(review): ``h_state_inter0`` / ``h_state_inter1`` presumably receive
+    the per-token intermediate states for rollback — confirm against the
+    kernel.
+    """
+    ops.gdn_wy3(h_state, query, key, value, gate, beta, output,
+                h_state_inter0, h_state_inter1)
+def gdn_wy4(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor,
+    h_state_inter0: torch.Tensor,
+    h_state_inter1: torch.Tensor,
+    h_state_inter2: torch.Tensor,
+) -> None:
+    """2-pass WY-chunkwise K=4 verify.
+
+    Thin wrapper: all ten tensors are forwarded positionally, unchanged,
+    to ``ops.gdn_wy4``; nothing is validated or returned here.
+    NOTE(review): ``h_state_inter0`` .. ``h_state_inter2`` presumably receive
+    the states after tokens 0, 1 and 2 for rollback — confirm against the
+    kernel.
+    """
+    ops.gdn_wy4(h_state, query, key, value, gate, beta, output,
+                h_state_inter0, h_state_inter1, h_state_inter2)
+def causal_conv1d_fwd(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    out: torch.Tensor,
+) -> None:
+    """Depthwise causal Conv1d forward (used by the SSM input projection).
+
+    Thin wrapper: the four arguments are forwarded positionally, unchanged
+    (including a ``None`` bias), to ``ops.causal_conv1d_fwd``. Nothing is
+    returned; ``out`` is presumably the destination buffer — TODO confirm.
+
+    x      : (B, D, L) bfloat16
+    weight : (D, d_conv) bfloat16
+    bias   : (D,) float32 or None
+    out    : (B, D, L) bfloat16
+    """
+    ops.causal_conv1d_fwd(x, weight, bias, out)
+def causal_conv1d_update(
+    conv_state: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    out: torch.Tensor,
+) -> None:
+    """Single-step causal Conv1d update (single-token decode path).
+
+    Thin wrapper: the five arguments are forwarded positionally, unchanged
+    (including a ``None`` bias), to ``ops.causal_conv1d_update``. Nothing is
+    returned; ``out`` is presumably the destination buffer — TODO confirm.
+
+    conv_state : (B, D, d_conv) float32, in-place updated (rolled left, last slot = x)
+    x          : (B, D) bfloat16  (new input)
+    weight     : (D, d_conv) bfloat16
+    bias       : (D,) float32 or None
+    out        : (B, D) bfloat16
+    """
+    ops.causal_conv1d_update(conv_state, x, weight, bias, out)

build/torch211-cxx11-cu130-aarch64-linux/_gdn_cuda_bcba8ad.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:92cdec32c4ef029a154eeb37ee7e889d00144c82496e59699c76175dd5df4084
+size 3242624

build/torch211-cxx11-cu130-aarch64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _gdn_cuda_bcba8ad
+ops = torch.ops._gdn_cuda_bcba8ad
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_gdn_cuda_bcba8ad::<op_name>``.
+    """
+    qualified = "_gdn_cuda_bcba8ad::{op_name}".format(op_name=op_name)
+    return qualified

build/torch211-cxx11-cu130-aarch64-linux/gdn/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Load the Python file at ``file_path`` as a module and return it.
+
+    The module is registered in ``sys.modules`` under a name derived from
+    the hash of the absolute path, not under the file's own name, so it
+    cannot shadow or be shadowed by a regular import of the same name.
+
+    Raises
+    ------
+    ImportError
+        If no import spec, or no loader, can be built for ``file_path``.
+
+    Any exception raised while executing the module's code propagates
+    unchanged, after the partially initialised module has been removed
+    from ``sys.modules`` (mirroring what a regular ``import`` does).
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    # `hash()` can be negative; `c_size_t` wraps it to an unsigned value so
+    # the hex form carries no sign.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    # A spec without a loader cannot be executed; report it as an import
+    # failure instead of an AttributeError further down.
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Register before executing so the module can be found by name while
+    # its own code is running.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module behind for later lookups.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+# Re-export every module-level name of the parent directory's `__init__.py`
+# from this package by copying that module's namespace into ours.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch211-cxx11-cu130-aarch64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "name": "gdn",
+  "id": "_gdn_cuda_bcba8ad",
+  "version": 0,
+  "license": "AGPL-3.0-only",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda"
+  }
+}

build/torch212-cxx11-cu130-aarch64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,167 @@

+# SPDX-License-Identifier: AGPL-3.0-only
+"""Atlas Gated DeltaNet kernels for NVIDIA GB10 (SM121).
+These kernels back the linear-attention path of Qwen3.6 hybrid models
+(27B dense and 35B-A3B sparse). They are hand-tuned for the unified
+LPDDR5X memory layout of the DGX Spark and pinned to compute
+capability 12.1 — they will not load on any other GPU.
+"""
+from typing import Optional
+import torch
+from ._ops import ops
+__all__ = [
+    "gdn_decode",
+    "gdn_prefill",
+    "gdn_chunk2",
+    "gdn_chunk3",
+    "gdn_wy2",
+    "gdn_wy3",
+    "gdn_wy4",
+    "causal_conv1d_fwd",
+    "causal_conv1d_update",
+]
+def gdn_decode(
+    h_state: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    gate: torch.Tensor,
+    beta: torch.Tensor,
+    output: torch.Tensor,
+) -> None:
+    """Single-token GDN decode (in-place update of ``h_state`` and ``output``).
+
+    Thin wrapper: every argument is forwarded positionally, unchanged, to
+    ``ops.gdn_decode``. Nothing is validated on the Python side, so the
+    layouts listed below are the contract the caller has to satisfy.
+
+    The recurrent path keeps Q/K/V in FP32 to avoid the precision drift
+    that BF16 inputs cause over long contexts in hybrid models.
+
+    Shapes
+    ------
+    h_state : (B, num_v_heads, k_dim, v_dim) float32, in-place updated
+    query   : (B, num_k_heads, k_dim)        float32
+    key     : (B, num_k_heads, k_dim)        float32
+    value   : (B, num_v_heads, v_dim)        float32
+    gate    : (B, num_v_heads)               float32  (exp(g_t) decay)
+    beta    : (B, num_v_heads)               float32  (sigmoid(b_t))
+    output  : (B, num_v_heads, v_dim)        bfloat16, in-place written
+
+    Returns
+    -------
+    None. Results are delivered through ``h_state`` and ``output``.
+    """
+    ops.gdn_decode(h_state, query, key, value, gate, beta, output)
+def gdn_prefill(
+    h_state: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    gate: torch.Tensor,
+    beta: torch.Tensor,
+    output: torch.Tensor,
+) -> None:
+    """Multi-token GDN prefill (one batch, one chunk).
+
+    Thin wrapper: all arguments are forwarded positionally, unchanged, to
+    ``ops.gdn_prefill``; nothing is validated or returned on the Python side.
+    NOTE(review): ``h_state`` and ``output`` are presumably written in place,
+    as in ``gdn_decode`` — confirm against the kernel.
+
+    Shapes
+    ------
+    h_state : (B, num_v_heads, k_dim, v_dim) float32
+    query   : (B, seq_len, num_k_heads, k_dim) bfloat16
+    key     : (B, seq_len, num_k_heads, k_dim) bfloat16
+    value   : (B, seq_len, num_v_heads, v_dim) bfloat16
+    gate    : (B, seq_len, num_v_heads)        float32
+    beta    : (B, seq_len, num_v_heads)        float32
+    output  : (B, seq_len, num_v_heads, v_dim) bfloat16
+    """
+    ops.gdn_prefill(h_state, query, key, value, gate, beta, output)
+def gdn_chunk2(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor, h_state_intermediate: torch.Tensor,
+) -> None:
+    """K=2 chunkwise verify (MTP draft length 1).
+
+    Writes the intermediate state after token 0 to ``h_state_intermediate``
+    so the caller can roll back when token 1 is rejected.
+
+    Thin wrapper: all eight tensors are forwarded positionally, unchanged,
+    to ``ops.gdn_chunk2``; nothing is validated or returned here.
+    TODO(review): tensor shapes and dtypes are not documented in this
+    wrapper — confirm them against the kernel source.
+    """
+    ops.gdn_chunk2(h_state, query, key, value, gate, beta, output, h_state_intermediate)
+def gdn_chunk3(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor,
+    h_state_inter0: torch.Tensor, h_state_inter1: torch.Tensor,
+) -> None:
+    """K=3 chunkwise verify (MTP draft length 2).
+
+    Thin wrapper: all nine tensors are forwarded positionally, unchanged,
+    to ``ops.gdn_chunk3``; nothing is validated or returned here.
+    NOTE(review): ``h_state_inter0`` / ``h_state_inter1`` presumably receive
+    the states after token 0 and token 1 for rollback, mirroring
+    ``h_state_intermediate`` in ``gdn_chunk2`` — confirm against the kernel.
+    """
+    ops.gdn_chunk3(h_state, query, key, value, gate, beta, output,
+                   h_state_inter0, h_state_inter1)
+def gdn_wy2(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor, h_state_intermediate: torch.Tensor,
+) -> None:
+    """2-pass WY-chunkwise K=2 verify (replaces chunk2 at higher acceptance).
+
+    Takes the same eight tensors, in the same order, as ``gdn_chunk2`` and
+    forwards them positionally, unchanged, to ``ops.gdn_wy2``; nothing is
+    validated or returned here.
+    TODO(review): tensor shapes and dtypes are not documented in this
+    wrapper — confirm them against the kernel source.
+    """
+    ops.gdn_wy2(h_state, query, key, value, gate, beta, output, h_state_intermediate)
+def gdn_wy3(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor, h_state_inter0: torch.Tensor, h_state_inter1: torch.Tensor,
+) -> None:
+    """2-pass WY-chunkwise K=3 verify.
+
+    Takes the same nine tensors, in the same order, as ``gdn_chunk3`` and
+    forwards them positionally, unchanged, to ``ops.gdn_wy3``; nothing is
+    validated or returned here.
+    NOTE(review): ``h_state_inter0`` / ``h_state_inter1`` presumably receive
+    the per-token intermediate states for rollback — confirm against the
+    kernel.
+    """
+    ops.gdn_wy3(h_state, query, key, value, gate, beta, output,
+                h_state_inter0, h_state_inter1)
+def gdn_wy4(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor,
+    h_state_inter0: torch.Tensor,
+    h_state_inter1: torch.Tensor,
+    h_state_inter2: torch.Tensor,
+) -> None:
+    """2-pass WY-chunkwise K=4 verify.
+
+    Thin wrapper: all ten tensors are forwarded positionally, unchanged,
+    to ``ops.gdn_wy4``; nothing is validated or returned here.
+    NOTE(review): ``h_state_inter0`` .. ``h_state_inter2`` presumably receive
+    the states after tokens 0, 1 and 2 for rollback — confirm against the
+    kernel.
+    """
+    ops.gdn_wy4(h_state, query, key, value, gate, beta, output,
+                h_state_inter0, h_state_inter1, h_state_inter2)
+def causal_conv1d_fwd(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    out: torch.Tensor,
+) -> None:
+    """Depthwise causal Conv1d forward (used by the SSM input projection).
+
+    Thin wrapper: the four arguments are forwarded positionally, unchanged
+    (including a ``None`` bias), to ``ops.causal_conv1d_fwd``. Nothing is
+    returned; ``out`` is presumably the destination buffer — TODO confirm.
+
+    x      : (B, D, L) bfloat16
+    weight : (D, d_conv) bfloat16
+    bias   : (D,) float32 or None
+    out    : (B, D, L) bfloat16
+    """
+    ops.causal_conv1d_fwd(x, weight, bias, out)
+def causal_conv1d_update(
+    conv_state: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    out: torch.Tensor,
+) -> None:
+    """Single-step causal Conv1d update (single-token decode path).
+
+    Thin wrapper: the five arguments are forwarded positionally, unchanged
+    (including a ``None`` bias), to ``ops.causal_conv1d_update``. Nothing is
+    returned; ``out`` is presumably the destination buffer — TODO confirm.
+
+    conv_state : (B, D, d_conv) float32, in-place updated (rolled left, last slot = x)
+    x          : (B, D) bfloat16  (new input)
+    weight     : (D, d_conv) bfloat16
+    bias       : (D,) float32 or None
+    out        : (B, D) bfloat16
+    """
+    ops.causal_conv1d_update(conv_state, x, weight, bias, out)

build/torch212-cxx11-cu130-aarch64-linux/_gdn_cuda_bcba8ad.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4696f54be4ebace851150e8a68f405417453a03f1f94d359721c8141b908df20
+size 3242648

build/torch212-cxx11-cu130-aarch64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _gdn_cuda_bcba8ad
+ops = torch.ops._gdn_cuda_bcba8ad
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_gdn_cuda_bcba8ad::<op_name>``.
+    """
+    qualified = "_gdn_cuda_bcba8ad::{op_name}".format(op_name=op_name)
+    return qualified

build/torch212-cxx11-cu130-aarch64-linux/gdn/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Load the Python file at ``file_path`` as a module and return it.
+
+    The module is registered in ``sys.modules`` under a name derived from
+    the hash of the absolute path, not under the file's own name, so it
+    cannot shadow or be shadowed by a regular import of the same name.
+
+    Raises
+    ------
+    ImportError
+        If no import spec, or no loader, can be built for ``file_path``.
+
+    Any exception raised while executing the module's code propagates
+    unchanged, after the partially initialised module has been removed
+    from ``sys.modules`` (mirroring what a regular ``import`` does).
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    # `hash()` can be negative; `c_size_t` wraps it to an unsigned value so
+    # the hex form carries no sign.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    # A spec without a loader cannot be executed; report it as an import
+    # failure instead of an AttributeError further down.
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Register before executing so the module can be found by name while
+    # its own code is running.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module behind for later lookups.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+# Re-export every module-level name of the parent directory's `__init__.py`
+# from this package by copying that module's namespace into ours.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch212-cxx11-cu130-aarch64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "name": "gdn",
+  "id": "_gdn_cuda_bcba8ad",
+  "version": 0,
+  "license": "AGPL-3.0-only",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda"
+  }
+}

build/torch212-cxx11-cu132-aarch64-linux/__init__.py ADDED Viewed

	@@ -0,0 +1,167 @@

+# SPDX-License-Identifier: AGPL-3.0-only
+"""Atlas Gated DeltaNet kernels for NVIDIA GB10 (SM121).
+These kernels back the linear-attention path of Qwen3.6 hybrid models
+(27B dense and 35B-A3B sparse). They are hand-tuned for the unified
+LPDDR5X memory layout of the DGX Spark and pinned to compute
+capability 12.1 — they will not load on any other GPU.
+"""
+from typing import Optional
+import torch
+from ._ops import ops
+__all__ = [
+    "gdn_decode",
+    "gdn_prefill",
+    "gdn_chunk2",
+    "gdn_chunk3",
+    "gdn_wy2",
+    "gdn_wy3",
+    "gdn_wy4",
+    "causal_conv1d_fwd",
+    "causal_conv1d_update",
+]
+def gdn_decode(
+    h_state: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    gate: torch.Tensor,
+    beta: torch.Tensor,
+    output: torch.Tensor,
+) -> None:
+    """Single-token GDN decode (in-place update of ``h_state`` and ``output``).
+
+    Thin wrapper: every argument is forwarded positionally, unchanged, to
+    ``ops.gdn_decode``. Nothing is validated on the Python side, so the
+    layouts listed below are the contract the caller has to satisfy.
+
+    The recurrent path keeps Q/K/V in FP32 to avoid the precision drift
+    that BF16 inputs cause over long contexts in hybrid models.
+
+    Shapes
+    ------
+    h_state : (B, num_v_heads, k_dim, v_dim) float32, in-place updated
+    query   : (B, num_k_heads, k_dim)        float32
+    key     : (B, num_k_heads, k_dim)        float32
+    value   : (B, num_v_heads, v_dim)        float32
+    gate    : (B, num_v_heads)               float32  (exp(g_t) decay)
+    beta    : (B, num_v_heads)               float32  (sigmoid(b_t))
+    output  : (B, num_v_heads, v_dim)        bfloat16, in-place written
+
+    Returns
+    -------
+    None. Results are delivered through ``h_state`` and ``output``.
+    """
+    ops.gdn_decode(h_state, query, key, value, gate, beta, output)
+def gdn_prefill(
+    h_state: torch.Tensor,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    gate: torch.Tensor,
+    beta: torch.Tensor,
+    output: torch.Tensor,
+) -> None:
+    """Multi-token GDN prefill (one batch, one chunk).
+
+    Thin wrapper: all arguments are forwarded positionally, unchanged, to
+    ``ops.gdn_prefill``; nothing is validated or returned on the Python side.
+    NOTE(review): ``h_state`` and ``output`` are presumably written in place,
+    as in ``gdn_decode`` — confirm against the kernel.
+
+    Shapes
+    ------
+    h_state : (B, num_v_heads, k_dim, v_dim) float32
+    query   : (B, seq_len, num_k_heads, k_dim) bfloat16
+    key     : (B, seq_len, num_k_heads, k_dim) bfloat16
+    value   : (B, seq_len, num_v_heads, v_dim) bfloat16
+    gate    : (B, seq_len, num_v_heads)        float32
+    beta    : (B, seq_len, num_v_heads)        float32
+    output  : (B, seq_len, num_v_heads, v_dim) bfloat16
+    """
+    ops.gdn_prefill(h_state, query, key, value, gate, beta, output)
+def gdn_chunk2(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor, h_state_intermediate: torch.Tensor,
+) -> None:
+    """K=2 chunkwise verify (MTP draft length 1).
+
+    Writes the intermediate state after token 0 to ``h_state_intermediate``
+    so the caller can roll back when token 1 is rejected.
+
+    Thin wrapper: all eight tensors are forwarded positionally, unchanged,
+    to ``ops.gdn_chunk2``; nothing is validated or returned here.
+    TODO(review): tensor shapes and dtypes are not documented in this
+    wrapper — confirm them against the kernel source.
+    """
+    ops.gdn_chunk2(h_state, query, key, value, gate, beta, output, h_state_intermediate)
+def gdn_chunk3(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor,
+    h_state_inter0: torch.Tensor, h_state_inter1: torch.Tensor,
+) -> None:
+    """K=3 chunkwise verify (MTP draft length 2).
+
+    Thin wrapper: all nine tensors are forwarded positionally, unchanged,
+    to ``ops.gdn_chunk3``; nothing is validated or returned here.
+    NOTE(review): ``h_state_inter0`` / ``h_state_inter1`` presumably receive
+    the states after token 0 and token 1 for rollback, mirroring
+    ``h_state_intermediate`` in ``gdn_chunk2`` — confirm against the kernel.
+    """
+    ops.gdn_chunk3(h_state, query, key, value, gate, beta, output,
+                   h_state_inter0, h_state_inter1)
+def gdn_wy2(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor, h_state_intermediate: torch.Tensor,
+) -> None:
+    """2-pass WY-chunkwise K=2 verify (replaces chunk2 at higher acceptance).
+
+    Takes the same eight tensors, in the same order, as ``gdn_chunk2`` and
+    forwards them positionally, unchanged, to ``ops.gdn_wy2``; nothing is
+    validated or returned here.
+    TODO(review): tensor shapes and dtypes are not documented in this
+    wrapper — confirm them against the kernel source.
+    """
+    ops.gdn_wy2(h_state, query, key, value, gate, beta, output, h_state_intermediate)
+def gdn_wy3(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor, h_state_inter0: torch.Tensor, h_state_inter1: torch.Tensor,
+) -> None:
+    """2-pass WY-chunkwise K=3 verify.
+
+    Takes the same nine tensors, in the same order, as ``gdn_chunk3`` and
+    forwards them positionally, unchanged, to ``ops.gdn_wy3``; nothing is
+    validated or returned here.
+    NOTE(review): ``h_state_inter0`` / ``h_state_inter1`` presumably receive
+    the per-token intermediate states for rollback — confirm against the
+    kernel.
+    """
+    ops.gdn_wy3(h_state, query, key, value, gate, beta, output,
+                h_state_inter0, h_state_inter1)
+def gdn_wy4(
+    h_state: torch.Tensor, query: torch.Tensor, key: torch.Tensor,
+    value: torch.Tensor, gate: torch.Tensor, beta: torch.Tensor,
+    output: torch.Tensor,
+    h_state_inter0: torch.Tensor,
+    h_state_inter1: torch.Tensor,
+    h_state_inter2: torch.Tensor,
+) -> None:
+    """2-pass WY-chunkwise K=4 verify.
+
+    Thin wrapper: all ten tensors are forwarded positionally, unchanged,
+    to ``ops.gdn_wy4``; nothing is validated or returned here.
+    NOTE(review): ``h_state_inter0`` .. ``h_state_inter2`` presumably receive
+    the states after tokens 0, 1 and 2 for rollback — confirm against the
+    kernel.
+    """
+    ops.gdn_wy4(h_state, query, key, value, gate, beta, output,
+                h_state_inter0, h_state_inter1, h_state_inter2)
+def causal_conv1d_fwd(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    out: torch.Tensor,
+) -> None:
+    """Depthwise causal Conv1d forward (used by the SSM input projection).
+
+    Thin wrapper: the four arguments are forwarded positionally, unchanged
+    (including a ``None`` bias), to ``ops.causal_conv1d_fwd``. Nothing is
+    returned; ``out`` is presumably the destination buffer — TODO confirm.
+
+    x      : (B, D, L) bfloat16
+    weight : (D, d_conv) bfloat16
+    bias   : (D,) float32 or None
+    out    : (B, D, L) bfloat16
+    """
+    ops.causal_conv1d_fwd(x, weight, bias, out)
+def causal_conv1d_update(
+    conv_state: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    out: torch.Tensor,
+) -> None:
+    """Single-step causal Conv1d update (single-token decode path).
+
+    Thin wrapper: the five arguments are forwarded positionally, unchanged
+    (including a ``None`` bias), to ``ops.causal_conv1d_update``. Nothing is
+    returned; ``out`` is presumably the destination buffer — TODO confirm.
+
+    conv_state : (B, D, d_conv) float32, in-place updated (rolled left, last slot = x)
+    x          : (B, D) bfloat16  (new input)
+    weight     : (D, d_conv) bfloat16
+    bias       : (D,) float32 or None
+    out        : (B, D) bfloat16
+    """
+    ops.causal_conv1d_update(conv_state, x, weight, bias, out)

build/torch212-cxx11-cu132-aarch64-linux/_gdn_cuda_bcba8ad.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:32e0475d1a711df63bf51e0cbfa413e10989afde5e11d43f80855972bf48c91c
+size 3373720

build/torch212-cxx11-cu132-aarch64-linux/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _gdn_cuda_bcba8ad
+ops = torch.ops._gdn_cuda_bcba8ad
+def add_op_namespace_prefix(op_name: str):
+    """Return ``op_name`` qualified with this build's op namespace.
+
+    The result has the form ``_gdn_cuda_bcba8ad::<op_name>``.
+    """
+    qualified = "_gdn_cuda_bcba8ad::{op_name}".format(op_name=op_name)
+    return qualified

build/torch212-cxx11-cu132-aarch64-linux/gdn/__init__.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+def _import_from_path(file_path: Path) -> ModuleType:
+    """Load the Python file at ``file_path`` as a module and return it.
+
+    The module is registered in ``sys.modules`` under a name derived from
+    the hash of the absolute path, not under the file's own name, so it
+    cannot shadow or be shadowed by a regular import of the same name.
+
+    Raises
+    ------
+    ImportError
+        If no import spec, or no loader, can be built for ``file_path``.
+
+    Any exception raised while executing the module's code propagates
+    unchanged, after the partially initialised module has been removed
+    from ``sys.modules`` (mirroring what a regular ``import`` does).
+    """
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    # `hash()` can be negative; `c_size_t` wraps it to an unsigned value so
+    # the hex form carries no sign.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    # A spec without a loader cannot be executed; report it as an import
+    # failure instead of an AttributeError further down.
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    # Register before executing so the module can be found by name while
+    # its own code is running.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module behind for later lookups.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+# Re-export every module-level name of the parent directory's `__init__.py`
+# from this package by copying that module's namespace into ours.
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))

build/torch212-cxx11-cu132-aarch64-linux/metadata.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "name": "gdn",
+  "id": "_gdn_cuda_bcba8ad",
+  "version": 0,
+  "license": "AGPL-3.0-only",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda"
+  }
+}