Upload flash_attn_stub/flash_attn/__init__.py with huggingface_hub
flash_attn_stub/flash_attn/__init__.py
CHANGED

@@ -1,13 +1,191 @@
-"""flash_attn stub –
-def

"""flash_attn stub – implements the flash attention API using torch SDPA.

This replaces the real flash_attn package on systems where it cannot be compiled
(e.g. ZeroGPU with PyTorch 2.10+cu128 and no matching wheel).
All functions accept the same signatures as flash_attn 2.x and delegate to
torch.nn.functional.scaled_dot_product_attention. Arguments such as dropout_p,
window_size, softcap, alibi_slopes and deterministic are accepted for API
compatibility but ignored.
"""
import torch
import torch.nn.functional as F

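
# Deployment note (an assumption about how the Space wires this stub in, not
# something the file itself enforces): for `import flash_attn` to resolve to
# this stub, the directory containing this `flash_attn/` package, e.g.
# `flash_attn_stub/`, must come first on sys.path:
#
#     import sys
#     sys.path.insert(0, "flash_attn_stub")  # hypothetical path
#     import flash_attn
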
def _sdpa(q, k, v, causal=False, softmax_scale=None):
    """Apply SDPA. q/k/v are (B, H, L, D)."""
    return F.scaled_dot_product_attention(
        q, k, v,
        is_causal=causal,
        scale=softmax_scale,
    )

# ---------- non-varlen ----------

def flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False,
                    window_size=(-1, -1), softcap=0.0, alibi_slopes=None,
                    deterministic=False, return_attn_probs=False):
    """q/k/v: (B, L, H, D) -> out: (B, L, H, D)"""
    # Permute to (B, H, L, D) for SDPA
    q2 = q.transpose(1, 2)
    k2 = k.transpose(1, 2)
    v2 = v.transpose(1, 2)
    out = _sdpa(q2, k2, v2, causal=causal, softmax_scale=softmax_scale)
    out = out.transpose(1, 2)  # back to (B, L, H, D)
    return out

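# Example call, following the flash_attn 2.x shape convention (illustrative
# shapes, not from the original file):
#
#     q = k = v = torch.randn(2, 128, 8, 64)
#     out = flash_attn_func(q, k, v, causal=True)  # out: (2, 128, 8, 64)
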
+
def flash_attn_qkvpacked_func(qkv, dropout_p=0.0, softmax_scale=None, causal=False,
|
| 37 |
+
window_size=(-1, -1), softcap=0.0, alibi_slopes=None,
|
| 38 |
+
deterministic=False, return_attn_probs=False):
|
| 39 |
+
"""qkv: (B, L, 3, H, D) -> out: (B, L, H, D)"""
|
| 40 |
+
q, k, v = qkv.unbind(dim=2)
|
| 41 |
+
return flash_attn_func(q, k, v, dropout_p=dropout_p, softmax_scale=softmax_scale,
|
| 42 |
+
causal=causal)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def flash_attn_kvpacked_func(q, kv, dropout_p=0.0, softmax_scale=None, causal=False,
|
| 46 |
+
window_size=(-1, -1), softcap=0.0, alibi_slopes=None,
|
| 47 |
+
deterministic=False, return_attn_probs=False):
|
| 48 |
+
"""q: (B, Lq, H, D), kv: (B, Lk, 2, H, D) -> out: (B, Lq, H, D)"""
|
| 49 |
+
k, v = kv.unbind(dim=2)
|
| 50 |
+
return flash_attn_func(q, k, v, dropout_p=dropout_p, softmax_scale=softmax_scale,
|
| 51 |
+
causal=causal)
|
| 52 |
+
|
| 53 |
+
|
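# Packing sketch (illustrative): the packed variants expect the stacked
# layouts they unbind above, i.e.
#
#     qkv = torch.stack([q, k, v], dim=2)  # (B, L, 3, H, D)
#     kv = torch.stack([k, v], dim=2)      # (B, L, 2, H, D)
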
# ---------- varlen ----------

def _varlen_sdpa(q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k,
                 causal=False, softmax_scale=None):
    """
    q: (total_q, H, D), k: (total_k, H, D), v: (total_k, H, D)
    cu_seqlens_q/k: (batch+1,) int32 cumulative sequence lengths
    Returns: (total_q, H, D)
    Assumes q/k/v share the same head count H (no MQA/GQA).
    """
    batch = cu_seqlens_q.shape[0] - 1
    H = q.shape[1]
    D = q.shape[2]

    # Fast path: all seqlens are equal (common case)
    cu_q = cu_seqlens_q.tolist()
    cu_k = cu_seqlens_k.tolist()

    all_equal = True
    sq0 = cu_q[1] - cu_q[0]
    sk0 = cu_k[1] - cu_k[0]
    for i in range(1, batch):
        if cu_q[i + 1] - cu_q[i] != sq0 or cu_k[i + 1] - cu_k[i] != sk0:
            all_equal = False
            break

    if all_equal and sq0 == max_seqlen_q and sk0 == max_seqlen_k:
        # Reshape directly – no padding needed
        q2 = q.reshape(batch, sq0, H, D).transpose(1, 2)  # (B, H, Lq, D)
        k2 = k.reshape(batch, sk0, H, D).transpose(1, 2)
        v2 = v.reshape(batch, sk0, H, D).transpose(1, 2)
        out = _sdpa(q2, k2, v2, causal=causal, softmax_scale=softmax_scale)
        return out.transpose(1, 2).reshape(-1, H, D)

    # Slow path: unequal lengths – pad, compute, then gather
    q_padded = q.new_zeros(batch, max_seqlen_q, H, D)
    k_padded = k.new_zeros(batch, max_seqlen_k, H, D)
    v_padded = v.new_zeros(batch, max_seqlen_k, H, D)

    for i in range(batch):
        sq = cu_q[i + 1] - cu_q[i]
        sk = cu_k[i + 1] - cu_k[i]
        q_padded[i, :sq] = q[cu_q[i]:cu_q[i + 1]]
        k_padded[i, :sk] = k[cu_k[i]:cu_k[i + 1]]
        v_padded[i, :sk] = v[cu_k[i]:cu_k[i + 1]]

    # Build a float attention bias that masks out key padding: -inf for
    # invalid positions. Query padding rows are left unmasked – their outputs
    # are discarded in the gather step below.
    k_mask = torch.arange(max_seqlen_k, device=k.device).unsqueeze(0)  # (1, Lk)
    k_lens = torch.tensor([cu_k[i + 1] - cu_k[i] for i in range(batch)],
                          device=k.device).unsqueeze(1)  # (B, 1)
    # (B, 1, 1, Lk) – True where valid
    attn_mask = (k_mask < k_lens).unsqueeze(1).unsqueeze(2)
    attn_bias = torch.zeros(batch, 1, max_seqlen_q, max_seqlen_k,
                            device=q.device, dtype=q.dtype)
    attn_bias.masked_fill_(~attn_mask, float('-inf'))

    if causal:
        causal_mask = torch.triu(
            torch.ones(max_seqlen_q, max_seqlen_k, device=q.device, dtype=torch.bool),
            diagonal=1,
        )
        attn_bias.masked_fill_(causal_mask.unsqueeze(0).unsqueeze(0), float('-inf'))

    q2 = q_padded.transpose(1, 2)  # (B, H, Lq, D)
    k2 = k_padded.transpose(1, 2)
    v2 = v_padded.transpose(1, 2)

    out = F.scaled_dot_product_attention(q2, k2, v2, attn_mask=attn_bias,
                                         scale=softmax_scale)
    out = out.transpose(1, 2)  # (B, Lq, H, D)

    # Gather results back to packed format
    parts = []
    for i in range(batch):
        sq = cu_q[i + 1] - cu_q[i]
        parts.append(out[i, :sq])  # (sq, H, D)
    return torch.cat(parts, dim=0)

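# cu_seqlens sketch (illustrative): two sequences of lengths 3 and 5 packed
# along dim 0 give total_q = 8 and
#
#     cu_seqlens_q = torch.tensor([0, 3, 8], dtype=torch.int32)
#
# so sequence i occupies rows cu_seqlens_q[i]:cu_seqlens_q[i + 1] of q.
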
def flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_k,
                           max_seqlen_q, max_seqlen_k,
                           dropout_p=0.0, softmax_scale=None, causal=False,
                           window_size=(-1, -1), softcap=0.0, alibi_slopes=None,
                           deterministic=False, return_attn_probs=False,
                           block_table=None):
    """q/k/v: (total, H, D) -> out: (total_q, H, D)"""
    return _varlen_sdpa(q, k, v, cu_seqlens_q, cu_seqlens_k,
                        max_seqlen_q, max_seqlen_k,
                        causal=causal, softmax_scale=softmax_scale)


def flash_attn_varlen_qkvpacked_func(qkv, cu_seqlens, max_seqlen,
                                     dropout_p=0.0, softmax_scale=None, causal=False,
                                     window_size=(-1, -1), softcap=0.0,
                                     alibi_slopes=None, deterministic=False,
                                     return_attn_probs=False):
    """qkv: (total, 3, H, D) -> out: (total, H, D)"""
    q, k, v = qkv.unbind(dim=1)
    return _varlen_sdpa(q, k, v, cu_seqlens, cu_seqlens,
                        max_seqlen, max_seqlen,
                        causal=causal, softmax_scale=softmax_scale)


def flash_attn_varlen_kvpacked_func(q, kv, cu_seqlens_q, cu_seqlens_k,
                                    max_seqlen_q, max_seqlen_k,
                                    dropout_p=0.0, softmax_scale=None, causal=False,
                                    window_size=(-1, -1), softcap=0.0,
                                    alibi_slopes=None, deterministic=False,
                                    return_attn_probs=False):
    """q: (total_q, H, D), kv: (total_k, 2, H, D) -> out: (total_q, H, D)"""
    k, v = kv.unbind(dim=1)
    return _varlen_sdpa(q, k, v, cu_seqlens_q, cu_seqlens_k,
                        max_seqlen_q, max_seqlen_k,
                        causal=causal, softmax_scale=softmax_scale)

# ---------- with_kvcache (used by some SAM2 code paths) ----------

def flash_attn_with_kvcache(q, k_cache, v_cache, k=None, v=None,
                            rotary_cos=None, rotary_sin=None,
                            cache_seqlens=None, cache_batch_idx=None,
                            block_table=None, softmax_scale=None, causal=False,
                            window_size=(-1, -1), softcap=0.0,
                            rotary_interleaved=True, alibi_slopes=None,
                            num_splits=0, return_softmax_lse=False):
    """Simplified kv-cache attention fallback.

    Assumes k_cache/v_cache hold exactly the valid entries: cache_seqlens,
    rotary and paging arguments are accepted but ignored, and the cache is
    not updated in place.
    """
    # Combine current k/v with cache if provided
    if k is not None:
        k_full = torch.cat([k_cache, k], dim=1)
        v_full = torch.cat([v_cache, v], dim=1)
    else:
        k_full = k_cache
        v_full = v_cache
    return flash_attn_func(q, k_full, v_full, softmax_scale=softmax_scale, causal=causal)
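

# Minimal smoke test sketch (illustrative; shapes and the float32 CPU
# tolerance are assumptions): verifies flash_attn_func against a direct
# SDPA call on random inputs.
if __name__ == "__main__":
    B, L, H, D = 2, 16, 4, 32
    q = torch.randn(B, L, H, D)
    k = torch.randn(B, L, H, D)
    v = torch.randn(B, L, H, D)
    out = flash_attn_func(q, k, v, causal=True)
    ref = F.scaled_dot_product_attention(
        q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2), is_causal=True
    ).transpose(1, 2)
    assert torch.allclose(out, ref, atol=1e-6)
    print("flash_attn stub OK:", tuple(out.shape))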