Spaces:
Running on Zero
Running on Zero
File size: 5,427 Bytes
8f1bcd9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 | """
generate.py
───────────────────────────────────────────────────────────────────────────────
Text-to-motion generation.

Primary backend: MoMask inference server running on the Vast.ai instance.
    Returns [T, 263] HumanML3D features directly — no SMPL
    body mesh required.
Fallback backend: HumanML3D dataset keyword search (no GPU or inference
    server needed; streams the dataset from HuggingFace).

Usage
─────
    from Retarget.generate import generate_motion

    # Use MoMask on instance
    motion = generate_motion("a person walks forward",
                             backend_url="http://ssh4.vast.ai:8765")

    # Local fallback (streams HuggingFace dataset)
    motion = generate_motion("a person walks forward")

    # Returned motion: np.ndarray [T, 263]
    # Feed directly to animate_glb()
"""
from __future__ import annotations
import json
import numpy as np
# ──────────────────────────────────────────────────────────────────────────────
# Public API
# ──────────────────────────────────────────────────────────────────────────────
def generate_motion(
    prompt: str,
    backend_url: str | None = None,
    num_frames: int = 196,
    fps: float = 20.0,
    seed: int = -1,
) -> np.ndarray:
    """
    Generate a HumanML3D [T, 263] motion array from a text prompt.

    Parameters
    ----------
    prompt
        Natural language description of the desired motion.
        Examples: "a person walks forward", "someone does a jumping jack",
        "a man waves hello with his right hand"
    backend_url
        URL of the MoMask inference server. E.g. "http://ssh4.vast.ai:8765".
        If None/empty, or if the MoMask call fails for any reason, falls
        back to dataset search.
    num_frames
        Desired clip length in frames (at 20 fps; max ~196 ≈ 9.8 s).
        Only forwarded to the MoMask backend; the dataset fallback does not
        receive it.
    fps
        Target fps (MoMask natively produces 20 fps).
        NOTE: currently accepted for API compatibility only; this function
        does not pass it to either backend.
    seed
        Random seed for reproducibility (-1 = random). Only forwarded to the
        MoMask backend.

    Returns
    -------
    np.ndarray
        HumanML3D feature array, nominally shape [T, 263].

    Raises
    ------
    Exception
        Anything raised by the dataset-search fallback propagates to the
        caller; MoMask failures are reported on stdout and swallowed.
    """
    if backend_url:
        try:
            return _call_momask(prompt, backend_url, num_frames, seed)
        except Exception as exc:
            # Deliberate best-effort: any MoMask failure (network error, bad
            # payload, unexpected shape) degrades to the dataset search.
            print(f"[generate] MoMask unreachable ({exc}) — falling back to dataset search")
    return _dataset_search_fallback(prompt)
# ──────────────────────────────────────────────────────────────────────────────
# MoMask backend
# ──────────────────────────────────────────────────────────────────────────────
def _call_momask(
    prompt: str,
    url: str,
    num_frames: int,
    seed: int,
) -> np.ndarray:
    """
    Request a motion clip from the MoMask inference server.

    Sends a JSON POST to ``<url>/generate`` (trailing slashes on ``url`` are
    stripped first) carrying ``prompt``, ``num_frames`` and ``seed``, then
    converts the ``"motion"`` field of the JSON reply into a float32 array.

    Raises
    ------
    ValueError
        If the returned array is not 2-D or has fewer than 193 columns.
    """
    from urllib.request import Request, urlopen

    endpoint = f"{url.rstrip('/')}/generate"
    body = json.dumps(
        {"prompt": prompt, "num_frames": num_frames, "seed": seed}
    ).encode("utf-8")
    request = Request(
        endpoint,
        data=body,
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    # Generous timeout: the request blocks until the server has replied.
    with urlopen(request, timeout=180) as response:
        raw_reply = response.read()
    reply = json.loads(raw_reply)

    motion = np.array(reply["motion"], dtype=np.float32)

    # Accept anything with at least 193 feature columns
    # (presumably the subset downstream code needs — TODO confirm).
    well_formed = motion.ndim == 2 and motion.shape[1] >= 193
    if not well_formed:
        raise ValueError(f"Server returned unexpected shape {motion.shape}")

    frame_count = motion.shape[0]
    print(f"[generate] MoMask: {frame_count} frames for '{prompt}'")
    return motion
# ──────────────────────────────────────────────────────────────────────────────
# Dataset search fallback
# ──────────────────────────────────────────────────────────────────────────────
def _dataset_search_fallback(prompt: str) -> np.ndarray:
    """
    Return the best dataset match for ``prompt`` as a float32 array.

    Used when no MoMask server is available. Delegates to ``search_motions``
    from the sibling ``search`` module (described by the original author as
    a streaming keyword search over the TeoGchx/HumanML3D dataset on
    HuggingFace) and returns the ``"motion"`` entry of the first result.

    Raises
    ------
    RuntimeError
        If the search returns no results.
    """
    from .search import search_motions, format_choice_label

    print(f"[generate] Searching HumanML3D dataset for: '{prompt}'")
    matches = search_motions(prompt, top_k=5, split="test", max_scan=500)

    if matches:
        # Results are taken in the order returned; the first one is used.
        top_hit = matches[0]
        print(f"[generate] Best match: {format_choice_label(top_hit)}")
        return np.array(top_hit["motion"], dtype=np.float32)

    message = (
        f"No motion found in dataset for prompt: {prompt!r}\n"
        "Check your internet connection or deploy MoMask on the instance."
    )
    raise RuntimeError(message)
|