File size: 5,427 Bytes
8f1bcd9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
generate.py
───────────────────────────────────────────────────────────────────────────────
Text-to-motion generation.

Primary backend:  MoMask inference server running on the Vast.ai instance.
                  Returns [T, 263] HumanML3D features directly — no SMPL
                  body mesh required.

Fallback backend: HumanML3D dataset keyword search (offline / no GPU needed).

Usage
─────
    from Retarget.generate import generate_motion

    # Use MoMask on instance
    motion = generate_motion("a person walks forward",
                             backend_url="http://ssh4.vast.ai:8765")

    # Local fallback (streams HuggingFace dataset)
    motion = generate_motion("a person walks forward")

    # Returned motion: np.ndarray [T, 263]
    # Feed directly to animate_glb()
"""
from __future__ import annotations
import json
import numpy as np


# ──────────────────────────────────────────────────────────────────────────────
# Public API
# ──────────────────────────────────────────────────────────────────────────────

def generate_motion(
    prompt:      str,
    backend_url: str | None = None,
    num_frames:  int   = 196,
    fps:         float = 20.0,
    seed:        int   = -1,
) -> np.ndarray:
    """
    Generate a HumanML3D [T, 263] motion array from a text prompt.

    When ``backend_url`` is given, the MoMask server is tried first; if that
    call raises for any reason, the error is logged and the HumanML3D dataset
    search is used instead.  Without ``backend_url`` the dataset search is
    used directly.

    Parameters
    ----------
    prompt
        Natural language description of the desired motion.
        Examples: "a person walks forward", "someone does a jumping jack",
                  "a man waves hello with his right hand"
    backend_url
        URL of the MoMask inference server.  E.g. "http://ssh4.vast.ai:8765".
        If None/empty or if the server call fails, falls back to dataset
        search.
    num_frames
        Desired clip length in frames (at 20 fps; max ~196 ≈ 9.8 s).
        Only forwarded to the MoMask backend; the dataset fallback does not
        receive it.
    fps
        Target fps (MoMask natively produces 20 fps).
        NOTE: accepted for API compatibility but not used by this function;
        no resampling is performed here.
    seed
        Random seed for reproducibility (-1 = random).
        Only forwarded to the MoMask backend.

    Returns
    -------
    np.ndarray  shape [T, 263]  HumanML3D feature vector.

    Raises
    ------
    RuntimeError
        Propagated from the dataset fallback when it finds no matching motion.
    """
    if backend_url:
        try:
            return _call_momask(prompt, backend_url, num_frames, seed)
        except Exception as exc:
            # Deliberate best-effort: any backend failure (network error,
            # malformed response, ...) degrades to the offline dataset search
            # instead of aborting the caller.
            print(f"[generate] MoMask unreachable ({exc}) — falling back to dataset search")

    return _dataset_search_fallback(prompt)


# ──────────────────────────────────────────────────────────────────────────────
# MoMask backend
# ──────────────────────────────────────────────────────────────────────────────

def _call_momask(
    prompt:     str,
    url:        str,
    num_frames: int,
    seed:       int,
) -> np.ndarray:
    """
    POST a generation request to the MoMask inference server.

    Sends ``{"prompt", "num_frames", "seed"}`` as a JSON body to
    ``<url>/generate`` (a trailing slash on ``url`` is tolerated) and expects
    a JSON object whose ``"motion"`` entry is a 2-D numeric array.

    Parameters
    ----------
    prompt
        Text description forwarded to the server.
    url
        Base URL of the inference server.
    num_frames
        Requested clip length, forwarded to the server unchanged.
    seed
        Random seed, forwarded to the server unchanged.

    Returns
    -------
    np.ndarray  float32, 2-D, with at least 193 columns.

    Raises
    ------
    ValueError
        If the response has no ``"motion"`` entry, or the decoded array is
        not 2-D with at least 193 columns.
    Exception
        Errors raised by ``urllib.request.urlopen`` (connection failures,
        HTTP errors, the 180 s timeout) and by ``json.loads`` propagate
        unchanged.
    """
    import urllib.request

    payload = json.dumps({
        "prompt":     prompt,
        "num_frames": num_frames,
        "seed":       seed,
    }).encode("utf-8")

    req = urllib.request.Request(
        f"{url.rstrip('/')}/generate",
        data=payload,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    # Generous timeout: the request blocks until the server has produced the
    # whole clip.
    with urllib.request.urlopen(req, timeout=180) as resp:
        result = json.loads(resp.read())

    # A payload without "motion" (e.g. an error object) would otherwise show
    # up as a bare KeyError('motion'); report what the server actually sent.
    if not isinstance(result, dict) or "motion" not in result:
        detail = (
            f"keys {sorted(result)}" if isinstance(result, dict)
            else type(result).__name__
        )
        raise ValueError(f"Server response has no 'motion' entry (got {detail})")

    motion = np.array(result["motion"], dtype=np.float32)
    # NOTE(review): 193 is below the documented 263-feature width; presumably
    # the minimum number of leading features downstream code needs. Confirm.
    if motion.ndim != 2 or motion.shape[1] < 193:
        raise ValueError(f"Server returned unexpected shape {motion.shape}")

    print(f"[generate] MoMask: {motion.shape[0]} frames for '{prompt}'")
    return motion


# ──────────────────────────────────────────────────────────────────────────────
# Dataset search fallback
# ──────────────────────────────────────────────────────────────────────────────

def _dataset_search_fallback(prompt: str) -> np.ndarray:
    """
    Offline fallback: keyword search in the TeoGchx/HumanML3D dataset
    (streaming, HuggingFace), used when no MoMask server is available.

    Asks ``search_motions`` for the top 5 matches (``split="test"``,
    ``max_scan=500``) and returns the first result's ``"motion"`` entry as a
    float32 array.

    Raises
    ------
    RuntimeError
        If the search returns no results.
    """
    from .search import format_choice_label, search_motions

    print(f"[generate] Searching HumanML3D dataset for: '{prompt}'")
    candidates = search_motions(prompt, top_k=5, split="test", max_scan=500)

    if candidates:
        # Take the first entry of the returned list as the best match.
        top_hit = candidates[0]
        print(f"[generate] Best match: {format_choice_label(top_hit)}")
        return np.array(top_hit["motion"], dtype=np.float32)

    raise RuntimeError(
        f"No motion found in dataset for prompt: {prompt!r}\n"
        "Check your internet connection or deploy MoMask on the instance."
    )