File size: 11,467 Bytes
"""Minimal image-to-video prompt upsampler.

Run from the Cosmos3-Super-Image2Video repo root:

    export PROMPT_UPSAMPLER_API_KEY="..."
    python scripts/upsample_prompt.py \
        --model-name <model> \
        --base-url <VLM-endpoint-url> \
        --image-path assets/example_first_frame.png \
        --user-prompt "The dog flies into the outer space" \
        --output-path scripts/upsampled.json
"""

import argparse
import base64
import json
import mimetypes
import os
import re
from pathlib import Path

import requests

# Fixed generation settings: 16:9 480p, 189 frames @ 24 fps.
# main() writes these values over the placeholder fields of the JSON object
# returned by the VLM, so the model's own guesses for them are never used.
ASPECT_RATIO = "16,9"  # stored verbatim in the output "aspect_ratio" field
WIDTH = 832  # output "resolution" -> "W"
HEIGHT = 480  # output "resolution" -> "H"
NUM_FRAMES = 189  # combined with FPS to derive the output "duration"
FPS = 24  # output "fps" (written as a float)

# Sent as "max_tokens" in the chat completions request payload.
MAX_TOKENS = 8192

PROMPT_TEMPLATE = """You are an expert prompt engineer for an image-to-video generative model. You are given a STARTING FRAME image (the first frame of the video) and a USER INSTRUCTION describing the desired motion or changes to animate. Your task is to produce a dense, cinematic video description that the model will use to generate the full video, together with a customized negative prompt.

Complete this task in two phases.

---
### PHASE 1: VIDEO DESCRIPTION
Write a dense, narrative temporal_caption inside `<final_prompt>` XML tags, formatted as a JSON object using this exact template:

{
  "temporal_caption": "...",
  "duration": "placeholder",
  "fps": "placeholder",
  "resolution": {
    "H": "placeholder",
    "W": "placeholder"
  },
  "aspect_ratio": "placeholder"
}

Rules for the temporal_caption:
- The provided image is the exact starting frame - all described motion must be consistent with the starting frame.
- Opening: Establish the scene — subjects, environment, lighting — describing what is directly visible in the starting frame accurately and faithfully, noting essential elements that the motion will directly involve, the subject's orientation (e.g., "facing away", "in three-quarter profile"), and any implied ongoing motion (e.g., a cyclist leaning into a curve, water already splashing) so the video continues smoothly. Phrase it naturally as a scene description (do not say "in the starting frame", "initially shown", or similar meta-references).
- Motion: Describe the changes and actions in chronological order. Flow naturally from one action to the next. Advance time using natural conjunctions (e.g., "while," "as," "and").
- Physical Accuracy: All motion must obey gravity and reflect realistic material behavior (e.g., cloth ripples, water splashes, rigid objects resist deformation).
- Cause-and-effect: Always describe causes before their effects. Reflections, shadows, and secondary effects cannot appear on their own — the source object must first enter the frame or move into the relevant position before any reflection or shadow is described. E.g., a person must walk to the water's edge before their reflection appears on the surface; an object must strike the water before a splash erupts.
- Object Permanence: Every subject must persist throughout or have a clear reason for entering or exiting. When a new subject not present in the starting frame is introduced (e.g., an opposing team, an arriving vehicle), briefly describe their appearance (e.g., uniform color, vehicle type and color) so the generator can render them consistently, and describe a logical way for them to come into the frame (e.g., entering from a specific side of the frame, walking in through a door, or emerging from behind an existing object) rather than having them appear out of nowhere.
- Taboo Phrases: NEVER refer to the video medium itself. Avoid "the video shows...", "the scene...", "the clip...", "the frame...", "the camera shows...", "we see...".
- Perspective: Describe human body sides from the subject's own perspective (e.g., "her right hand" = the subject's right hand) to avoid ambiguity. This applies whenever a body part enters or moves in the frame: always specify whether it is the left or right (e.g., "his right hand reaches in from the lower edge"), never a bare "a hand enters the frame".
- Pronouns: Use singular pronouns ("he", "she", "him", "her", "it") or a singular noun phrase ("the person", "the rider", "the child") for single subjects. Never use "they"/"them"/"their" to refer to one person, as this can cause the model to render multiple subjects.
- Spatial Phrasing: Use spatial relationships for motion (e.g., "enters from the left", "rises above the horizon") rather than camera-centric descriptions.
- Camera: Include camera motion only if specified in the instruction; otherwise describe from a static viewpoint. Keep any described camera movement subtle and gradual — do not exaggerate altitude loss, tilt angle, or speed beyond what is minimally implied by the instruction. Do not use the word "transition" when describing camera motion.
- Cinematography Terms: When the instruction references a lens, camera, or filming technique (e.g., "probe lens", "macro lens", "fisheye", "drone shot", "GoPro"), treat it as a cinematographic style describing how the footage is captured — never as a physical object visible in the scene. Mention the style (e.g., for a probe lens: extreme close shot; for a fisheye lens: extreme wide angle fisheye view) rather than mentioning the lens or camera apparatus itself.
- Timelapse: If the instruction implies timelapse, explicitly use the word "timelapse" in the caption and avoid exaggerating its effects.
- Cuts & Montages: Always describe a single continuous shot with no hard cuts unless the user instruction explicitly used words like "cut", "hard cut", "jump cut", "shot change", or "montage". When multiple shots are requested without specifying an exact number, describe at most 3 shots, and dedicate the majority of the description to the opening action before any cut. Never use phrases like "the first shot", "the opening shot", or number shots as "first", "second", etc. — simply describe the action directly.
- Tone: Neutral, objective, descriptive. No opinions, value judgments, or inferred emotions unless physically observable.
- Length & Format: Write exactly ONE coherent paragraph of 5-8 sentences. No bullet points or lists.

USER INSTRUCTION:
"{nl_description}"

---
### PHASE 2: NEGATIVE PROMPT
Using your final video description from Phase 1, create a customized negative prompt.

HOW IT WORKS:
A negative prompt describes exactly what a bad video looks like. Use declarative statements (e.g., "blurry faces"). Never use negative instructions like "avoid" or "do not".

---
DEFAULT NEGATIVE PROMPT:
The video captures a series of frames showing macroblocking artifacts, chromatic aberration, high-frequency noise, and rolling shutter distortion. It includes static with no motion, motion blur, over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky movements, low frame rate, bit-depth compression artifacts, color banding, unnatural transitions, outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, hard cut, visual noise, and flickering. It features moiré patterns, edge halos, and temporal aliasing. Furthermore, the content defies common sense, generating illogical scenarios, nonsensical entities, absurd character behaviors, and conceptual paradoxes that violate basic human reasoning and everyday reality. The video looks like a surreal or glitchy hallucination. Overall, the video is of poor quality.
---

INSTRUCTIONS:
Delete any words from the default negative prompt that contradict your intended video. Keep most of the original wording and structure intact, and do not add new items. Examples:
* If you want scene cuts/montages -> REMOVE "jump cuts" and "hard cut".
* If you want a motionless/static scene -> REMOVE "static with no motion".
* If you want fantasy, sci-fi, or surrealism -> REMOVE "defies common sense", "illogical scenarios", "nonsensical entities", "surreal", and related logic-violation terms.
* If the scene has flickering light -> REMOVE "flickering".
* If it is a night-time timelapse -> REMOVE "motion blur".

Output only the final negative prompt as a single paragraph, wrapped in <negative_prompt> tags. Do not output any explanation or preamble."""


def image_to_data_url(path: Path) -> str:
    """Build a ``data:`` URL embedding the image file at *path* as base64.

    The MIME type is guessed from the file name; when no guess is available
    the URL falls back to ``image/png``.
    """
    guessed_type, _encoding = mimetypes.guess_type(path.name)
    if not guessed_type:
        guessed_type = "image/png"
    payload = base64.b64encode(path.read_bytes()).decode("ascii")
    return "data:" + guessed_type + ";base64," + payload


def extract_tag(text: str, tag: str) -> str | None:
    """Return the stripped inner text of the first <tag>...</tag> block, if present."""
    match = re.search(rf"<{tag}>(.*?)</{tag}>", text, flags=re.DOTALL)
    return match.group(1).strip() if match else None


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the upsampler script.

    ``--model-name`` and ``--base-url`` are mandatory. The image path, user
    prompt and output path default to the example values shown in the module
    usage notes.
    """
    cli = argparse.ArgumentParser(description="Upsample an image-to-video prompt with a VLM.")
    # Registration order is kept so the generated --help text is unchanged.
    option_specs = (
        ("--image-path", {"type": Path, "default": Path("assets/example_first_frame.png")}),
        ("--user-prompt", {"default": "The dog flies into the outer space"}),
        ("--output-path", {"type": Path, "default": Path("scripts/upsampled.json")}),
        ("--model-name", {"required": True}),
        ("--base-url", {"required": True, "metavar": "<VLM-endpoint-url>"}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()


def invoke_vlm(
    image_path: Path,
    user_prompt: str,
    model_name: str,
    base_url: str,
    timeout: float = 600.0,
) -> str:
    """Call an OpenAI-compatible chat completions endpoint and return the assistant text.

    A single user message is sent containing the image (as a base64 data URL)
    followed by PROMPT_TEMPLATE with the user instruction filled in.

    Args:
        image_path: Local image file used as the starting frame.
        user_prompt: Instruction describing the desired motion; surrounding
            whitespace is stripped before it is inserted into the template.
        model_name: Value of the ``model`` field in the request payload.
        base_url: Endpoint base URL; ``/chat/completions`` is appended after
            removing any trailing slash.
        timeout: Seconds passed to ``requests.post`` as its timeout. Without
            one, a stalled endpoint would block the script indefinitely.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        KeyError: If the ``PROMPT_UPSAMPLER_API_KEY`` environment variable is
            not set.
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not respond within *timeout*.
    """
    # str.replace is used instead of str.format because the template contains
    # literal braces (the JSON skeleton shown to the model).
    instruction = PROMPT_TEMPLATE.replace("{nl_description}", user_prompt.strip())
    payload = {
        "model": model_name,
        "max_tokens": MAX_TOKENS,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_to_data_url(image_path)}},
                    {"type": "text", "text": instruction},
                ],
            }
        ],
    }
    headers = {"Authorization": f"Bearer {os.environ['PROMPT_UPSAMPLER_API_KEY']}"}
    response = requests.post(
        f"{base_url.rstrip('/')}/chat/completions",
        json=payload,
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["choices"][0]["message"]["content"]


def _parse_prompt_json(raw: str) -> dict:
    """Decode the JSON object carried by the <final_prompt> block.

    Accepts the payload either bare or wrapped in a Markdown code fence, and
    raises RuntimeError (including the offending text) when it is not a valid
    JSON object.
    """
    fenced = re.fullmatch(
        r"```(?:json)?\s*(.*?)\s*```", raw.strip(), flags=re.DOTALL | re.IGNORECASE
    )
    if fenced:
        raw = fenced.group(1)
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as err:
        raise RuntimeError(f"<final_prompt> block is not valid JSON ({err}):\n{raw}") from err
    if not isinstance(data, dict):
        raise RuntimeError(
            f"<final_prompt> block must be a JSON object, got {type(data).__name__}:\n{raw}"
        )
    return data


def main() -> None:
    """Upsample the user prompt with the VLM and write the result as JSON.

    The written record always has a ``prompt`` key (the final prompt object
    serialized as a JSON string) and, when the response contains a non-empty
    <negative_prompt> block, a ``negative_prompt`` key. The same values are
    echoed to stdout.

    Raises:
        RuntimeError: If the response has no <final_prompt> block, or the
            block does not contain a JSON object.
    """
    args = parse_args()
    content = invoke_vlm(args.image_path, args.user_prompt, args.model_name, args.base_url)

    final_prompt = extract_tag(content, "final_prompt")
    if final_prompt is None:
        raise RuntimeError(f"Response missing <final_prompt> block:\n{content}")

    # Pin the output parameters post-hoc (the template leaves them as placeholders).
    data = _parse_prompt_json(final_prompt)
    data["duration"] = f"{int(NUM_FRAMES / FPS)}s"  # whole seconds, truncated
    data["fps"] = float(round(FPS))
    data["resolution"] = {"H": HEIGHT, "W": WIDTH}
    data["aspect_ratio"] = ASPECT_RATIO

    record: dict = {"prompt": json.dumps(data, ensure_ascii=False)}
    negative = extract_tag(content, "negative_prompt")
    if negative:
        record["negative_prompt"] = negative

    args.output_path.parent.mkdir(parents=True, exist_ok=True)
    args.output_path.write_text(json.dumps(record, ensure_ascii=False), encoding="utf-8")
    print(f"PROMPT:\n{record['prompt']}")
    print(f"\nNEGATIVE PROMPT:\n{record.get('negative_prompt', '')}")
    print(f"\nWrote {args.output_path}")


# Run the upsampler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()