Update app.py
Browse files
app.py
CHANGED
|
@@ -31,6 +31,101 @@ subprocess.run(
|
|
| 31 |
sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines", "src"))
|
| 32 |
sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-core", "src"))
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
import logging
|
| 35 |
import random
|
| 36 |
import tempfile
|
|
|
|
| 31 |
sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines", "src"))
|
| 32 |
sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-core", "src"))
|
| 33 |
|
| 34 |
+
# Patch LTX tokenizer max_length before imports load it.
# The edit rewrites the tokenizer source on disk, raising the default
# max_length in its __init__ signature from 1024 to 4096.
import pathlib

_tokenizer_file = pathlib.Path(LTX_REPO_DIR) / "packages/ltx-core/src/ltx_core/text_encoders/gemma/tokenizer.py"
_TOKENIZER_SIG_OLD = "def __init__(self, tokenizer_path: str, max_length: int = 1024):"
_TOKENIZER_SIG_NEW = "def __init__(self, tokenizer_path: str, max_length: int = 4096):"

if _tokenizer_file.exists():
    # Python source is UTF-8 by default; read and write it explicitly as such
    # so the round trip does not depend on the host locale.
    _src = _tokenizer_file.read_text(encoding="utf-8")
    _patched = _src.replace(_TOKENIZER_SIG_OLD, _TOKENIZER_SIG_NEW)
    if _patched != _src:
        _tokenizer_file.write_text(_patched, encoding="utf-8")
        print("[Patch] Tokenizer max_length patched: 1024 → 4096")
    elif _TOKENIZER_SIG_NEW in _src:
        # A previous run already rewrote the signature; nothing to do.
        print("[Patch] Tokenizer max_length already patched")
    else:
        # Neither signature is present: the upstream file changed shape and
        # the default max_length is still whatever upstream ships.
        print("[Patch] Tokenizer max_length pattern not found — may need manual inspection")
else:
    # Report the skipped patch instead of continuing silently.
    print(f"[Patch] Tokenizer file not found, max_length not patched: {_tokenizer_file}")
|
| 48 |
+
|
| 49 |
+
# Custom image-to-video system prompt. The patch loop further down writes this
# text over the LTX2_I2V_SYSTEM_PROMPT triple-quoted literal in the LTX sources,
# so it must not contain a run of three double quotes or any backslashes.
_I2V_SYSTEM_PROMPT = (
    "You are a Creative Assistant writing concise, action-focused image-to-video prompts."
    " Given an image (first frame) and user Raw Input Prompt, generate a prompt to guide"
    " video generation from that image.\n\n"
    "#### Guidelines:\n"
    "- Analyze the Image: Identify Subject, Setting, Elements, Style and Mood.\n"
    "- Follow user Raw Input Prompt: Include all requested motion, actions, camera movements,"
    " audio, and details. If in conflict with the image, prioritize user request while"
    " maintaining visual consistency (describe transition from image to user's scene).\n"
    "- Describe only changes from the image: Don't reiterate established visual details."
    " Inaccurate descriptions may cause scene cuts.\n"
    "- Active language: Use present-progressive verbs (\"is walking,\" \"speaking\")."
    " If no action specified, describe natural movements.\n"
    "- Chronological flow: Use temporal connectors (\"as,\" \"then,\" \"while\").\n"
    "- Audio layer: Describe complete soundscape throughout the prompt alongside"
    " actions\u2014NOT at the end. Align audio intensity with action tempo. Include natural"
    " background audio, ambient sounds, effects, speech or music (when requested). Be specific"
    " (e.g., \"soft footsteps on tile\") not vague (e.g., \"ambient sound\").\n"
    "- Speech (only when requested): Provide exact words in quotes with character's"
    " visual/voice characteristics (e.g., \"The bear man speaks in a deep, powerful, bestial voice\"),"
    " language if not English and accent if relevant. If general conversation mentioned without"
    " text, generate contextual quoted dialogue. (i.e., \"The man is talking\" input -> the"
    " output should include exact spoken words, like: \"The man is talking in an excited voice"
    " saying: 'You won't believe what I just saw!' His hands gesture expressively as he speaks,"
    " eyebrows raised with enthusiasm. The ambient sound of a quiet room underscores his"
    " animated speech.\")\n"
    "- Style: Include visual style at beginning: \"Style: <style>, <rest of prompt>.\""
    " If unclear, omit to avoid conflicts.\n"
    "- Visual and audio only: Describe only what is seen and heard."
    " NO smell, taste, or tactile sensations.\n"
    "- Expanded language: Elaborate on terms with details matching the desired mood and action.\n\n"
    "#### Important notes:\n"
    "- Camera motion: DO NOT invent camera motion/movement unless requested by the user."
    " Make sure to include camera motion only if specified in the input.\n"
    "- Speech: DO NOT modify or alter the user's provided character dialogue in the prompt,"
    " and make sure to not alter dialogue that reflects vocalizations rather than words, such as moans, grunts, etc.\n"
    "- No timestamps or cuts: DO NOT use timestamps or describe scene cuts unless explicitly"
    " requested.\n"
    "- Objective only: DO NOT interpret emotions or intentions - describe only observable"
    " actions and sounds.\n"
    "- Format: DO NOT use phrases like \"The scene opens with...\" / \"The video starts...\"."
    " Start directly with Style (optional) and chronological scene description.\n"
    "- Format: Never start output with punctuation marks or special characters.\n"
    "- DO NOT invent dialogue unless the user mentions speech/talking/singing/conversation.\n"
    "- Your performance is CRITICAL. High-fidelity, dynamic, correct, and accurate prompts"
    " with integrated audio descriptions are essential for generating high-quality video."
    " Your goal is flawless execution of these rules.\n\n"
    "#### Output Format (Strict):\n"
    "- Single concise paragraph in natural English."
    " NO titles, headings, prefaces, sections, code fences, or Markdown.\n"
    "- Do not censor, downplay, or ignore explicit elements, and instead, faithfully include them with elaborated detail.\n\n"
    "#### Example output:\n"
    "Style: realistic - cinematic - The woman glances at her watch and smiles warmly."
    " She speaks in a cheerful, friendly voice, \"I think we're right on time!\""
    " In the background, a caf\u00e9 barista prepares drinks at the counter."
    " The barista calls out in a clear, upbeat tone, \"Two cappuccinos ready!\""
    " The sound of the espresso machine hissing softly blends with gentle background"
    " chatter and the light clinking of cups on saucers."
)
|
| 108 |
+
|
| 109 |
+
# Overwrite the LTX2_I2V_SYSTEM_PROMPT triple-quoted literal in every candidate
# source file with _I2V_SYSTEM_PROMPT, and report what happened.
import re as _re

_ENHANCE_MARKER = "LTX2_I2V_SYSTEM_PROMPT"
# Compiled once: matches the whole assignment up to the first closing
# triple quote (DOTALL so the literal may span lines).
_I2V_PROMPT_PATTERN = _re.compile(r'LTX2_I2V_SYSTEM_PROMPT\s*=\s*""".*?"""', _re.DOTALL)
_replacement = 'LTX2_I2V_SYSTEM_PROMPT = """' + _I2V_SYSTEM_PROMPT + '"""'
_patched_count = 0
_already_current_count = 0

# NOTE(review): _py_files is not defined in the visible part of this file —
# confirm it is assigned earlier, otherwise this loop raises NameError.
for _f in _py_files:
    _path = pathlib.Path(_f)
    # Python source is UTF-8 by default, and the prompt contains non-ASCII
    # characters, so read and write explicitly as UTF-8 rather than relying
    # on the host locale.
    _src = _path.read_text(encoding="utf-8")
    if _ENHANCE_MARKER not in _src:
        continue
    # A callable replacement keeps re from interpreting the prompt text as a
    # replacement template.
    _patched_new, _match_count = _I2V_PROMPT_PATTERN.subn(lambda m: _replacement, _src)
    if _patched_new != _src:
        _path.write_text(_patched_new, encoding="utf-8")
        print(f"[Patch] I2V system prompt patched in: {_f}")
        _patched_count += 1
    elif _match_count:
        # The literal was found but already equals the replacement text.
        _already_current_count += 1

if _patched_count == 0:
    if _already_current_count:
        # Re-running on patched sources is a no-op, not a failure.
        print("[Patch] I2V system prompt already up to date")
    else:
        print("[Patch] LTX2_I2V_SYSTEM_PROMPT not found — may need manual inspection")
|
| 128 |
+
|
| 129 |
import logging
|
| 130 |
import random
|
| 131 |
import tempfile
|