# ideogram4 / app.py
# Hosting-page header captured with this file: author multimodalart (HF Staff);
# last commit "Fix CSS and remove queue" (b6b3100, verified).
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
# outlines_core ships an @torch.compile bitmask kernel dynamo can't trace (torch.device const) -> noisy
# WON'T CONVERT spam on every local upsample. We never use torch.compile at runtime, so disable dynamo.
os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")
# diffusers (with Ideogram4 support) is pip-installed from the PR — see requirements.txt. No bundled source.
import json
import random
import time
from threading import Thread
import gradio as gr
import requests
import spaces
import torch
from huggingface_hub import hf_hub_download
from diffusers import Ideogram4Pipeline
# Runtime shim (keeps the pip-installed diffusers pristine): cu130-era bitsandbytes returns Params4bit.shape as a
# plain tuple, but diffusers' check_quantized_param_shape calls .numel() on it. math.prod handles both a tuple
# and a torch.Size, so this override stays harmless once diffusers/bnb fix it upstream.
import math # noqa: E402
from diffusers.quantizers.bitsandbytes.bnb_quantizer import BnB4BitDiffusersQuantizer # noqa: E402
def _check_quantized_param_shape(self, param_name, current_param, loaded_param):
    """Validate the flattened shape of a loaded 4-bit parameter.

    The element count of ``current_param`` is taken with ``math.prod`` over its
    ``shape``, so the shape only has to be convertible to a tuple of ints.

    Expected shape of ``loaded_param``:
      * ``(n,)`` when ``"bias"`` occurs in ``param_name``;
      * ``((n + 1) // 2, 1)`` otherwise.

    Returns ``True`` on a match and raises ``ValueError`` on a mismatch.
    ``self`` is unused; it is present because the function is installed as a method.
    """
    numel = math.prod(tuple(current_param.shape))
    if "bias" in param_name:
        inferred_shape = (numel,)
    else:
        inferred_shape = ((numel + 1) // 2, 1)

    actual_shape = tuple(loaded_param.shape)
    if actual_shape == tuple(inferred_shape):
        return True
    raise ValueError(f"Expected flattened shape of {param_name} to be {inferred_shape}, got {actual_shape}.")
# Install the shim: replace the quantizer class's check_quantized_param_shape with the tuple-tolerant version
# defined above. This must run before the pipeline is loaded further down for the override to apply to it.
BnB4BitDiffusersQuantizer.check_quantized_param_shape = _check_quantized_param_shape
# Hub repositories and files used by the app.
MODEL_ID = "ideogram-ai/ideogram-4-nf4"
LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
AOTI_REPO = "multimodalart/i4-block-aoti"
AOTI_BLOCK_FILE = "Ideogram4TransformerBlock/package.pt2"

# Upper bound (inclusive) for randomly drawn seeds.
MAX_SEED = 2**31 - 1

# Prompt upsampling: Ideogram's hosted magic-prompt is the default; the local Qwen graft is the fallback.
IDEOGRAM_MAGIC_PROMPT_URL = "https://api.ideogram.ai/v1/ideogram-v4/magic-prompt"
IDEOGRAM_API_KEY = os.environ.get("IDEOGRAM_API_KEY")
UPSAMPLERS = ["Ideogram (remote)", "Qwen (local)"]

# V4 presets. Each guidance schedule lists one CFG value per step in forward step order:
# the main phase at 7.0 followed by a short polish phase at 3.0.
MODES = {
    "Turbo · 12 steps": {
        "num_inference_steps": 12,
        "guidance_schedule": (7.0,) * 11 + (3.0,) * 1,
        "mu": 0.5,
        "std": 1.75,
    },
    "Default · 20 steps": {
        "num_inference_steps": 20,
        "guidance_schedule": (7.0,) * 18 + (3.0,) * 2,
        "mu": 0.0,
        "std": 1.75,
    },
    "Quality · 48 steps": {
        "num_inference_steps": 48,
        "guidance_schedule": (7.0,) * 45 + (3.0,) * 3,
        "mu": 0.0,
        "std": 1.5,
    },
}
# --- Pipeline: dequantize both transformers nf4 -> bf16 in the parent (CPU) so AOTI can bind its weight-less
# graph to real bf16 weights (this is repo cold start, which is fine; function cold start stays fast). ---
# Everything below runs once at import time, in this order: load the checkpoint, dequantize the two
# transformers, move the pipeline to "cuda", then log the elapsed wall-clock time.
# NOTE(review): the module-level `.to("cuda")` presumably relies on the ZeroGPU `spaces` import to work before
# a GPU is attached — confirm against the `spaces` package documentation.
t = time.perf_counter()
pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
pipe.transformer.dequantize()
pipe.unconditional_transformer.dequantize()
pipe.to("cuda")
print(f"[timing] pipeline load + dequant: {time.perf_counter() - t:.1f}s", flush=True)
# The local prompt-enhancer LM head is grafted lazily by `pipe.upsample_prompt` on first use (onto the worker's
# GPU), so no explicit load is needed here. Local is only the fallback; Ideogram's remote API is the default.
# Pre-fetch the AOTI package AND pre-warm torch-inductor's CPU-ISA probe in the PARENT (repo cold start). The
# probe (valid_vec_isa_list) compiles test programs (~20s) the first time aoti_blocks_load builds a LazyAOTIModel;
# doing it once here means every ZeroGPU fork inherits the functools.cache, so per-worker (function cold start)
# aoti_blocks_load is just the ~instant block patch instead of a ~20s compile.
# Best-effort startup step: any failure here only clears AOTI_OK, in which case `_apply_aoti` does nothing
# and generation runs the eager transformer blocks.
try:
    # AOTI_BLOCK_FILE already contains the subfolder ("Ideogram4TransformerBlock/package.pt2"), so use it
    # rather than re-spelling the same path as filename + subfolder; hf_hub_download joins
    # subfolder/filename itself, so both forms resolve to the same file.
    hf_hub_download(AOTI_REPO, AOTI_BLOCK_FILE)
    from torch._inductor.cpu_vec_isa import valid_vec_isa_list

    t = time.perf_counter()
    valid_vec_isa_list()  # result is discarded; the call is made only to populate its cache
    print(f"[timing] vec-isa prewarm (parent): {time.perf_counter() - t:.1f}s", flush=True)
    AOTI_OK = True
except Exception as e:
    AOTI_OK = False
    print(f"[aoti] prefetch/prewarm failed, running eager: {e!r}", flush=True)
# Per-process flag: set once both transformers have had the compiled block bound.
_AOTI_APPLIED = False


def _apply_aoti():
    """Bind the compiled block onto both transformers, at most once per process.

    Does nothing when the startup prefetch failed (``AOTI_OK`` is false) or when the
    blocks were already applied. Any exception from ``spaces.aoti_blocks_load`` is
    logged and swallowed so that generation can continue on the eager blocks; in that
    case ``_AOTI_APPLIED`` stays false and the next call tries again.

    Per the original author's note, ``aoti_blocks_load`` is lazy (it binds ``forward``
    and defers loading the .so to the first diffusion step) and CPU-only, which is why
    this is run in a background thread while the transformers are idle.
    """
    global _AOTI_APPLIED
    if AOTI_OK and not _AOTI_APPLIED:
        try:
            started = time.perf_counter()
            for attr in ("transformer", "unconditional_transformer"):
                spaces.aoti_blocks_load(getattr(pipe, attr), AOTI_REPO)
            _AOTI_APPLIED = True
            print(f"[timing] aoti_blocks_load (both transformers): {time.perf_counter() - started:.2f}s", flush=True)
        except Exception as exc:  # a failed bind must never block generation
            print(f"[aoti] apply failed, running eager: {exc!r}", flush=True)
def remote_upsample(prompt, width, height):
    """Rewrite the prompt into Ideogram's native JSON caption via the hosted magic-prompt API.

    Args:
        prompt: The user's raw text prompt.
        width: Target image width in pixels (int).
        height: Target image height in pixels (int).

    Returns:
        The returned ``json_prompt`` serialized as a compact JSON string, with the
        top-level ``aspect_ratio`` key and every element's ``bbox`` key removed.

    Raises:
        RuntimeError: If the response has no (or an empty) ``json_prompt``.
        Exception: Anything raised by ``requests`` (including the HTTP error from
            ``raise_for_status``) or by decoding the response body propagates to the caller.
    """
    # Reduce the size to its smallest integer ratio; `or 1` guards the 0x0 case where gcd is 0.
    d = math.gcd(width, height) or 1
    aspect_ratio = f"{width // d}x{height // d}"  # Ideogram's WxH form
    resp = requests.post(
        IDEOGRAM_MAGIC_PROMPT_URL,
        headers={"Api-Key": IDEOGRAM_API_KEY, "Content-Type": "application/json"},
        json={"text_prompt": prompt, "aspect_ratio": aspect_ratio},
        timeout=120,
    )
    resp.raise_for_status()
    jp = resp.json().get("json_prompt")
    if not jp:
        raise RuntimeError("Ideogram API returned no json_prompt")
    jp.pop("aspect_ratio", None)

    # `.get(key, {})` only covers a *missing* key. A JSON `null` for either field would raise and throw
    # away an otherwise usable caption, so check the actual types before descending.
    deconstruction = jp.get("compositional_deconstruction")
    elements = deconstruction.get("elements") if isinstance(deconstruction, dict) else None
    if isinstance(elements, list):
        for el in elements:
            if isinstance(el, dict):
                el.pop("bbox", None)
    return json.dumps(jp, ensure_ascii=False, separators=(",", ":"))
# --- Dynamic GPU duration ---------------------------------------------------------------------------------
# Seconds per diffusion step are modelled as a straight line in the number of image tokens, drawn through the
# two measured anchors (1024px @ 1.10 it/s and 2048px @ 6 s/it). Per the original note, that chord
# overestimates between the anchors, so it is a safe budget; small images are clamped to a floor.
# Remote upsampling is a network call made OFF the GPU (in `generate`), so it is not budgeted here.
_TOK_1024 = (1024 // 16) ** 2  # 4096 image tokens
_TOK_2048 = (2048 // 16) ** 2  # 16384 image tokens
_PS_1024 = 1.0 / 1.10  # measured seconds/iteration at 1024px
_PS_2048 = 6.0  # measured seconds/iteration at 2048px
# Line through the anchors: seconds_per_step = _PS_A + _PS_B * tokens.
_PS_B = (_PS_2048 - _PS_1024) / (_TOK_2048 - _TOK_1024)
_PS_A = _PS_1024 - _PS_B * _TOK_1024
LOCAL_UPSAMPLE_S = 15  # local Qwen graft+generate (~12s) with headroom
DIFFUSION_OVERHEAD_S = 8  # .so dlopen + block patch + cudnn setup on a cold worker's first forward
DURATION_MARGIN = 1.3


def _per_step(width, height):
    """Return the budgeted seconds per diffusion step for a width x height image.

    Both dimensions are converted with ``int`` and floor-divided by 16 to get the
    token grid; the linear estimate is never allowed below 0.2 seconds.
    """
    tokens = (int(width) // 16) * (int(height) // 16)
    estimate = _PS_A + _PS_B * tokens
    return estimate if estimate > 0.2 else 0.2
def _gpu_duration(final_prompt, mode, width, height, seed, do_local, progress=None):
    """Compute the GPU time budget, in whole seconds, for one `_gpu_generate` call.

    The signature mirrors `_gpu_generate` because this function is passed as that
    function's ``duration``; only ``mode``, ``width``, ``height`` and ``do_local``
    are read. An unknown ``mode`` falls back to the "Default · 20 steps" preset.

    Budget = steps * per-step estimate + fixed diffusion overhead, plus the local
    upsample allowance when ``do_local`` is truthy, scaled by ``DURATION_MARGIN``,
    rounded up, and never below 60.
    """
    preset = MODES.get(mode, MODES["Default · 20 steps"])
    seconds = preset["num_inference_steps"] * _per_step(width, height) + DIFFUSION_OVERHEAD_S
    seconds += LOCAL_UPSAMPLE_S if do_local else 0
    return max(60, int(math.ceil(seconds * DURATION_MARGIN)))
@spaces.GPU(duration=_gpu_duration, size="xlarge")
def _gpu_generate(final_prompt, mode, width, height, seed, do_local, progress=gr.Progress(track_tqdm=True)):
    """Run the GPU part of a request: optional local prompt upsample, then diffusion.

    Args:
        final_prompt: Prompt text; already upsampled unless ``do_local`` is true.
        mode: Key into ``MODES``; unknown keys fall back to "Default · 20 steps".
        width, height: Image size; converted with ``int`` where used.
        seed: Seed for the CUDA generator; converted with ``int``.
        do_local: When true, upsample ``final_prompt`` with the local upsampler first.
            If that fails, a warning is shown and the raw prompt is used.
        progress: Gradio progress tracker.

    Returns:
        ``(image, int(seed), caption)`` where ``caption`` is ``final_prompt`` parsed as
        JSON, or ``{"prompt": final_prompt}`` when it does not parse.
    """
    # Start the AOTI block patch in the background so it overlaps with the local upsample, if any.
    patcher = Thread(target=_apply_aoti, daemon=True)
    patcher.start()

    if do_local:
        progress(0.0, desc="✍️ Upsampling (local Qwen)…")
        upsample_started = time.perf_counter()
        try:
            upsampled = pipe.upsample_prompt(
                final_prompt, height=int(height), width=int(width), lm_head_repo_id=LM_HEAD_REPO
            )
            final_prompt = upsampled[0]
            print(f"[timing] upsample local: {time.perf_counter() - upsample_started:.2f}s", flush=True)
        except Exception as exc:
            print(f"[upsample] local failed: {exc!r}", flush=True)
            gr.Warning("Local upsampler unavailable — generating from the raw prompt.")

    # The patch must have finished (or given up) before the diffusion loop starts.
    patcher.join()

    progress(0.0, desc="🎨 Generating image…")
    generator = torch.Generator(device="cuda").manual_seed(int(seed))
    preset = MODES.get(mode, MODES["Default · 20 steps"])
    diffusion_started = time.perf_counter()
    result = pipe(prompt=final_prompt, width=int(width), height=int(height), generator=generator, **preset)
    image = result.images[0]
    print(f"[timing] diffusion ({mode}): {time.perf_counter() - diffusion_started:.2f}s", flush=True)

    # Show the caption as structured JSON when possible, otherwise wrap the plain text.
    try:
        caption = json.loads(final_prompt)
    except Exception:
        caption = {"prompt": final_prompt}
    return image, int(seed), caption
def generate(
    prompt,
    mode="Default · 20 steps",
    upsampler=UPSAMPLERS[0],
    width=1024,
    height=1024,
    seed=0,
    randomize_seed=False,
    progress=gr.Progress(track_tqdm=True),
):
    """Entry point for the UI: pick a seed, upsample the prompt remotely if possible, then generate.

    Args:
        prompt: The user's raw text prompt.
        mode: Key into ``MODES`` selecting the step count and guidance schedule.
        upsampler: One of ``UPSAMPLERS``. The remote upsampler is used only when it is
            selected and ``IDEOGRAM_API_KEY`` is set; otherwise the local one is used.
        width, height: Requested image size.
        seed: Seed to use. ``None`` or a negative value is replaced by a random seed.
        randomize_seed: When true, always draw a random seed in ``[0, MAX_SEED]``.
        progress: Gradio progress tracker.

    Returns:
        Whatever ``_gpu_generate`` returns: ``(image, seed, caption)``.
    """
    # A cleared seed box reaches us as None; treat it as "pick a seed for me" rather than letting
    # `None < 0` raise TypeError.
    if randomize_seed or seed is None or seed < 0:
        seed = random.randint(0, MAX_SEED)

    # Remote upsample is a network call -> run it here, OFF the GPU. Fall back to local (on-GPU) on failure.
    final_prompt, do_local = prompt, True
    if upsampler == UPSAMPLERS[0] and IDEOGRAM_API_KEY:
        progress(0.0, desc="✍️ Upsampling (Ideogram)…")
        t = time.perf_counter()
        try:
            final_prompt = remote_upsample(prompt, int(width), int(height))
            do_local = False
            print(f"[timing] upsample remote (off-GPU): {time.perf_counter() - t:.2f}s", flush=True)
        except Exception as e:
            print(f"[upsample] remote failed, falling back to local: {e!r}", flush=True)
            gr.Warning("Ideogram API unavailable — using the local Qwen upsampler.")
    return _gpu_generate(final_prompt, mode, width, height, seed, do_local)
@spaces.GPU(size="xlarge")
def _warmup():
    """Run one throwaway local prompt upsample on the startup worker (no diffusion).

    The result is discarded; the call is made for its side effect of warming the
    local upsampler, and its duration is logged.
    """
    # Returns immediately when AOTI is unavailable or already applied.
    _apply_aoti()
    sample_prompt = "a red apple on a wooden table"
    started = time.perf_counter()
    pipe.upsample_prompt(sample_prompt, height=1024, width=1024, lm_head_repo_id=LM_HEAD_REPO)
    print(f"[timing] warmup upsample: {time.perf_counter() - started:.2f}s", flush=True)


# Warm up once at import time. A failure is only logged so that a flaky ZeroGPU worker
# cannot take down the Space.
try:
    _warmup()
except Exception as exc:
    print(f"[warmup] failed (will warm lazily on first request): {exc!r}", flush=True)
# Extra stylesheet passed to gr.Blocks below: inside a `.dark` ancestor, the Gradio container's text color
# is set to the `--body-text-color` variable.
CSS='''
.dark .gradio-container { color: var(--body-text-color); }
'''
# --- UI: controls in the left column, outputs in the right, examples below, then launch. ---
# NOTE(review): the nesting below was reconstructed from statement order; confirm it matches the intended layout.
with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4", css=CSS) as demo:
    # Page header with links to the model cards and the announcement post.
    gr.Markdown(
        "# Ideogram 4\n"
        "Ideogram's first open-weights model — a 9.3B-parameter text-to-image foundation model at the "
        "forefront of design, with best-in-class text rendering.\n\n"
        "[Model](https://huggingface.co/ideogram-ai/ideogram-4-nf4) · "
        "[Model (fp8)](https://huggingface.co/ideogram-ai/ideogram-4-fp8) · "
        "[Blog](https://ideogram.ai/blog/ideogram-4.0/)"
    )
    with gr.Row():
        with gr.Column():
            # Main inputs: prompt, preset (one choice per MODES key), and the run button.
            prompt = gr.Textbox(label="Prompt", value="a ginger cat wearing a tiny wizard hat reading a spellbook", lines=3)
            mode = gr.Radio(choices=list(MODES.keys()), value="Default · 20 steps", label="Mode (speed ↔ quality)")
            run = gr.Button("Generate", variant="primary")
            with gr.Accordion("Advanced", open=False):
                # Defaults to the first entry of UPSAMPLERS (the remote upsampler).
                upsampler = gr.Radio(
                    choices=UPSAMPLERS,
                    value=UPSAMPLERS[0],
                    label="Prompt upsampler",
                    info="Rewrite into Ideogram's native JSON caption. Remote (Ideogram) preferred; falls back to local.",
                )
                with gr.Row():
                    # Sizes move in steps of 64 between 512 and 2048.
                    width = gr.Slider(512, 2048, value=1024, step=64, label="Width")
                    height = gr.Slider(512, 2048, value=1024, step=64, label="Height")
                with gr.Row():
                    # Randomize is on by default, so the seed field is ignored unless the box is unchecked.
                    seed = gr.Number(label="Seed", value=0, precision=0)
                    randomize = gr.Checkbox(label="Randomize seed", value=True)
        with gr.Column():
            out_image = gr.Image(label="Output", type="pil")
            out_caption = gr.JSON(label="Caption fed to the model (upsampled when enabled)")
    # Examples pass only the prompt, so `generate` runs with its keyword defaults
    # (default mode, 1024x1024, seed=0, randomize_seed=False).
    # NOTE(review): with cache_mode="lazy" each example is presumably generated on first use and then
    # served from cache — confirm against the installed Gradio version.
    gr.Examples(
        examples=[
            ["a ginger cat wearing a tiny wizard hat reading a spellbook"],
            ["an isometric illustration of a tiny city floating in the clouds"],
            ["a golden retriever on a skateboard"],
        ],
        inputs=[prompt],
        outputs=[out_image, seed, out_caption],
        fn=generate,
        cache_examples=True,
        cache_mode="lazy",
    )
    # The seed actually used is written back into the seed field alongside the image and caption.
    run.click(
        generate,
        inputs=[prompt, mode, upsampler, width, height, seed, randomize],
        outputs=[out_image, seed, out_caption],
    )
demo.launch()