import os

# Configure torch through the environment before torch itself is imported further down.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
# outlines_core ships an @torch.compile bitmask kernel that dynamo can't trace (torch.device const), which
# produces noisy "WON'T CONVERT" spam on every local upsample. torch.compile is never used at runtime here,
# so dynamo is disabled outright.
os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")

# diffusers (with Ideogram4 support) is pip-installed from the PR — see requirements.txt. No bundled source.
import json
import math
import random
import time
from threading import Thread

import gradio as gr
import requests
import spaces
import torch
from huggingface_hub import hf_hub_download
from diffusers import Ideogram4Pipeline
from diffusers.quantizers.bitsandbytes.bnb_quantizer import BnB4BitDiffusersQuantizer  # noqa: E402


# Runtime shim (leaves the installed diffusers package untouched): cu130-era bitsandbytes returns
# Params4bit.shape as a plain tuple, but diffusers' check_quantized_param_shape calls .numel() on it.
# math.prod handles both, so this becomes a no-op once diffusers/bnb fix it upstream.
def _check_quantized_param_shape(self, param_name, current_param, loaded_param):
    """Validate that a loaded 4-bit parameter has the expected flattened shape.

    Args:
        param_name: Parameter name; names containing ``"bias"`` are expected to be a flat vector.
        current_param: Parameter currently on the model; only its ``.shape`` is read.
        loaded_param: Parameter being loaded; only its ``.shape`` is read.

    Returns:
        ``True`` when the shapes agree.

    Raises:
        ValueError: If the loaded shape differs from the shape inferred from ``current_param``.
    """
    element_count = math.prod(tuple(current_param.shape))
    if "bias" in param_name:
        expected_shape = (element_count,)
    else:
        # Non-bias weights are expected as a column of ceil(n / 2) rows.
        expected_shape = ((element_count + 1) // 2, 1)
    actual_shape = tuple(loaded_param.shape)
    if actual_shape == expected_shape:
        return True
    raise ValueError(f"Expected flattened shape of {param_name} to be {expected_shape}, got {actual_shape}.")


BnB4BitDiffusersQuantizer.check_quantized_param_shape = _check_quantized_param_shape

MODEL_ID = "ideogram-ai/ideogram-4-nf4"
LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
AOTI_REPO = "multimodalart/i4-block-aoti"
AOTI_BLOCK_FILE = "Ideogram4TransformerBlock/package.pt2"
MAX_SEED = 2**31 - 1

# Prompt upsampling: Ideogram's hosted magic-prompt (default) with the local Qwen graft as fallback.
IDEOGRAM_MAGIC_PROMPT_URL = "https://api.ideogram.ai/v1/ideogram-v4/magic-prompt"
IDEOGRAM_API_KEY = os.environ.get("IDEOGRAM_API_KEY")
UPSAMPLERS = ["Ideogram (remote)", "Qwen (local)"]

# V4 presets. Each guidance schedule is listed in forward step order: the main steps run at CFG 7.0 and the
# final "polish" steps at CFG 3.0. Every preset dict is splatted into the pipeline call as keyword arguments.
MODES = {
    "Turbo · 12 steps": {
        "num_inference_steps": 12,
        "guidance_schedule": (7.0,) * 11 + (3.0,) * 1,
        "mu": 0.5,
        "std": 1.75,
    },
    "Default · 20 steps": {
        "num_inference_steps": 20,
        "guidance_schedule": (7.0,) * 18 + (3.0,) * 2,
        "mu": 0.0,
        "std": 1.75,
    },
    "Quality · 48 steps": {
        "num_inference_steps": 48,
        "guidance_schedule": (7.0,) * 45 + (3.0,) * 3,
        "mu": 0.0,
        "std": 1.5,
    },
}

# --- Pipeline: dequantize both transformers nf4 -> bf16 in the parent (CPU) so AOTI can bind its weight-less
# graph to real bf16 weights (this is repo cold start, which is fine; function cold start stays fast). ---
_load_started = time.perf_counter()
pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
for _transformer_attr in ("transformer", "unconditional_transformer"):
    getattr(pipe, _transformer_attr).dequantize()
pipe.to("cuda")
_load_elapsed = time.perf_counter() - _load_started
print(f"[timing] pipeline load + dequant: {_load_elapsed:.1f}s", flush=True)

# The local prompt-enhancer LM head is grafted lazily by `pipe.upsample_prompt` on first use (onto the worker's
# GPU), so no explicit load is needed here. Local is only the fallback; Ideogram's remote API is the default.

# Pre-fetch the AOTI package AND pre-warm torch-inductor's CPU-ISA probe in the PARENT (repo cold start). The
# probe (valid_vec_isa_list) compiles test programs (~20s) the first time aoti_blocks_load builds a LazyAOTIModel;
# doing it once here means every ZeroGPU fork inherits the functools.cache, so per-worker (function cold start)
# aoti_blocks_load is just the ~instant block patch instead of a ~20s compile.
try:
    hf_hub_download(AOTI_REPO, "package.pt2", subfolder="Ideogram4TransformerBlock")
    from torch._inductor.cpu_vec_isa import valid_vec_isa_list

    t = time.perf_counter()
    valid_vec_isa_list()
    print(f"[timing] vec-isa prewarm (parent): {time.perf_counter() - t:.1f}s", flush=True)
    AOTI_OK = True
except Exception as e:  # best-effort: any failure here just means generation runs eager
    AOTI_OK = False
    print(f"[aoti] prefetch/prewarm failed, running eager: {e!r}", flush=True)

# Set to True once `_apply_aoti` has patched both transformers in this process.
_AOTI_APPLIED = False


def _apply_aoti():
    """Patch the compiled block onto every Ideogram4TransformerBlock of both transformers (once per worker).

    `aoti_blocks_load` is lazy (binds forward, defers the .so to first diffusion step) and CPU-only, so this
    is safe to run in a background thread overlapping the (transformer-idle) upsampling step.

    Does nothing when the parent-side prefetch/prewarm failed (`AOTI_OK` is False) or when the patch was
    already applied. Any exception is logged and swallowed so generation can still proceed eagerly; in that
    case `_AOTI_APPLIED` stays False and the next call tries again.
    """
    global _AOTI_APPLIED
    if _AOTI_APPLIED or not AOTI_OK:
        return
    try:
        t = time.perf_counter()
        spaces.aoti_blocks_load(pipe.transformer, AOTI_REPO)
        spaces.aoti_blocks_load(pipe.unconditional_transformer, AOTI_REPO)
        _AOTI_APPLIED = True
        print(f"[timing] aoti_blocks_load (both transformers): {time.perf_counter() - t:.2f}s", flush=True)
    except Exception as e:  # never let a bind hiccup block generation
        print(f"[aoti] apply failed, running eager: {e!r}", flush=True)


def _aspect_ratio(width, height):
    """Return the reduced ``WxH`` aspect-ratio string, e.g. ``1536, 1024 -> "3x2"``."""
    divisor = math.gcd(width, height) or 1  # gcd(0, 0) == 0, so fall back to 1 to avoid dividing by zero
    return f"{width // divisor}x{height // divisor}"


def _caption_from_json_prompt(json_prompt):
    """Serialize the API's ``json_prompt`` object into the compact caption string fed to the pipeline.

    Drops the top-level ``aspect_ratio`` key and the ``bbox`` key of every dict element under
    ``compositional_deconstruction.elements``. Missing or ``null`` intermediate values are left untouched
    rather than raising. The input object is not modified.

    Raises:
        RuntimeError: If ``json_prompt`` is empty/missing or is not a JSON object.
    """
    if not json_prompt:
        raise RuntimeError("Ideogram API returned no json_prompt")
    if not isinstance(json_prompt, dict):
        raise RuntimeError(f"Ideogram API returned a non-object json_prompt: {type(json_prompt).__name__}")

    caption = {key: value for key, value in json_prompt.items() if key != "aspect_ratio"}
    deconstruction = caption.get("compositional_deconstruction")
    if isinstance(deconstruction, dict) and isinstance(deconstruction.get("elements"), list):
        stripped = [
            {key: value for key, value in element.items() if key != "bbox"} if isinstance(element, dict) else element
            for element in deconstruction["elements"]
        ]
        caption["compositional_deconstruction"] = {**deconstruction, "elements": stripped}
    return json.dumps(caption, ensure_ascii=False, separators=(",", ":"))


def remote_upsample(prompt, width, height):
    """Rewrite the prompt into Ideogram's native JSON caption via the hosted magic-prompt API.

    Args:
        prompt: Raw user prompt.
        width: Target image width in pixels (int).
        height: Target image height in pixels (int).

    Returns:
        The caption as a compact JSON string, with ``aspect_ratio`` and per-element ``bbox`` removed.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx response.
        RuntimeError: If the response carries no usable ``json_prompt`` object.
    """
    resp = requests.post(
        IDEOGRAM_MAGIC_PROMPT_URL,
        headers={"Api-Key": IDEOGRAM_API_KEY, "Content-Type": "application/json"},
        json={"text_prompt": prompt, "aspect_ratio": _aspect_ratio(width, height)},  # Ideogram's WxH form
        timeout=120,
    )
    resp.raise_for_status()
    payload = resp.json()
    # Guard against a non-object body so the failure is a clear RuntimeError instead of an AttributeError.
    json_prompt = payload.get("json_prompt") if isinstance(payload, dict) else None
    return _caption_from_json_prompt(json_prompt)
# --- Dynamic GPU duration ---------------------------------------------------------------------------------
# Per-step diffusion time, linear in image tokens between the two measured anchors (1024 @ 1.10 it/s,
# 2048 @ 6 s/it). The chord overestimates in between, so it's a safe budget; clamped low for small images.
# Remote upsample is a network call done OFF the GPU (in `generate`), so it isn't budgeted here.
_TOK_1024, _TOK_2048 = (1024 // 16) ** 2, (2048 // 16) ** 2  # 4096, 16384 image tokens
_PS_1024, _PS_2048 = 1.0 / 1.10, 6.0  # measured seconds/iteration
_PS_B = (_PS_2048 - _PS_1024) / (_TOK_2048 - _TOK_1024)  # slope: seconds per step per image token
_PS_A = _PS_1024 - _PS_B * _TOK_1024  # intercept of the line through both anchors
LOCAL_UPSAMPLE_S = 15  # local Qwen graft+generate (~12s) with headroom
DIFFUSION_OVERHEAD_S = 8  # .so dlopen + block patch + cudnn setup on a cold worker's first forward
DURATION_MARGIN = 1.3

# Preset used whenever a mode name is missing from MODES; also the UI's initial selection.
DEFAULT_MODE = "Default · 20 steps"


def _mode_preset(mode):
    """Return the MODES preset for `mode`, falling back to the default preset for unknown names."""
    return MODES.get(mode, MODES[DEFAULT_MODE])


def _per_step(width, height):
    """Estimate seconds per diffusion step for a `width` x `height` image (never below 0.2s)."""
    image_tokens = (int(width) // 16) * (int(height) // 16)
    return max(0.2, _PS_A + _PS_B * image_tokens)


def _gpu_duration(final_prompt, mode, width, height, seed, do_local, progress=None):
    """Compute the GPU time budget (whole seconds, at least 60) for one `_gpu_generate` call.

    The signature mirrors `_gpu_generate` because this function is handed to `spaces.GPU` as its
    `duration` callable; only `mode`, `width`, `height` and `do_local` influence the result.
    """
    steps = _mode_preset(mode)["num_inference_steps"]
    budget = steps * _per_step(width, height) + DIFFUSION_OVERHEAD_S
    if do_local:
        budget += LOCAL_UPSAMPLE_S
    return max(60, int(math.ceil(budget * DURATION_MARGIN)))


@spaces.GPU(duration=_gpu_duration, size="xlarge")
def _gpu_generate(final_prompt, mode, width, height, seed, do_local, progress=gr.Progress(track_tqdm=True)):
    """Run the on-GPU part of a request: optional local prompt upsampling, then diffusion.

    Args:
        final_prompt: Prompt to render; already upsampled when `do_local` is False.
        mode: Key into MODES (unknown keys use the default preset).
        width, height: Output size in pixels.
        seed: Seed for the CUDA generator.
        do_local: When True, upsample `final_prompt` with the local Qwen graft first.
        progress: Gradio progress tracker.

    Returns:
        `(image, seed, caption)`, where `caption` is the parsed JSON caption when `final_prompt` is a JSON
        object, otherwise `{"prompt": final_prompt}`.
    """
    # Overlap the AOTI block-patch with the (transformer-idle) local upsample, if any.
    aoti_thread = Thread(target=_apply_aoti, daemon=True)
    aoti_thread.start()

    if do_local:
        progress(0.0, desc="✍️ Upsampling (local Qwen)…")
        t = time.perf_counter()
        try:
            final_prompt = pipe.upsample_prompt(
                final_prompt, height=int(height), width=int(width), lm_head_repo_id=LM_HEAD_REPO
            )[0]
            print(f"[timing] upsample local: {time.perf_counter() - t:.2f}s", flush=True)
        except Exception as e:  # best-effort: fall through and render the raw prompt
            print(f"[upsample] local failed: {e!r}", flush=True)
            gr.Warning("Local upsampler unavailable — generating from the raw prompt.")

    aoti_thread.join()  # ensure blocks are patched before the diffusion loop
    progress(0.0, desc="🎨 Generating image…")
    generator = torch.Generator(device="cuda").manual_seed(int(seed))
    preset = _mode_preset(mode)
    t = time.perf_counter()
    image = pipe(prompt=final_prompt, width=int(width), height=int(height), generator=generator, **preset).images[0]
    print(f"[timing] diffusion ({mode}): {time.perf_counter() - t:.2f}s", flush=True)

    try:
        caption = json.loads(final_prompt)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        caption = None
    if not isinstance(caption, dict):
        # A raw prompt such as "42" or "null" is valid JSON but not a caption object; show it as plain text.
        caption = {"prompt": final_prompt}
    return image, int(seed), caption


def generate(
    prompt,
    mode=DEFAULT_MODE,
    upsampler=UPSAMPLERS[0],
    width=1024,
    height=1024,
    seed=0,
    randomize_seed=False,
    progress=gr.Progress(track_tqdm=True),
):
    """Gradio entry point: pick a seed, upsample the prompt (remote first), then render on the GPU.

    Args:
        prompt: Raw user prompt.
        mode: Key into MODES.
        upsampler: One of UPSAMPLERS; the remote one is used only when IDEOGRAM_API_KEY is set.
        width, height: Output size in pixels.
        seed: Requested seed; replaced by a random one when `randomize_seed` is set, or when the value is
            missing (`None`, e.g. a cleared Seed field) or negative.
        randomize_seed: Force a fresh random seed.
        progress: Gradio progress tracker.

    Returns:
        `(image, seed, caption)` as produced by `_gpu_generate`.
    """
    if randomize_seed or seed is None or seed < 0:
        seed = random.randint(0, MAX_SEED)

    # Remote upsample is a network call -> run it here, OFF the GPU. Fall back to local (on-GPU) on failure.
    final_prompt, do_local = prompt, True
    if upsampler == UPSAMPLERS[0] and IDEOGRAM_API_KEY:
        progress(0.0, desc="✍️ Upsampling (Ideogram)…")
        t = time.perf_counter()
        try:
            final_prompt = remote_upsample(prompt, int(width), int(height))
            do_local = False
            print(f"[timing] upsample remote (off-GPU): {time.perf_counter() - t:.2f}s", flush=True)
        except Exception as e:  # any remote failure degrades to the local upsampler
            print(f"[upsample] remote failed, falling back to local: {e!r}", flush=True)
            gr.Warning("Ideogram API unavailable — using the local Qwen upsampler.")

    return _gpu_generate(final_prompt, mode, width, height, seed, do_local)


@spaces.GPU(size="xlarge")
def _warmup():
    """Warm the local upsampler (lazy LM-head graft) on the startup worker (no diffusion)."""
    _apply_aoti()  # returns immediately when the AOTI prefetch failed or the patch is already applied
    t = time.perf_counter()
    pipe.upsample_prompt("a red apple on a wooden table", height=1024, width=1024, lm_head_repo_id=LM_HEAD_REPO)
    print(f"[timing] warmup upsample: {time.perf_counter() - t:.2f}s", flush=True)


try:
    _warmup()
except Exception as e:  # a flaky ZeroGPU worker must not take down the Space
    print(f"[warmup] failed (will warm lazily on first request): {e!r}", flush=True)

CSS = '''
.dark .gradio-container { color: var(--body-text-color); }
'''

# NOTE(review): widget nesting below was reconstructed from a whitespace-collapsed source — confirm the
# placement of gr.Examples (here: directly under gr.Blocks, below the two-column row).
with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4", css=CSS) as demo:
    gr.Markdown(
        "# Ideogram 4\n"
        "Ideogram's first open-weights model — a 9.3B-parameter text-to-image foundation model at the "
        "forefront of design, with best-in-class text rendering.\n\n"
        "[Model](https://huggingface.co/ideogram-ai/ideogram-4-nf4) · "
        "[Model (fp8)](https://huggingface.co/ideogram-ai/ideogram-4-fp8) · "
        "[Blog](https://ideogram.ai/blog/ideogram-4.0/)"
    )
    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(label="Prompt", value="a ginger cat wearing a tiny wizard hat reading a spellbook", lines=3)
            mode = gr.Radio(choices=list(MODES.keys()), value=DEFAULT_MODE, label="Mode (speed ↔ quality)")
            run = gr.Button("Generate", variant="primary")
            with gr.Accordion("Advanced", open=False):
                upsampler = gr.Radio(
                    choices=UPSAMPLERS,
                    value=UPSAMPLERS[0],
                    label="Prompt upsampler",
                    info="Rewrite into Ideogram's native JSON caption. Remote (Ideogram) preferred; falls back to local.",
                )
                with gr.Row():
                    width = gr.Slider(512, 2048, value=1024, step=64, label="Width")
                    height = gr.Slider(512, 2048, value=1024, step=64, label="Height")
                with gr.Row():
                    seed = gr.Number(label="Seed", value=0, precision=0)
                    randomize = gr.Checkbox(label="Randomize seed", value=True)
        with gr.Column():
            out_image = gr.Image(label="Output", type="pil")
            out_caption = gr.JSON(label="Caption fed to the model (upsampled when enabled)")

    # Examples pass only the prompt, so `generate` runs them with its keyword defaults.
    gr.Examples(
        examples=[
            ["a ginger cat wearing a tiny wizard hat reading a spellbook"],
            ["an isometric illustration of a tiny city floating in the clouds"],
            ["a golden retriever on a skateboard"],
        ],
        inputs=[prompt],
        outputs=[out_image, seed, out_caption],
        fn=generate,
        cache_examples=True,
        cache_mode="lazy",
    )
    run.click(
        generate,
        inputs=[prompt, mode, upsampler, width, height, seed, randomize],
        outputs=[out_image, seed, out_caption],
    )

demo.launch()