Spaces:
Running on Zero
Running on Zero
| import os | |
| os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") | |
| # outlines_core ships an @torch.compile bitmask kernel dynamo can't trace (torch.device const) -> noisy | |
| # WON'T CONVERT spam on every local upsample. We never use torch.compile at runtime, so disable dynamo. | |
| os.environ.setdefault("TORCHDYNAMO_DISABLE", "1") | |
| # diffusers (with Ideogram4 support) is pip-installed from the PR — see requirements.txt. No bundled source. | |
| import json | |
| import random | |
| import time | |
| from threading import Thread | |
| import gradio as gr | |
| import requests | |
| import spaces | |
| import torch | |
| from huggingface_hub import hf_hub_download | |
| from diffusers import Ideogram4Pipeline | |
| # Runtime shim (keeps the pip-installed diffusers untouched on disk): cu130-era bitsandbytes returns | |
| # Params4bit.shape as a plain tuple, but diffusers' check_quantized_param_shape calls .numel() on it. | |
| # math.prod handles both tuples and torch.Size, so this override stays harmless once diffusers/bnb fix it upstream. | |
| import math # noqa: E402 | |
| from diffusers.quantizers.bitsandbytes.bnb_quantizer import BnB4BitDiffusersQuantizer # noqa: E402 | |
def _check_quantized_param_shape(self, param_name, current_param, loaded_param):
    """Validate the flattened shape of a loaded 4-bit parameter.

    The element count is taken with ``math.prod`` over ``current_param.shape``, so the
    shape may be a plain tuple or any other iterable of ints.

    Args:
        self: The quantizer instance (unused here).
        param_name: Parameter name; names containing ``"bias"`` are expected to be
            flat ``(n,)``, everything else ``((n + 1) // 2, 1)`` (presumably two
            4-bit values packed per byte — confirm against bitsandbytes).
        current_param: Object exposing ``.shape`` for the model's parameter.
        loaded_param: Object exposing ``.shape`` for the checkpoint tensor.

    Returns:
        ``True`` when the loaded shape matches the expected one.

    Raises:
        ValueError: If the loaded shape differs from the expected flattened shape.
    """
    numel = math.prod(tuple(current_param.shape))
    if "bias" in param_name:
        expected = (numel,)
    else:
        expected = ((numel + 1) // 2, 1)
    actual = tuple(loaded_param.shape)
    if actual == expected:
        return True
    raise ValueError(f"Expected flattened shape of {param_name} to be {expected}, got {actual}.")
# Install the shim: replace the quantizer class's shape check with the tuple-safe
# `_check_quantized_param_shape`, so every instance created afterwards uses it.
BnB4BitDiffusersQuantizer.check_quantized_param_shape = _check_quantized_param_shape
# Hub repositories: the nf4 pipeline, the LM head passed to the local prompt upsampler,
# and the ahead-of-time compiled transformer block.
MODEL_ID = "ideogram-ai/ideogram-4-nf4"
LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
AOTI_REPO = "multimodalart/i4-block-aoti"
AOTI_BLOCK_FILE = "Ideogram4TransformerBlock/package.pt2"  # path of the package inside AOTI_REPO

# Largest seed handed out when the seed is randomized (signed 32-bit maximum).
MAX_SEED = (1 << 31) - 1

# Prompt upsampling: Ideogram's hosted magic-prompt (default) with the local Qwen graft as fallback.
IDEOGRAM_MAGIC_PROMPT_URL = "https://api.ideogram.ai/v1/ideogram-v4/magic-prompt"
IDEOGRAM_API_KEY = os.getenv("IDEOGRAM_API_KEY")  # None when the secret is not configured
UPSAMPLERS = ["Ideogram (remote)", "Qwen (local)"]  # index 0 is the default choice
# V4 presets (forward step-order: main CFG 7.0 -> polish 3.0).
# Each row is (label, total steps, trailing polish steps, mu, std). The guidance schedule has one
# entry per step: 7.0 for the main steps followed by 3.0 for the polish steps. Every preset dict is
# later splatted into the pipeline call as keyword arguments.
MODES = {
    label: dict(
        num_inference_steps=steps,
        guidance_schedule=(7.0,) * (steps - polish) + (3.0,) * polish,
        mu=mu,
        std=std,
    )
    for label, steps, polish, mu, std in (
        ("Turbo · 12 steps", 12, 1, 0.5, 1.75),
        ("Default · 20 steps", 20, 2, 0.0, 1.75),
        ("Quality · 48 steps", 48, 3, 0.0, 1.5),
    )
}
# --- Pipeline: load the nf4 checkpoint, then dequantize both transformers nf4 -> bf16 in the parent
# (CPU) so AOTI can bind its weight-less graph to real bf16 weights. This cost is paid at repo cold
# start, which is fine; function cold start stays fast. ---
_load_started = time.perf_counter()
pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
# Conditional transformer first, then the unconditional one; both must be dequantized before the move.
for _transformer in (pipe.transformer, pipe.unconditional_transformer):
    _transformer.dequantize()
pipe.to("cuda")
print(f"[timing] pipeline load + dequant: {time.perf_counter() - _load_started:.1f}s", flush=True)
| # The local prompt-enhancer LM head is grafted lazily by `pipe.upsample_prompt` on first use (onto the worker's | |
| # GPU), so no explicit load is needed here. Local is only the fallback; Ideogram's remote API is the default. | |
# Pre-fetch the AOTI package AND pre-warm torch-inductor's CPU-ISA probe in the PARENT (repo cold start). The
# probe (valid_vec_isa_list) compiles test programs (~20s) the first time aoti_blocks_load builds a LazyAOTIModel;
# doing it once here means every ZeroGPU fork inherits the functools.cache, so per-worker (function cold start)
# aoti_blocks_load is just the ~instant block patch instead of a ~20s compile.
# AOTI_OK records whether both steps succeeded; when False the app runs the eager blocks.
try:
    # Address the package through the shared AOTI_BLOCK_FILE constant ("<subfolder>/<filename>")
    # instead of repeating the path literal, so the two can never drift apart.
    hf_hub_download(AOTI_REPO, AOTI_BLOCK_FILE)
    # Private torch API: imported lazily so a missing/renamed symbol lands in the except below.
    from torch._inductor.cpu_vec_isa import valid_vec_isa_list

    t = time.perf_counter()
    valid_vec_isa_list()
    print(f"[timing] vec-isa prewarm (parent): {time.perf_counter() - t:.1f}s", flush=True)
except Exception as e:
    AOTI_OK = False
    print(f"[aoti] prefetch/prewarm failed, running eager: {e!r}", flush=True)
else:
    AOTI_OK = True
# Per-process flag: flips to True once both transformers have been patched successfully.
_AOTI_APPLIED = False


def _apply_aoti():
    """Bind the compiled block onto both transformers, at most once per process.

    Does nothing when the startup prefetch failed (``AOTI_OK`` is false) or when a previous
    call already succeeded. Per the original notes, ``aoti_blocks_load`` is lazy (it binds
    forward and defers loading the .so to the first diffusion step) and CPU-only, which is why
    it is safe to run in a background thread while the transformers are idle during upsampling.

    Any exception is logged and swallowed so generation continues on the eager blocks; the
    flag then stays False and the next call tries again.
    """
    global _AOTI_APPLIED
    if not AOTI_OK or _AOTI_APPLIED:
        return
    try:
        started = time.perf_counter()
        for model in (pipe.transformer, pipe.unconditional_transformer):
            spaces.aoti_blocks_load(model, AOTI_REPO)
        _AOTI_APPLIED = True
        print(f"[timing] aoti_blocks_load (both transformers): {time.perf_counter() - started:.2f}s", flush=True)
    except Exception as err:  # never let a bind hiccup block generation
        print(f"[aoti] apply failed, running eager: {err!r}", flush=True)
def remote_upsample(prompt, width, height):
    """Rewrite the prompt into Ideogram's native JSON caption via the hosted magic-prompt API.

    Args:
        prompt: The user's raw text prompt.
        width: Target image width as an int.
        height: Target image height as an int.

    Returns:
        The caption as a compact JSON string, with the top-level ``aspect_ratio`` and every
        element's ``bbox`` removed.

    Raises:
        RuntimeError: If no API key is configured, or the response carries no usable
            ``json_prompt`` object.
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Fail fast instead of spending a network round trip on a request that cannot authenticate.
    if not IDEOGRAM_API_KEY:
        raise RuntimeError("IDEOGRAM_API_KEY is not set")

    # Reduce WxH by the gcd; `or 1` guards the 0x0 case where gcd is 0.
    d = math.gcd(width, height) or 1
    aspect_ratio = f"{width // d}x{height // d}"  # Ideogram's WxH form
    resp = requests.post(
        IDEOGRAM_MAGIC_PROMPT_URL,
        headers={"Api-Key": IDEOGRAM_API_KEY, "Content-Type": "application/json"},
        json={"text_prompt": prompt, "aspect_ratio": aspect_ratio},
        timeout=120,
    )
    resp.raise_for_status()

    payload = resp.json()
    jp = payload.get("json_prompt") if isinstance(payload, dict) else None
    if not jp:
        raise RuntimeError("Ideogram API returned no json_prompt")
    if not isinstance(jp, dict):
        # Surface a clear error rather than an AttributeError from the .pop()/.get() calls below.
        raise RuntimeError(f"Ideogram API returned a non-object json_prompt ({type(jp).__name__})")

    jp.pop("aspect_ratio", None)
    # Tolerate a missing or null "compositional_deconstruction" / "elements" instead of crashing.
    deconstruction = jp.get("compositional_deconstruction")
    elements = deconstruction.get("elements") if isinstance(deconstruction, dict) else None
    for el in elements if isinstance(elements, list) else ():
        if isinstance(el, dict):
            el.pop("bbox", None)
    return json.dumps(jp, ensure_ascii=False, separators=(",", ":"))
# --- Dynamic GPU duration ---------------------------------------------------------------------------------
# Per-step diffusion time is modelled as a straight line in the number of image tokens, drawn through the
# two measured anchors (1024 @ 1.10 it/s, 2048 @ 6 s/it). The chord overestimates in between, so it's a
# safe budget; the estimate is clamped low for small images.
# Remote upsample is a network call done OFF the GPU (in `generate`), so it isn't budgeted here.
_TOK_1024 = (1024 // 16) ** 2  # 4096 image tokens
_TOK_2048 = (2048 // 16) ** 2  # 16384 image tokens
_PS_1024 = 1.0 / 1.10  # measured seconds/iteration at the 1024 anchor
_PS_2048 = 6.0  # measured seconds/iteration at the 2048 anchor
_PS_B = (_PS_2048 - _PS_1024) / (_TOK_2048 - _TOK_1024)  # slope: seconds per step per token
_PS_A = _PS_1024 - _PS_B * _TOK_1024  # intercept: seconds per step at zero tokens
LOCAL_UPSAMPLE_S = 15  # local Qwen graft+generate (~12s) with headroom
DIFFUSION_OVERHEAD_S = 8  # .so dlopen + block patch + cudnn setup on a cold worker's first forward
DURATION_MARGIN = 1.3  # multiplicative safety factor applied to the whole budget
def _per_step(width, height):
    """Estimate the seconds one diffusion step takes for a ``width`` x ``height`` image.

    Token count is ``(width // 16) * (height // 16)``; the estimate is the linear model
    ``_PS_A + _PS_B * tokens``, floored at 0.2 s so small images never yield a tiny or
    negative value.
    """
    tokens = (int(width) // 16) * (int(height) // 16)
    estimate = _PS_A + _PS_B * tokens
    return estimate if estimate > 0.2 else 0.2
def _gpu_duration(final_prompt, mode, width, height, seed, do_local, progress=None):
    """Return the GPU time budget, in whole seconds, for one generation request.

    The parameter list mirrors ``_gpu_generate``; only ``mode``, ``width``, ``height`` and
    ``do_local`` influence the result (``final_prompt``, ``seed`` and ``progress`` are accepted
    and ignored). NOTE(review): presumably meant to be the ZeroGPU dynamic ``duration``
    callback for ``_gpu_generate`` — confirm where it is registered.

    Budget = steps * per-step estimate + fixed diffusion overhead (+ local upsample time when
    ``do_local`` is truthy), scaled by ``DURATION_MARGIN``, rounded up, and never below 60.
    Unknown ``mode`` values fall back to the "Default · 20 steps" preset.
    """
    preset = MODES.get(mode, MODES["Default · 20 steps"])
    upsample_seconds = LOCAL_UPSAMPLE_S if do_local else 0
    seconds = preset["num_inference_steps"] * _per_step(width, height) + DIFFUSION_OVERHEAD_S + upsample_seconds
    padded = int(math.ceil(seconds * DURATION_MARGIN))
    return padded if padded > 60 else 60
def _caption_from_prompt(final_prompt):
    """Return the caption payload displayed next to the image for ``final_prompt``.

    A prompt that parses as a JSON object or array is returned parsed. Anything else —
    plain text, or text that merely happens to be a JSON scalar such as ``2024`` or
    ``true`` — is wrapped as ``{"prompt": final_prompt}`` so the caption is always structured.
    """
    try:
        parsed = json.loads(final_prompt)
    except (TypeError, ValueError, RecursionError):  # JSONDecodeError is a ValueError
        return {"prompt": final_prompt}
    if isinstance(parsed, (dict, list)):
        return parsed
    return {"prompt": final_prompt}


# Run on a ZeroGPU worker. The time budget is computed per call by `_gpu_duration`, whose
# signature mirrors this function's so it can receive the same arguments.
@spaces.GPU(duration=_gpu_duration)
def _gpu_generate(final_prompt, mode, width, height, seed, do_local, progress=gr.Progress(track_tqdm=True)):
    """Optionally upsample the prompt locally, then run diffusion.

    Args:
        final_prompt: Prompt to render; already upsampled when ``do_local`` is false.
        mode: Key into ``MODES``; unknown values fall back to "Default · 20 steps".
        width: Image width (converted with ``int``).
        height: Image height (converted with ``int``).
        seed: Seed for the CUDA generator (converted with ``int``).
        do_local: When truthy, rewrite ``final_prompt`` with the local upsampler first.
        progress: Gradio progress reporter.

    Returns:
        ``(image, seed, caption)`` — the first generated image, the integer seed used, and
        the caption payload built from the prompt that was actually rendered.
    """
    # Overlap the AOTI block-patch with the (transformer-idle) local upsample, if any.
    aoti_thread = Thread(target=_apply_aoti, daemon=True)
    aoti_thread.start()
    if do_local:
        progress(0.0, desc="✍️ Upsampling (local Qwen)…")
        t = time.perf_counter()
        try:
            final_prompt = pipe.upsample_prompt(
                final_prompt, height=int(height), width=int(width), lm_head_repo_id=LM_HEAD_REPO
            )[0]
            print(f"[timing] upsample local: {time.perf_counter() - t:.2f}s", flush=True)
        except Exception as e:
            # Best effort: keep the raw prompt and tell the user, rather than failing the request.
            print(f"[upsample] local failed: {e!r}", flush=True)
            gr.Warning("Local upsampler unavailable — generating from the raw prompt.")
    aoti_thread.join()  # ensure blocks are patched before the diffusion loop
    progress(0.0, desc="🎨 Generating image…")
    generator = torch.Generator(device="cuda").manual_seed(int(seed))
    preset = MODES.get(mode, MODES["Default · 20 steps"])
    t = time.perf_counter()
    image = pipe(prompt=final_prompt, width=int(width), height=int(height), generator=generator, **preset).images[0]
    print(f"[timing] diffusion ({mode}): {time.perf_counter() - t:.2f}s", flush=True)
    return image, int(seed), _caption_from_prompt(final_prompt)
def generate(
    prompt,
    mode="Default · 20 steps",
    upsampler=UPSAMPLERS[0],
    width=1024,
    height=1024,
    seed=0,
    randomize_seed=False,
    progress=gr.Progress(track_tqdm=True),
):
    """Gradio entry point: pick a seed, upsample the prompt, then generate the image.

    Args:
        prompt: The user's raw text prompt.
        mode: Key into ``MODES`` selecting the step/guidance preset.
        upsampler: One of ``UPSAMPLERS``; the first entry selects the remote Ideogram API.
        width: Image width.
        height: Image height.
        seed: Seed to use. ``None`` (an emptied number field) or a negative value is
            replaced by a random seed.
        randomize_seed: When truthy, always draw a random seed.
        progress: Gradio progress reporter.

    Returns:
        ``(image, seed, caption)`` as produced by ``_gpu_generate``.
    """
    # A cleared Number field arrives as None; treat it like "no seed given" instead of
    # crashing on `None < 0`.
    if randomize_seed or seed is None or seed < 0:
        seed = random.randint(0, MAX_SEED)
    # Remote upsample is a network call -> run it here, OFF the GPU. Fall back to local (on-GPU) on failure.
    final_prompt, do_local = prompt, True
    if upsampler == UPSAMPLERS[0] and IDEOGRAM_API_KEY:
        progress(0.0, desc="✍️ Upsampling (Ideogram)…")
        t = time.perf_counter()
        try:
            final_prompt = remote_upsample(prompt, int(width), int(height))
            do_local = False
            print(f"[timing] upsample remote (off-GPU): {time.perf_counter() - t:.2f}s", flush=True)
        except Exception as e:
            print(f"[upsample] remote failed, falling back to local: {e!r}", flush=True)
            gr.Warning("Ideogram API unavailable — using the local Qwen upsampler.")
    return _gpu_generate(final_prompt, mode, width, height, seed, do_local)
def _warmup():
    """Exercise the local prompt upsampler once, without running diffusion.

    Applies the AOTI patch first, then upsamples a fixed sample prompt at 1024x1024 and logs
    how long that took. The upsampled text is discarded; exceptions propagate to the caller.
    NOTE(review): presumably this has to execute on a GPU worker under ZeroGPU — confirm how
    it is scheduled.
    """
    _apply_aoti()  # no-op while AOTI is disabled
    started = time.perf_counter()
    pipe.upsample_prompt(
        "a red apple on a wooden table",
        height=1024,
        width=1024,
        lm_head_repo_id=LM_HEAD_REPO,
    )
    elapsed = time.perf_counter() - started
    print(f"[timing] warmup upsample: {elapsed:.2f}s", flush=True)
# Run the warm-up once at import time. It is strictly best-effort: any failure is logged and
# ignored so the app still starts.
try:
    _warmup()
except Exception as e:  # a flaky ZeroGPU worker must not take down the Space
    print(f"[warmup] failed (will warm lazily on first request): {e!r}", flush=True)
# Stylesheet override handed to gr.Blocks: in dark mode, container text uses the theme's body text color.
CSS='''
.dark .gradio-container { color: var(--body-text-color); }
'''
# --- UI -------------------------------------------------------------------------------------------------
# NOTE(review): the nesting below was reconstructed from a flattened copy of this file — confirm that the
# layout (what sits inside which Row/Column/Accordion) matches the deployed app.
with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4", css=CSS) as demo:
    # Header: title, one-line description and links.
    gr.Markdown(
        "# Ideogram 4\n"
        "Ideogram's first open-weights model — a 9.3B-parameter text-to-image foundation model at the "
        "forefront of design, with best-in-class text rendering.\n\n"
        "[Model](https://huggingface.co/ideogram-ai/ideogram-4-nf4) · "
        "[Model (fp8)](https://huggingface.co/ideogram-ai/ideogram-4-fp8) · "
        "[Blog](https://ideogram.ai/blog/ideogram-4.0/)"
    )
    with gr.Row():
        # Input column: prompt, preset selector, run button and the advanced settings.
        with gr.Column():
            prompt = gr.Textbox(label="Prompt", value="a ginger cat wearing a tiny wizard hat reading a spellbook", lines=3)
            mode = gr.Radio(choices=list(MODES.keys()), value="Default · 20 steps", label="Mode (speed ↔ quality)")
            run = gr.Button("Generate", variant="primary")
            with gr.Accordion("Advanced", open=False):
                upsampler = gr.Radio(
                    choices=UPSAMPLERS,
                    value=UPSAMPLERS[0],
                    label="Prompt upsampler",
                    info="Rewrite into Ideogram's native JSON caption. Remote (Ideogram) preferred; falls back to local.",
                )
                with gr.Row():
                    width = gr.Slider(512, 2048, value=1024, step=64, label="Width")
                    height = gr.Slider(512, 2048, value=1024, step=64, label="Height")
                with gr.Row():
                    seed = gr.Number(label="Seed", value=0, precision=0)
                    randomize = gr.Checkbox(label="Randomize seed", value=True)
        # Output column: the image and the caption that was fed to the model.
        with gr.Column():
            out_image = gr.Image(label="Output", type="pil")
            out_caption = gr.JSON(label="Caption fed to the model (upsampled when enabled)")
    # Examples supply only the prompt, so `generate` runs them with its own defaults for every
    # other argument. Results are cached lazily.
    gr.Examples(
        examples=[
            ["a ginger cat wearing a tiny wizard hat reading a spellbook"],
            ["an isometric illustration of a tiny city floating in the clouds"],
            ["a golden retriever on a skateboard"],
        ],
        inputs=[prompt],
        outputs=[out_image, seed, out_caption],
        fn=generate,
        cache_examples=True,
        cache_mode="lazy",
    )
    # The seed field is also an output, so the seed actually used is written back to the UI.
    run.click(
        generate,
        inputs=[prompt, mode, upsampler, width, height, seed, randomize],
        outputs=[out_image, seed, out_caption],
    )
# Start the app.
demo.launch()