Spaces:

ideogram-ai
/

ideogram4

Running on Zero

App Files Files Community

ideogram4 / app.py

multimodalart HF Staff

Fix CSS and remove queue

b6b3100 verified about 18 hours ago

raw

history blame contribute delete

12.9 kB

	import os

	# Both variables are set before `torch` is imported further down. `setdefault` keeps any value that is
	# already present in the environment, so an operator-provided setting wins over these defaults.
	os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
	# outlines_core ships an @torch.compile bitmask kernel dynamo can't trace (torch.device const) -> noisy
	# WON'T CONVERT spam on every local upsample. We never use torch.compile at runtime, so disable dynamo.
	os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")

	# diffusers (with Ideogram4 support) is pip-installed from the PR — see requirements.txt. No bundled source.

	import json
	import random
	import time
	from threading import Thread

	import gradio as gr
	import requests
	import spaces
	import torch
	from huggingface_hub import hf_hub_download

	from diffusers import Ideogram4Pipeline

	# Runtime shim (keeps the pip-installed diffusers pristine): cu130-era bitsandbytes returns Params4bit.shape as a
	# plain tuple, but diffusers' check_quantized_param_shape calls .numel() on it. math.prod handles both, so
	# this override stays correct (merely redundant) once diffusers/bnb fix it upstream.
	import math # noqa: E402

	from diffusers.quantizers.bitsandbytes.bnb_quantizer import BnB4BitDiffusersQuantizer # noqa: E402


	def _check_quantized_param_shape(self, param_name, current_param, loaded_param):
	    """Validate the flattened shape of a loaded 4-bit parameter (replacement for the diffusers method).

	    The element count is taken with ``math.prod`` over ``current_param.shape``, so the shape may be a plain
	    tuple or any other iterable of ints. Parameters whose name contains ``"bias"`` are expected as ``(n,)``;
	    everything else as ``((n + 1) // 2, 1)`` (presumably two 4-bit values per stored element - verify
	    against bitsandbytes).

	    Returns:
	        True when ``loaded_param.shape`` matches the expected shape.

	    Raises:
	        ValueError: The loaded shape differs from the expected one.
	    """
	    numel = math.prod(tuple(current_param.shape))
	    if "bias" in param_name:
	        expected = (numel,)
	    else:
	        expected = ((numel + 1) // 2, 1)

	    actual = tuple(loaded_param.shape)
	    if actual == expected:
	        return True
	    raise ValueError(f"Expected flattened shape of {param_name} to be {expected}, got {actual}.")


	# Install the shim on the class itself, replacing the stock method for every quantizer instance. This
	# assignment runs before `Ideogram4Pipeline.from_pretrained` is called further down.
	BnB4BitDiffusersQuantizer.check_quantized_param_shape = _check_quantized_param_shape

	# Hub locations: the nf4 pipeline checkpoint, the LM head handed to `pipe.upsample_prompt`, and the repo
	# holding the AOTI-compiled transformer block (AOTI_BLOCK_FILE is its repo-relative package path).
	MODEL_ID = "ideogram-ai/ideogram-4-nf4"
	LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
	AOTI_REPO = "multimodalart/i4-block-aoti"
	AOTI_BLOCK_FILE = "Ideogram4TransformerBlock/package.pt2"
	# Inclusive upper bound for randomly drawn seeds (`random.randint(0, MAX_SEED)` in `generate`).
	MAX_SEED = 2**31 - 1

	# Prompt upsampling: Ideogram's hosted magic-prompt (default) with the local Qwen graft as fallback.
	IDEOGRAM_MAGIC_PROMPT_URL = "https://api.ideogram.ai/v1/ideogram-v4/magic-prompt"
	# None when the variable is unset; `generate` then skips the remote path and upsamples locally.
	IDEOGRAM_API_KEY = os.environ.get("IDEOGRAM_API_KEY")
	# Index 0 is the remote upsampler and the UI default; code compares against UPSAMPLERS[0].
	UPSAMPLERS = ["Ideogram (remote)", "Qwen (local)"]

	# V4 presets (forward step-order: main CFG 7.0 -> polish 3.0). Each value is splatted as keyword arguments
	# into the pipeline call, and every guidance_schedule has exactly num_inference_steps entries.
	MODES = {
	    "Turbo · 12 steps": {
	        "num_inference_steps": 12,
	        "guidance_schedule": (7.0,) * 11 + (3.0,),
	        "mu": 0.5,
	        "std": 1.75,
	    },
	    "Default · 20 steps": {
	        "num_inference_steps": 20,
	        "guidance_schedule": (7.0,) * 18 + (3.0, 3.0),
	        "mu": 0.0,
	        "std": 1.75,
	    },
	    "Quality · 48 steps": {
	        "num_inference_steps": 48,
	        "guidance_schedule": (7.0,) * 45 + (3.0, 3.0, 3.0),
	        "mu": 0.0,
	        "std": 1.5,
	    },
	}

	# --- Pipeline: dequantize both transformers nf4 -> bf16 in the parent (CPU) so AOTI can bind its weight-less
	# graph to real bf16 weights (this is repo cold start, which is fine; function cold start stays fast). ---
	# `t` is a scratch timer that is reused by later module-level timing sections.
	t = time.perf_counter()
	pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
	# Both the conditional and the unconditional transformer are dequantized; `_apply_aoti` patches both.
	pipe.transformer.dequantize()
	pipe.unconditional_transformer.dequantize()
	# NOTE(review): at module level on ZeroGPU this `.to("cuda")` is presumably intercepted by `spaces` and
	# applied on the GPU worker rather than in this process - confirm against the `spaces` documentation.
	pipe.to("cuda")
	print(f"[timing] pipeline load + dequant: {time.perf_counter() - t:.1f}s", flush=True)

	# The local prompt-enhancer LM head is grafted lazily by `pipe.upsample_prompt` on first use (onto the worker's
	# GPU), so no explicit load is needed here. Local is only the fallback; Ideogram's remote API is the default.

	# Pre-fetch the AOTI package AND pre-warm torch-inductor's CPU-ISA probe in the PARENT (repo cold start). The
	# probe (valid_vec_isa_list) compiles test programs (~20s) the first time aoti_blocks_load builds a LazyAOTIModel;
	# doing it once here means every ZeroGPU fork inherits the functools.cache, so per-worker (function cold start)
	# aoti_blocks_load is just the ~instant block patch instead of a ~20s compile.
	#
	# AOTI_OK records whether both steps succeeded; `_apply_aoti` skips patching (eager execution) when it is False.
	try:
	    # Use the shared constant rather than a second hard-coded copy of the path: a path-style filename
	    # resolves to the same file as filename="package.pt2" with subfolder="Ideogram4TransformerBlock".
	    hf_hub_download(AOTI_REPO, AOTI_BLOCK_FILE)
	    # Private torch API, imported lazily so a missing/renamed symbol only disables AOTI instead of
	    # crashing the Space at import time.
	    from torch._inductor.cpu_vec_isa import valid_vec_isa_list

	    t = time.perf_counter()
	    valid_vec_isa_list()
	    print(f"[timing] vec-isa prewarm (parent): {time.perf_counter() - t:.1f}s", flush=True)
	    AOTI_OK = True
	except Exception as e:
	    AOTI_OK = False
	    print(f"[aoti] prefetch/prewarm failed, running eager: {e!r}", flush=True)

	# Per-process flag flipped by `_apply_aoti` once both transformers have been patched.
	_AOTI_APPLIED = False


	def _apply_aoti():
	    """Bind the compiled block onto both transformers, at most once per process.

	    Does nothing when the parent-side prefetch failed (``AOTI_OK`` is False) or when a previous call already
	    succeeded (``_AOTI_APPLIED`` is True). Any exception is logged and swallowed so generation can continue
	    in eager mode; the flag is then left unset, so a later call tries again.

	    Per the original author's note, ``aoti_blocks_load`` is lazy (it binds ``forward`` and defers loading the
	    .so to the first diffusion step) and CPU-only, which is why callers may run this in a background thread
	    while the transformers are idle during upsampling.
	    """
	    global _AOTI_APPLIED
	    if not AOTI_OK or _AOTI_APPLIED:
	        return
	    try:
	        started = time.perf_counter()
	        for transformer in (pipe.transformer, pipe.unconditional_transformer):
	            spaces.aoti_blocks_load(transformer, AOTI_REPO)
	        _AOTI_APPLIED = True
	        print(f"[timing] aoti_blocks_load (both transformers): {time.perf_counter() - started:.2f}s", flush=True)
	    except Exception as exc:  # never let a bind hiccup block generation
	        print(f"[aoti] apply failed, running eager: {exc!r}", flush=True)


	def remote_upsample(prompt, width, height):
	    """Rewrite the prompt into Ideogram's native JSON caption via the hosted magic-prompt API.

	    Args:
	        prompt: Raw user prompt, sent as ``text_prompt``.
	        width: Target width in pixels (int).
	        height: Target height in pixels (int).

	    Returns:
	        The caption serialized as compact JSON (non-ASCII kept as-is), with the top-level ``aspect_ratio``
	        key and every element's ``bbox`` key removed.

	    Raises:
	        requests.RequestException: Network failure, timeout (120 s) or a non-2xx response.
	        RuntimeError: The response has no ``json_prompt``, or it is not a JSON object.
	    """
	    # Reduce WxH to lowest terms; `or 1` guards the 0x0 case where gcd is 0.
	    d = math.gcd(width, height) or 1
	    aspect_ratio = f"{width // d}x{height // d}"  # Ideogram's WxH form
	    resp = requests.post(
	        IDEOGRAM_MAGIC_PROMPT_URL,
	        headers={"Api-Key": IDEOGRAM_API_KEY, "Content-Type": "application/json"},
	        json={"text_prompt": prompt, "aspect_ratio": aspect_ratio},
	        timeout=120,
	    )
	    resp.raise_for_status()
	    jp = resp.json().get("json_prompt")
	    if not jp:
	        raise RuntimeError("Ideogram API returned no json_prompt")
	    if not isinstance(jp, dict):
	        raise RuntimeError(f"Ideogram API returned a non-object json_prompt ({type(jp).__name__})")

	    jp.pop("aspect_ratio", None)
	    # The nested fields are optional and may come back as JSON null; a missing or null value must not
	    # throw away an otherwise usable caption, so only walk the structure when it has the expected shape.
	    deconstruction = jp.get("compositional_deconstruction")
	    elements = deconstruction.get("elements") if isinstance(deconstruction, dict) else None
	    if isinstance(elements, list):
	        for el in elements:
	            if isinstance(el, dict):
	                el.pop("bbox", None)
	    return json.dumps(jp, ensure_ascii=False, separators=(",", ":"))


	# --- Dynamic GPU duration ---------------------------------------------------------------------------------
	# Per-step diffusion time, linear in image tokens between the two measured anchors (1024 @ 1.10 it/s,
	# 2048 @ 6 s/it). The chord overestimates in between, so it's a safe budget; for small images the estimate
	# is floored (see `_per_step`) so it never drops to zero or below.
	# Remote upsample is a network call done OFF the GPU (in `generate`), so it isn't budgeted here.
	_TOK_1024, _TOK_2048 = (1024 // 16) ** 2, (2048 // 16) ** 2  # 4096, 16384 image tokens
	_PS_1024, _PS_2048 = 1.0 / 1.10, 6.0  # measured seconds/iteration
	# Line through the two anchors: seconds_per_step = _PS_A + _PS_B * tokens.
	_PS_B = (_PS_2048 - _PS_1024) / (_TOK_2048 - _TOK_1024)
	_PS_A = _PS_1024 - _PS_B * _TOK_1024
	LOCAL_UPSAMPLE_S = 15  # local Qwen graft+generate (~12s) with headroom
	DIFFUSION_OVERHEAD_S = 8  # .so dlopen + block patch + cudnn setup on a cold worker's first forward
	DURATION_MARGIN = 1.3  # multiplicative safety factor applied to the whole budget


	def _per_step(width, height):
	    """Estimate seconds per diffusion step for a ``width`` x ``height`` image.

	    The image-token count is ``(width // 16) * (height // 16)`` after coercing both sides to ``int``; the
	    estimate is the line ``_PS_A + _PS_B * tokens``, floored at 0.2 seconds.
	    """
	    tokens = (int(width) // 16) * (int(height) // 16)
	    estimate = _PS_A + _PS_B * tokens
	    return max(0.2, estimate)


	def _gpu_duration(final_prompt, mode, width, height, seed, do_local, progress=None):
	    """Compute the GPU time budget, in whole seconds, for one `_gpu_generate` call.

	    This is the callable passed as ``duration=`` to ``@spaces.GPU`` on `_gpu_generate`, so it takes the same
	    parameters; ``final_prompt``, ``seed`` and ``progress`` are accepted but not used.

	    The budget is ``steps * _per_step(width, height) + DIFFUSION_OVERHEAD_S``, plus ``LOCAL_UPSAMPLE_S`` when
	    ``do_local`` is truthy, scaled by ``DURATION_MARGIN`` and rounded up. An unknown ``mode`` falls back to
	    the "Default · 20 steps" preset. The result is never below 60.
	    """
	    preset = MODES.get(mode, MODES["Default · 20 steps"])
	    seconds = preset["num_inference_steps"] * _per_step(width, height) + DIFFUSION_OVERHEAD_S
	    if do_local:
	        seconds = seconds + LOCAL_UPSAMPLE_S
	    padded = int(math.ceil(seconds * DURATION_MARGIN))
	    return max(60, padded)


	@spaces.GPU(duration=_gpu_duration, size="xlarge")
	def _gpu_generate(final_prompt, mode, width, height, seed, do_local, progress=gr.Progress(track_tqdm=True)):
	    """Run the GPU part of a request: optional local prompt upsampling, then diffusion.

	    Args:
	        final_prompt: Prompt to render; replaced by the local upsampler's output when ``do_local`` is truthy
	            and the upsampler succeeds.
	        mode: Key into ``MODES``; unknown keys fall back to "Default · 20 steps".
	        width, height: Image size; coerced with ``int`` at each use.
	        seed: Seed for the CUDA ``torch.Generator``; coerced with ``int``.
	        do_local: Whether to run ``pipe.upsample_prompt`` first.
	        progress: Gradio progress reporter.

	    Returns:
	        ``(image, seed, caption)`` where ``image`` is the first pipeline output, ``seed`` is the int seed used
	        and ``caption`` is ``json.loads(final_prompt)`` or, if that fails, ``{"prompt": final_prompt}``.
	    """
	    # Overlap the AOTI block-patch with the (transformer-idle) local upsample, if any.
	    patcher = Thread(target=_apply_aoti, daemon=True)
	    patcher.start()

	    if do_local:
	        progress(0.0, desc="✍️ Upsampling (local Qwen)…")
	        started = time.perf_counter()
	        try:
	            upsampled = pipe.upsample_prompt(
	                final_prompt, height=int(height), width=int(width), lm_head_repo_id=LM_HEAD_REPO
	            )
	            final_prompt = upsampled[0]
	            print(f"[timing] upsample local: {time.perf_counter() - started:.2f}s", flush=True)
	        except Exception as exc:
	            # Best effort: keep the prompt as received and tell the user.
	            print(f"[upsample] local failed: {exc!r}", flush=True)
	            gr.Warning("Local upsampler unavailable — generating from the raw prompt.")

	    patcher.join()  # ensure blocks are patched before the diffusion loop

	    progress(0.0, desc="🎨 Generating image…")
	    rng = torch.Generator(device="cuda").manual_seed(int(seed))
	    sampling_kwargs = MODES.get(mode, MODES["Default · 20 steps"])
	    started = time.perf_counter()
	    result = pipe(prompt=final_prompt, width=int(width), height=int(height), generator=rng, **sampling_kwargs)
	    image = result.images[0]
	    print(f"[timing] diffusion ({mode}): {time.perf_counter() - started:.2f}s", flush=True)

	    # Show the caption as structured JSON when it parses; otherwise wrap the raw text.
	    try:
	        caption = json.loads(final_prompt)
	    except Exception:
	        caption = {"prompt": final_prompt}
	    return image, int(seed), caption


	def generate(
	    prompt,
	    mode="Default · 20 steps",
	    upsampler=UPSAMPLERS[0],
	    width=1024,
	    height=1024,
	    seed=0,
	    randomize_seed=False,
	    progress=gr.Progress(track_tqdm=True),
	):
	    """Gradio event handler: pick a seed, upsample the prompt, then render on the GPU.

	    Args:
	        prompt: Raw user prompt.
	        mode: Key into ``MODES``.
	        upsampler: One of ``UPSAMPLERS``; the remote one is used only when ``IDEOGRAM_API_KEY`` is set.
	        width, height: Requested image size.
	        seed: Requested seed. ``None`` (an emptied number field) or a negative value means "draw one".
	        randomize_seed: When truthy, always draw a fresh seed in ``[0, MAX_SEED]``.
	        progress: Gradio progress reporter.

	    Returns:
	        Whatever `_gpu_generate` returns: ``(image, seed, caption)``.
	    """
	    # gr.Number passes None when the field is cleared; comparing None < 0 would raise TypeError, so treat
	    # a missing seed like a negative one and draw a random seed instead of failing the request.
	    if randomize_seed or seed is None or seed < 0:
	        seed = random.randint(0, MAX_SEED)

	    # Remote upsample is a network call -> run it here, OFF the GPU. Fall back to local (on-GPU) on failure.
	    final_prompt, do_local = prompt, True
	    if upsampler == UPSAMPLERS[0] and IDEOGRAM_API_KEY:
	        progress(0.0, desc="✍️ Upsampling (Ideogram)…")
	        t = time.perf_counter()
	        try:
	            final_prompt = remote_upsample(prompt, int(width), int(height))
	            do_local = False
	            print(f"[timing] upsample remote (off-GPU): {time.perf_counter() - t:.2f}s", flush=True)
	        except Exception as e:
	            print(f"[upsample] remote failed, falling back to local: {e!r}", flush=True)
	            gr.Warning("Ideogram API unavailable — using the local Qwen upsampler.")

	    return _gpu_generate(final_prompt, mode, width, height, seed, do_local)


	@spaces.GPU(size="xlarge")
	def _warmup():
	    """Exercise the local upsampler once at startup so its lazy LM-head graft is already done.

	    Applies the AOTI patch first, then upsamples a fixed sample prompt at 1024x1024 and logs the elapsed
	    time. No diffusion is run and the upsampled text is discarded.
	    """
	    _apply_aoti()  # no-op while AOTI is disabled
	    started = time.perf_counter()
	    pipe.upsample_prompt(
	        "a red apple on a wooden table",
	        height=1024,
	        width=1024,
	        lm_head_repo_id=LM_HEAD_REPO,
	    )
	    elapsed = time.perf_counter() - started
	    print(f"[timing] warmup upsample: {elapsed:.2f}s", flush=True)


	# Best-effort warm-up at import time: any failure is logged and startup continues.
	try:
	    _warmup()
	except Exception as exc:  # a flaky ZeroGPU worker must not take down the Space
	    print(f"[warmup] failed (will warm lazily on first request): {exc!r}", flush=True)

	# Extra stylesheet passed to `gr.Blocks(css=...)` below: one rule that sets the text colour of
	# `.gradio-container` under `.dark` to the theme's `--body-text-color` variable.
	CSS='''
	.dark .gradio-container { color: var(--body-text-color); }
	'''
	# --- UI: inputs in the left column (with an "Advanced" accordion), results in the right column. ---
	with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4", css=CSS) as demo:
	    gr.Markdown(
	        "# Ideogram 4\n"
	        "Ideogram's first open-weights model — a 9.3B-parameter text-to-image foundation model at the "
	        "forefront of design, with best-in-class text rendering.\n\n"
	        "[Model](https://huggingface.co/ideogram-ai/ideogram-4-nf4) · "
	        "[Model (fp8)](https://huggingface.co/ideogram-ai/ideogram-4-fp8) · "
	        "[Blog](https://ideogram.ai/blog/ideogram-4.0/)"
	    )

	    with gr.Row():
	        # Left column: prompt, preset and the advanced controls.
	        with gr.Column():
	            prompt = gr.Textbox(
	                label="Prompt",
	                value="a ginger cat wearing a tiny wizard hat reading a spellbook",
	                lines=3,
	            )
	            mode = gr.Radio(
	                choices=list(MODES.keys()),
	                value="Default · 20 steps",
	                label="Mode (speed ↔ quality)",
	            )
	            run = gr.Button("Generate", variant="primary")
	            with gr.Accordion("Advanced", open=False):
	                upsampler = gr.Radio(
	                    choices=UPSAMPLERS,
	                    value=UPSAMPLERS[0],
	                    label="Prompt upsampler",
	                    info="Rewrite into Ideogram's native JSON caption. Remote (Ideogram) preferred; falls back to local.",
	                )
	                with gr.Row():
	                    width = gr.Slider(512, 2048, value=1024, step=64, label="Width")
	                    height = gr.Slider(512, 2048, value=1024, step=64, label="Height")
	                with gr.Row():
	                    seed = gr.Number(label="Seed", value=0, precision=0)
	                    randomize = gr.Checkbox(label="Randomize seed", value=True)
	        # Right column: rendered image and the caption that was fed to the model.
	        with gr.Column():
	            out_image = gr.Image(label="Output", type="pil")
	            out_caption = gr.JSON(label="Caption fed to the model (upsampled when enabled)")

	    # Examples supply only the prompt, so `generate` runs them with its own keyword defaults.
	    gr.Examples(
	        examples=[
	            ["a ginger cat wearing a tiny wizard hat reading a spellbook"],
	            ["an isometric illustration of a tiny city floating in the clouds"],
	            ["a golden retriever on a skateboard"],
	        ],
	        inputs=[prompt],
	        outputs=[out_image, seed, out_caption],
	        fn=generate,
	        cache_examples=True,
	        cache_mode="lazy",
	    )

	    # The seed actually used is written back into the Seed field.
	    run.click(
	        generate,
	        inputs=[prompt, mode, upsampler, width, height, seed, randomize],
	        outputs=[out_image, seed, out_caption],
	    )

	demo.launch()