import os

# Default CUDA allocator config; `setdefault` leaves any value already present in the environment untouched.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
# outlines_core ships an @torch.compile bitmask kernel dynamo can't trace (torch.device const) -> noisy
# WON'T CONVERT spam on every local upsample. We never use torch.compile at runtime, so disable dynamo.
# NOTE(review): both defaults are set before `import torch` further down — presumably torch reads them at
# import/initialisation time, so keep these lines above the torch import.
os.environ.setdefault("TORCHDYNAMO_DISABLE", "1")

# diffusers (with Ideogram4 support) is pip-installed from the PR — see requirements.txt. No bundled source.

import json
import random
import time
from threading import Thread

import gradio as gr
import requests
import spaces
import torch
from huggingface_hub import hf_hub_download

from diffusers import Ideogram4Pipeline

# Runtime shim (keeps the pip-installed diffusers untouched): cu130-era bitsandbytes returns Params4bit.shape
# as a plain tuple, but diffusers' check_quantized_param_shape calls .numel() on it. math.prod handles both,
# so this is a no-op once diffusers/bnb fix it upstream.
import math  # noqa: E402

from diffusers.quantizers.bitsandbytes.bnb_quantizer import BnB4BitDiffusersQuantizer  # noqa: E402


def _check_quantized_param_shape(self, param_name, current_param, loaded_param):
    """Check that a loaded 4-bit parameter has the flattened shape implied by the model parameter.

    The element count of `current_param.shape` is computed without calling `.numel()`, so the shape may be
    any iterable of ints (e.g. a plain tuple).

    Args:
        param_name: Parameter name; names containing "bias" are expected un-packed, shape `(n,)`.
        current_param: Object whose `.shape` gives the model-side shape (n elements in total).
        loaded_param: Object whose `.shape` gives the checkpoint-side shape.

    Returns:
        True when the shapes agree.

    Raises:
        ValueError: if the loaded shape is not `(n,)` for a bias or `((n + 1) // 2, 1)` otherwise.
    """
    numel = 1
    for dim in current_param.shape:
        numel *= dim

    if "bias" in param_name:
        expected = (numel,)
    else:
        # Non-bias parameters are expected as a single column holding half as many entries (rounded up).
        expected = ((numel + 1) // 2, 1)

    actual = tuple(loaded_param.shape)
    if actual == expected:
        return True
    raise ValueError(f"Expected flattened shape of {param_name} to be {expected}, got {actual}.")


# Install the shim on the class itself, so every BnB4BitDiffusersQuantizer instance uses the tuple-safe
# `_check_quantized_param_shape` defined above. Must run before the pipeline is loaded below.
BnB4BitDiffusersQuantizer.check_quantized_param_shape = _check_quantized_param_shape

# Hub locations: the nf4 pipeline, the LM head used by the local upsampler, and the AOTI block package.
MODEL_ID = "ideogram-ai/ideogram-4-nf4"
LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
AOTI_REPO = "multimodalart/i4-block-aoti"
AOTI_BLOCK_FILE = "Ideogram4TransformerBlock/package.pt2"
MAX_SEED = (1 << 31) - 1  # largest seed drawn when randomizing

# Prompt upsampling: Ideogram's hosted magic-prompt (default) with the local Qwen graft as fallback.
IDEOGRAM_MAGIC_PROMPT_URL = "https://api.ideogram.ai/v1/ideogram-v4/magic-prompt"
IDEOGRAM_API_KEY = os.getenv("IDEOGRAM_API_KEY")  # None when unset -> the remote upsampler is skipped
UPSAMPLERS = ["Ideogram (remote)", "Qwen (local)"]


def _preset(steps, polish_steps, mu, std):
    """Build one pipeline preset: CFG 7.0 for the main steps, then CFG 3.0 for the last `polish_steps`."""
    return {
        "num_inference_steps": steps,
        "guidance_schedule": (7.0,) * (steps - polish_steps) + (3.0,) * polish_steps,
        "mu": mu,
        "std": std,
    }


# V4 presets (forward step-order: main CFG 7.0 -> polish 3.0). Each value is passed to the pipeline as kwargs.
MODES = {
    "Turbo · 12 steps": _preset(12, 1, mu=0.5, std=1.75),
    "Default · 20 steps": _preset(20, 2, mu=0.0, std=1.75),
    "Quality · 48 steps": _preset(48, 3, mu=0.0, std=1.5),
}

# --- Pipeline: dequantize both transformers nf4 -> bf16 in the parent (CPU) so AOTI can bind its weight-less
# graph to real bf16 weights (this is repo cold start, which is fine; function cold start stays fast). ---
# Runs once at import time; `t` only feeds the "[timing]" line printed at the end of this section.
t = time.perf_counter()
pipe = Ideogram4Pipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
# Both transformers are dequantized before the device move below.
pipe.transformer.dequantize()
pipe.unconditional_transformer.dequantize()
# NOTE(review): presumably `spaces` (ZeroGPU) defers the actual CUDA placement to the GPU workers — confirm.
pipe.to("cuda")
print(f"[timing] pipeline load + dequant: {time.perf_counter() - t:.1f}s", flush=True)

# The local prompt-enhancer LM head is grafted lazily by `pipe.upsample_prompt` on first use (onto the worker's
# GPU), so no explicit load is needed here. Local is only the fallback; Ideogram's remote API is the default.

# Pre-fetch the AOTI package AND pre-warm torch-inductor's CPU-ISA probe in the PARENT (repo cold start). The
# probe (valid_vec_isa_list) compiles test programs (~20s) the first time aoti_blocks_load builds a LazyAOTIModel;
# doing it once here means every ZeroGPU fork inherits the functools.cache, so per-worker (function cold start)
# aoti_blocks_load is just the ~instant block patch instead of a ~20s compile.
#
# AOTI_OK records whether both steps succeeded; when it is False, `_apply_aoti` returns without patching and
# the transformers run eagerly. Any failure here is logged and swallowed on purpose (best-effort speed-up).
try:
    # AOTI_BLOCK_FILE already holds the in-repo path ("<subfolder>/<filename>"), so the path is spelled once.
    hf_hub_download(AOTI_REPO, AOTI_BLOCK_FILE)
    # Private torch API: imported inside the `try` so a torch-version mismatch degrades to eager execution.
    from torch._inductor.cpu_vec_isa import valid_vec_isa_list

    t = time.perf_counter()
    valid_vec_isa_list()
    print(f"[timing] vec-isa prewarm (parent): {time.perf_counter() - t:.1f}s", flush=True)
    AOTI_OK = True
except Exception as e:
    AOTI_OK = False
    print(f"[aoti] prefetch/prewarm failed, running eager: {e!r}", flush=True)

# Per-process flag flipped by `_apply_aoti` once both transformers have been patched.
_AOTI_APPLIED = False


def _apply_aoti():
    """Bind the compiled AOTI block to both transformers, at most once per process.

    Returns immediately when the parent-side prefetch failed (`AOTI_OK` is False) or when the patch has
    already been applied. `aoti_blocks_load` is lazy (binds forward, defers the .so to first diffusion step)
    and CPU-only, so this is safe to run in a background thread overlapping the (transformer-idle)
    upsampling step. Errors are logged and swallowed: generation then simply runs eagerly.
    """
    global _AOTI_APPLIED
    if not AOTI_OK:
        return
    if _AOTI_APPLIED:
        return
    try:
        started = time.perf_counter()
        # Conditional transformer first, then the unconditional one.
        for attr in ("transformer", "unconditional_transformer"):
            spaces.aoti_blocks_load(getattr(pipe, attr), AOTI_REPO)
        _AOTI_APPLIED = True
        elapsed = time.perf_counter() - started
        print(f"[timing] aoti_blocks_load (both transformers): {elapsed:.2f}s", flush=True)
    except Exception as err:  # never let a bind hiccup block generation
        print(f"[aoti] apply failed, running eager: {err!r}", flush=True)


def remote_upsample(prompt, width, height):
    """Rewrite the prompt into Ideogram's native JSON caption via the hosted magic-prompt API.

    Args:
        prompt: Raw user prompt to expand.
        width: Target image width in pixels (coerced to int); only its ratio to `height` is sent.
        height: Target image height in pixels (coerced to int).

    Returns:
        The caption as a compact JSON string, with the top-level "aspect_ratio" key and every element's
        "bbox" key removed.

    Raises:
        RuntimeError: if no API key is configured, or the response carries no usable `json_prompt` object.
        requests.RequestException: on network failure, timeout, a non-2xx status or an undecodable body.
    """
    if not IDEOGRAM_API_KEY:
        # Fail fast instead of spending a network round-trip on a request that cannot be authenticated.
        raise RuntimeError("IDEOGRAM_API_KEY is not set")

    width, height = int(width), int(height)  # math.gcd rejects floats (e.g. raw slider values)
    d = math.gcd(width, height) or 1  # `or 1` avoids dividing by zero for a 0x0 request
    aspect_ratio = f"{width // d}x{height // d}"  # Ideogram's WxH form
    resp = requests.post(
        IDEOGRAM_MAGIC_PROMPT_URL,
        headers={"Api-Key": IDEOGRAM_API_KEY, "Content-Type": "application/json"},
        json={"text_prompt": prompt, "aspect_ratio": aspect_ratio},
        timeout=120,
    )
    resp.raise_for_status()

    payload = resp.json()
    jp = payload.get("json_prompt") if isinstance(payload, dict) else None
    if not jp:
        raise RuntimeError("Ideogram API returned no json_prompt")
    if not isinstance(jp, dict):
        raise RuntimeError(f"Ideogram API returned a non-object json_prompt ({type(jp).__name__})")

    jp.pop("aspect_ratio", None)
    # Either level may be missing or null in the response; treat both cases as "no elements" rather than
    # crashing and discarding an otherwise usable caption.
    deconstruction = jp.get("compositional_deconstruction")
    elements = deconstruction.get("elements") if isinstance(deconstruction, dict) else None
    for el in elements if isinstance(elements, list) else ():
        if isinstance(el, dict):
            el.pop("bbox", None)
    return json.dumps(jp, ensure_ascii=False, separators=(",", ":"))


# --- Dynamic GPU duration ---------------------------------------------------------------------------------
# Seconds per diffusion step are modelled as a straight line in the image-token count, drawn through the two
# measured anchors (1024 @ 1.10 it/s, 2048 @ 6 s/it). The chord overestimates in between, so it's a safe
# budget; small images are clamped to a floor instead of following the line below it.
# Remote upsample is a network call done OFF the GPU (in `generate`), so it isn't budgeted here.
_TOK_1024 = (1024 // 16) ** 2  # 4096 image tokens
_TOK_2048 = (2048 // 16) ** 2  # 16384 image tokens
_PS_1024 = 1.0 / 1.10  # measured seconds/iteration at 1024
_PS_2048 = 6.0  # measured seconds/iteration at 2048
_PS_B = (_PS_2048 - _PS_1024) / (_TOK_2048 - _TOK_1024)  # slope: seconds per step per image token
_PS_A = _PS_1024 - _PS_B * _TOK_1024  # intercept of the line through both anchors
LOCAL_UPSAMPLE_S = 15  # local Qwen graft+generate (~12s) with headroom
DIFFUSION_OVERHEAD_S = 8  # .so dlopen + block patch + cudnn setup on a cold worker's first forward
DURATION_MARGIN = 1.3  # multiplicative safety factor applied to the whole budget


def _per_step(width, height):
    """Estimate seconds per diffusion step for a `width` x `height` image, never below 0.2 s."""
    tokens = (int(width) // 16) * (int(height) // 16)
    estimate = _PS_A + _PS_B * tokens
    return estimate if estimate > 0.2 else 0.2


def _gpu_duration(final_prompt, mode, width, height, seed, do_local, progress=None):
    """Return the GPU time budget, in whole seconds, for one `_gpu_generate` call.

    Takes the same parameters as `_gpu_generate`, which registers this function through
    `spaces.GPU(duration=...)`; only `mode`, `width`, `height` and `do_local` influence the result.
    The budget is steps x per-step estimate plus fixed overhead (plus the local-upsample allowance when
    `do_local` is set), scaled by `DURATION_MARGIN`, rounded up, and never below 60 seconds.
    """
    preset = MODES.get(mode, MODES["Default · 20 steps"])  # unknown mode -> default preset
    seconds = preset["num_inference_steps"] * _per_step(width, height) + DIFFUSION_OVERHEAD_S
    if do_local:
        seconds += LOCAL_UPSAMPLE_S
    padded = int(math.ceil(seconds * DURATION_MARGIN))
    return max(60, padded)


@spaces.GPU(duration=_gpu_duration, size="xlarge")
def _gpu_generate(final_prompt, mode, width, height, seed, do_local, progress=gr.Progress(track_tqdm=True)):
    """Run the GPU part of a request: optional local prompt upsampling, then diffusion.

    Args:
        final_prompt: Prompt to render; already upsampled when the remote upsampler succeeded.
        mode: Key into `MODES`; an unknown key falls back to "Default · 20 steps".
        width: Output width in pixels (coerced to int).
        height: Output height in pixels (coerced to int).
        seed: Seed for the CUDA generator (coerced to int).
        do_local: When true, first rewrite `final_prompt` with the local upsampler; if that fails the raw
            prompt is used and a Gradio warning is shown.
        progress: Gradio progress tracker.

    Returns:
        `(image, seed, caption)`. `caption` is always a dict: the parsed prompt when it is a JSON object,
        otherwise `{"prompt": final_prompt}`.
    """
    # Overlap the AOTI block-patch with the (transformer-idle) local upsample, if any.
    aoti_thread = Thread(target=_apply_aoti, daemon=True)
    aoti_thread.start()
    if do_local:
        progress(0.0, desc="✍️ Upsampling (local Qwen)…")
        t = time.perf_counter()
        try:
            final_prompt = pipe.upsample_prompt(
                final_prompt, height=int(height), width=int(width), lm_head_repo_id=LM_HEAD_REPO
            )[0]
            print(f"[timing] upsample local: {time.perf_counter() - t:.2f}s", flush=True)
        except Exception as e:
            print(f"[upsample] local failed: {e!r}", flush=True)
            gr.Warning("Local upsampler unavailable — generating from the raw prompt.")
    aoti_thread.join()  # ensure blocks are patched before the diffusion loop

    progress(0.0, desc="🎨 Generating image…")
    generator = torch.Generator(device="cuda").manual_seed(int(seed))
    preset = MODES.get(mode, MODES["Default · 20 steps"])
    t = time.perf_counter()
    image = pipe(prompt=final_prompt, width=int(width), height=int(height), generator=generator, **preset).images[0]
    print(f"[timing] diffusion ({mode}): {time.perf_counter() - t:.2f}s", flush=True)

    # The caption is display-only, so parsing is best-effort: a finished image must never be lost here.
    try:
        caption = json.loads(final_prompt)
    except Exception:
        caption = None
    # A raw prompt such as `42`, `true` or `[1, 2]` is valid JSON but not a caption object; wrap it like
    # any other plain-text prompt so the output always has the same shape.
    if not isinstance(caption, dict):
        caption = {"prompt": final_prompt}
    return image, int(seed), caption


def generate(
    prompt,
    mode="Default · 20 steps",
    upsampler=UPSAMPLERS[0],
    width=1024,
    height=1024,
    seed=0,
    randomize_seed=False,
    progress=gr.Progress(track_tqdm=True),
):
    """Gradio handler: pick the seed, upsample the prompt remotely when possible, then render on the GPU.

    Args:
        prompt: Raw user prompt.
        mode: Key into `MODES` selecting the step/guidance preset.
        upsampler: One of `UPSAMPLERS`. The remote upsampler is used only when it is selected and
            `IDEOGRAM_API_KEY` is set; otherwise, or when the remote call fails, upsampling runs locally.
        width: Output width in pixels.
        height: Output height in pixels.
        seed: Requested seed; a missing (None) or negative value is replaced by a random one.
        randomize_seed: When true, always draw a random seed in `[0, MAX_SEED]`.
        progress: Gradio progress tracker.

    Returns:
        The `(image, seed, caption)` tuple produced by `_gpu_generate`.
    """
    # A cleared Seed field reaches the handler as None; treat it like "no seed given" instead of letting
    # `None < 0` raise a TypeError.
    if randomize_seed or seed is None or seed < 0:
        seed = random.randint(0, MAX_SEED)

    # Remote upsample is a network call -> run it here, OFF the GPU. Fall back to local (on-GPU) on failure.
    final_prompt, do_local = prompt, True
    if upsampler == UPSAMPLERS[0] and IDEOGRAM_API_KEY:
        progress(0.0, desc="✍️ Upsampling (Ideogram)…")
        t = time.perf_counter()
        try:
            final_prompt = remote_upsample(prompt, int(width), int(height))
            do_local = False
            print(f"[timing] upsample remote (off-GPU): {time.perf_counter() - t:.2f}s", flush=True)
        except Exception as e:
            print(f"[upsample] remote failed, falling back to local: {e!r}", flush=True)
            gr.Warning("Ideogram API unavailable — using the local Qwen upsampler.")

    return _gpu_generate(final_prompt, mode, width, height, seed, do_local)


@spaces.GPU(size="xlarge")
def _warmup():
    """Run one throwaway local upsample so the lazy LM-head graft happens at startup (no diffusion)."""
    _apply_aoti()  # returns immediately when AOTI is unavailable or already applied
    started = time.perf_counter()
    pipe.upsample_prompt("a red apple on a wooden table", width=1024, height=1024, lm_head_repo_id=LM_HEAD_REPO)
    elapsed = time.perf_counter() - started
    print(f"[timing] warmup upsample: {elapsed:.2f}s", flush=True)


# Best-effort startup warmup: a flaky ZeroGPU worker must not take down the Space, so any failure is only
# logged and the upsampler warms up on the first real request instead.
try:
    _warmup()
except Exception as err:
    print(f"[warmup] failed (will warm lazily on first request): {err!r}", flush=True)

# Extra stylesheet handed to gr.Blocks: in dark mode, make the container text use the theme's body text color.
CSS='''
.dark .gradio-container { color: var(--body-text-color); }
'''
# --- UI: inputs in the left column, results in the right column. ---
with gr.Blocks(theme=gr.themes.Citrus(), title="Ideogram 4", css=CSS) as demo:
    # Header: title, one-paragraph description and links to the model repos and the blog post.
    gr.Markdown(
        "# Ideogram 4\n"
        "Ideogram's first open-weights model — a 9.3B-parameter text-to-image foundation model at the "
        "forefront of design, with best-in-class text rendering.\n\n"
        "[Model](https://huggingface.co/ideogram-ai/ideogram-4-nf4) · "
        "[Model (fp8)](https://huggingface.co/ideogram-ai/ideogram-4-fp8) · "
        "[Blog](https://ideogram.ai/blog/ideogram-4.0/)"
    )

    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(label="Prompt", value="a ginger cat wearing a tiny wizard hat reading a spellbook", lines=3)
            # Choices are the MODES keys, so the selected label can be used directly as the preset key.
            mode = gr.Radio(choices=list(MODES.keys()), value="Default · 20 steps", label="Mode (speed ↔ quality)")
            run = gr.Button("Generate", variant="primary")
            with gr.Accordion("Advanced", open=False):
                upsampler = gr.Radio(
                    choices=UPSAMPLERS,
                    value=UPSAMPLERS[0],
                    label="Prompt upsampler",
                    info="Rewrite into Ideogram's native JSON caption. Remote (Ideogram) preferred; falls back to local.",
                )
                with gr.Row():
                    # 512..2048 px in steps of 64, so both sides are always multiples of 16.
                    width = gr.Slider(512, 2048, value=1024, step=64, label="Width")
                    height = gr.Slider(512, 2048, value=1024, step=64, label="Height")
                with gr.Row():
                    seed = gr.Number(label="Seed", value=0, precision=0)
                    randomize = gr.Checkbox(label="Randomize seed", value=True)
        with gr.Column():
            out_image = gr.Image(label="Output", type="pil")
            out_caption = gr.JSON(label="Caption fed to the model (upsampled when enabled)")

    # Examples supply only the prompt, so `generate` runs with its own defaults for everything else
    # (default mode and upsampler, 1024x1024, seed=0, randomize_seed=False).
    # NOTE(review): cache_mode="lazy" presumably caches each example the first time it is run — confirm
    # against the installed Gradio version.
    gr.Examples(
        examples=[
            ["a ginger cat wearing a tiny wizard hat reading a spellbook"],
            ["an isometric illustration of a tiny city floating in the clouds"],
            ["a golden retriever on a skateboard"],
        ],
        inputs=[prompt],
        outputs=[out_image, seed, out_caption],
        fn=generate,
        cache_examples=True,
        cache_mode="lazy",
    )

    # `seed` is both an input and an output: the seed actually used is written back into the Seed field.
    run.click(
        generate,
        inputs=[prompt, mode, upsampler, width, height, seed, randomize],
        outputs=[out_image, seed, out_caption],
    )

# Starts the app when the module is executed (there is no `__main__` guard in this file).
demo.launch()