"""Modal app: runs MiniCPM-V-4.6 captioning on a remote GPU."""

import modal

# Modal application object; the class below registers itself on it.
app = modal.App("photographers-archive")

# Python packages installed into the container image, in install order.
_PIP_PACKAGES = (
    "transformers[torch]>=5.7.0",
    "torchvision",
    "av",
    "Pillow",
    "torch>=2.1.0",
    "accelerate",
)

# Container image: Debian slim with Python 3.12 plus the packages above.
_base_image = modal.Image.debian_slim(python_version="3.12")
image = _base_image.pip_install(*_PIP_PACKAGES)

# Model identifier passed to `from_pretrained` for both the processor and the
# model when the Captioner container starts.
MODEL_ID = "openbmb/MiniCPM-V-4.6"

# Instruction text sent alongside every image. It asks the model for a bare
# JSON object (no markdown, no code fences) that follows the schema spelled
# out at the end of the prompt. This is runtime text: editing it changes what
# the model is asked to produce.
CAPTION_PROMPT = """You are a wedding and portrait photo archivist. Analyze this image and return ONLY a valid JSON object — no markdown, no explanation, no code fences.

Crucial Prompting Guidelines:
1. Be highly specific with textures, fabrics, and patterns in "attire" (e.g., "lace wedding gown", "black velvet tuxedo", "pinstripe suit").
2. Avoid generic descriptions in "summary". Focus on explicit visual facts (who, what, where, explicit actions).
3. Under "primary_subjects", explicitly label roles if evident (e.g., "bride", "groom", "bridesmaid", "groomsman", "mother of the bride").
4. For "depth_of_field", specify features like "shallow depth of field", "bokeh background", or "deep focus".

Use this exact schema:
{
  "summary": "2-3 sentence description of the scene",
  "subjects": {
    "people_count": 0,
    "primary_subjects": [],
    "relationships": [],
    "attire": []
  },
  "scene": {
    "location_type": "",
    "environment": "",
    "setting_details": []
  },
  "actions": {
    "primary_action": "",
    "body_language": []
  },
  "lighting": {
    "lighting_style": "",
    "time_of_day_estimate": ""
  },
  "composition": {
    "shot_type": "",
    "camera_angle": ""
  },
  "mood": {
    "primary_emotions": [],
    "atmosphere": ""
  },
  "technical_cues": {
    "color_palette": [],
    "depth_of_field": ""
  },
  "search_tags": [],
  "archive_keywords": []
}"""

# Cache model weights in a Modal Volume so they persist across cold starts.
# The volume is looked up by name and created on first use
# (create_if_missing=True); the Captioner class mounts it at /model-cache.
model_volume = modal.Volume.from_name("minicpm-weights", create_if_missing=True)


def _normalize_caption(raw: str) -> str:
    """Return the JSON part of a model reply, or the reply itself if none parses.

    Steps, in order:
      1. Strip a leading ```/```json fence and a trailing ``` fence.
      2. If the remaining text parses as JSON, return it unchanged.
      3. Otherwise try the span from the first "{" to the last "}" and return
         that span if it parses.
      4. Otherwise return the fence-stripped text so the caller can still
         store it as plain text.

    The returned string is never re-serialized; it is always a substring of
    the model's own output.
    """
    import json
    import re

    # Strip markdown code fences if the model added them anyway.
    text = re.sub(r"^```(?:json)?\s*", "", raw)
    text = re.sub(r"\s*```$", "", text)

    try:
        json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        return text

    # Salvage attempt: the greedy match with DOTALL spans from the first "{"
    # to the last "}", i.e. the outermost brace-delimited block.
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if match:
        candidate = match.group(0)
        try:
            json.loads(candidate)
        except json.JSONDecodeError:
            pass
        else:
            return candidate

    # Give up and return the text as-is; downstream treats it as plain text.
    return text


@app.cls(
    image=image,
    gpu="A10G",
    volumes={"/model-cache": model_volume},
    timeout=300,
)
class Captioner:
    """Modal class that captions images with the model named by MODEL_ID.

    `load` is registered with `@modal.enter()` and stores the processor and
    model on the instance; `caption` is the remotely callable method that
    uses them. Weights are cached under /model-cache, where the class mounts
    the `model_volume` volume.
    """

    @modal.enter()
    def load(self):
        """Load the processor and model into `self.processor` / `self.model`."""
        import torch
        from transformers import AutoModelForImageTextToText, AutoProcessor

        self.processor = AutoProcessor.from_pretrained(
            MODEL_ID, cache_dir="/model-cache"
        )
        # NOTE(review): recent transformers releases appear to prefer `dtype=`
        # over `torch_dtype=` — confirm against the pinned version.
        self.model = AutoModelForImageTextToText.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            cache_dir="/model-cache",
        )

    @modal.method()
    def caption(self, image_bytes: bytes, filename: str = "image.jpg") -> str:
        """Caption one image and return the model's reply as a string.

        Args:
            image_bytes: Encoded image file contents.
            filename: Original file name; only its extension is used, as the
                suffix of the temporary file (falls back to ".jpg").

        Returns:
            A JSON string when the reply (or its outermost ``{...}`` block)
            parses as JSON; otherwise the fence-stripped reply text.
        """
        import os
        import tempfile
        import torch

        suffix = os.path.splitext(filename)[-1] or ".jpg"

        # The context manager removes the temp file on every exit path,
        # including a failed write, so nothing is left behind on error.
        with tempfile.NamedTemporaryFile(suffix=suffix) as tmp:
            tmp.write(image_bytes)
            # Make the bytes visible to readers that open the file by path.
            tmp.flush()

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "url": tmp.name},
                        {"type": "text", "text": CAPTION_PROMPT},
                    ],
                }
            ]

            inputs = self.processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
                downsample_mode="16x",
                max_slice_nums=9,
            ).to(self.model.device)

            with torch.inference_mode():
                generated_ids = self.model.generate(
                    **inputs,
                    downsample_mode="16x",
                    max_new_tokens=2048,
                    do_sample=False,
                )

        # Drop the prompt tokens so only newly generated tokens are decoded.
        trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        raw = self.processor.batch_decode(
            trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0].strip()

        return _normalize_caption(raw)