| """Modal app: runs MiniCPM-V-4.6 captioning on a remote GPU."""
|
|
|
| import modal
|
|
|
# Modal application object; the captioning class below registers itself on it.
app = modal.App("photographers-archive")

# Python packages installed into the container image, in install order.
_PIP_PACKAGES = (
    "transformers[torch]>=5.7.0",
    "torchvision",
    "av",
    "Pillow",
    "torch>=2.1.0",
    "accelerate",
)

# Container image: Debian slim with Python 3.12 plus the packages listed above.
_base_image = modal.Image.debian_slim(python_version="3.12")
image = _base_image.pip_install(*_PIP_PACKAGES)

# Model identifier handed to the `from_pretrained` loaders.
MODEL_ID = "openbmb/MiniCPM-V-4.6"
|
|
|
# Instruction text sent to the model together with each image (it is the
# "text" part of the chat message built in Captioner.caption). It asks for a
# bare JSON object that follows the schema spelled out at the end of the
# prompt; caption() still strips code fences from the reply in case the model
# adds them anyway.
| CAPTION_PROMPT = """You are a wedding and portrait photo archivist. Analyze this image and return ONLY a valid JSON object — no markdown, no explanation, no code fences.
|
|
|
| Crucial Prompting Guidelines:
|
| 1. Be highly specific with textures, fabrics, and patterns in "attire" (e.g., "lace wedding gown", "black velvet tuxedo", "pinstripe suit").
|
| 2. Avoid generic descriptions in "summary". Focus on explicit visual facts (who, what, where, explicit actions).
|
| 3. Under "primary_subjects", explicitly label roles if evident (e.g., "bride", "groom", "bridesmaid", "groomsman", "mother of the bride").
|
| 4. For "depth_of_field", specify features like "shallow depth of field", "bokeh background", or "deep focus".
|
|
|
| Use this exact schema:
|
| {
|
| "summary": "2-3 sentence description of the scene",
|
| "subjects": {
|
| "people_count": 0,
|
| "primary_subjects": [],
|
| "relationships": [],
|
| "attire": []
|
| },
|
| "scene": {
|
| "location_type": "",
|
| "environment": "",
|
| "setting_details": []
|
| },
|
| "actions": {
|
| "primary_action": "",
|
| "body_language": []
|
| },
|
| "lighting": {
|
| "lighting_style": "",
|
| "time_of_day_estimate": ""
|
| },
|
| "composition": {
|
| "shot_type": "",
|
| "camera_angle": ""
|
| },
|
| "mood": {
|
| "primary_emotions": [],
|
| "atmosphere": ""
|
| },
|
| "technical_cues": {
|
| "color_palette": [],
|
| "depth_of_field": ""
|
| },
|
| "search_tags": [],
|
| "archive_keywords": []
|
| }"""
|
|
|
|
|
# Named Modal volume (created if it does not exist yet) that the Captioner
# class mounts at /model-cache and uses as the `cache_dir` for model files.
model_volume = modal.Volume.from_name(
    "minicpm-weights",
    create_if_missing=True,
)
|
|
|
|
|
def _extract_json(raw: str) -> str:
    """Return the JSON payload contained in a model reply, if one can be found.

    Steps, in order:

    1. Strip surrounding whitespace and a leading/trailing Markdown code fence
       (three backticks, optionally followed by ``json``).
    2. If the remaining text parses as JSON, return it unchanged.
    3. Otherwise decode a single JSON value starting at the first ``{`` and
       return exactly that slice. Unlike a greedy ``{...}`` match this still
       works when the object is followed by extra text that itself contains
       ``}`` (for example a repeated second object).
    4. If nothing decodes, return the fence-stripped text as-is so the caller
       can decide how to handle a non-JSON reply.

    Args:
        raw: Decoded model output.

    Returns:
        A string that is valid JSON when extraction succeeded, otherwise the
        cleaned-up original text.
    """
    import json
    import re

    text = re.sub(r"^```(?:json)?\s*", "", raw.strip())
    text = re.sub(r"\s*```$", "", text)

    try:
        json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        return text

    start = text.find("{")
    if start == -1:
        return text

    try:
        # raw_decode stops at the end of the first complete value and reports
        # where that is, so trailing chatter after the object is ignored.
        _, end = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        return text
    return text[start:end]


@app.cls(
    image=image,
    gpu="A10G",
    volumes={"/model-cache": model_volume},
    timeout=300,
)
class Captioner:
    """Modal class that captions images with the model named by ``MODEL_ID``.

    ``load`` runs as the ``modal.enter`` hook and stores the processor and
    model on the instance; ``caption`` is the remotely callable method that
    uses them. Model files are cached under ``/model-cache``, where
    ``model_volume`` is mounted.
    """

    @modal.enter()
    def load(self):
        """Load the processor and model onto the instance."""
        # Imported here rather than at module top level: these packages are
        # installed in the container image, not necessarily on the machine
        # that imports this file.
        import torch
        from transformers import AutoModelForImageTextToText, AutoProcessor

        self.processor = AutoProcessor.from_pretrained(
            MODEL_ID, cache_dir="/model-cache"
        )
        # NOTE(review): newer transformers releases appear to prefer `dtype=`
        # over `torch_dtype=` — confirm against the pinned version before
        # changing it.
        self.model = AutoModelForImageTextToText.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            cache_dir="/model-cache",
        )

    @modal.method()
    def caption(self, image_bytes: bytes, filename: str = "image.jpg") -> str:
        """Generate a caption for one image.

        Args:
            image_bytes: Contents of the image file. They are written to a
                temporary file whose path is handed to the processor.
            filename: Only the extension is used, as the temporary file's
                suffix; ``.jpg`` is used when the name has no extension.

        Returns:
            The model's reply after cleanup by ``_extract_json``: a JSON
            string when one could be extracted, otherwise the reply text with
            any surrounding code fence removed.
        """
        import os
        import tempfile

        import torch

        suffix = os.path.splitext(filename)[-1] or ".jpg"
        tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        tmp_path = tmp.name

        # Everything after the file exists sits inside the try block so the
        # file is removed even if writing the bytes fails.
        try:
            with tmp:
                tmp.write(image_bytes)

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "url": tmp_path},
                        {"type": "text", "text": CAPTION_PROMPT},
                    ],
                }
            ]

            inputs = self.processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
                downsample_mode="16x",
                max_slice_nums=9,
            ).to(self.model.device)

            with torch.inference_mode():
                generated_ids = self.model.generate(
                    **inputs,
                    downsample_mode="16x",
                    max_new_tokens=2048,
                    do_sample=False,
                )

            # Drop the leading len(input) tokens of each output sequence so
            # only the newly generated part is decoded.
            trimmed = [
                out_ids[len(in_ids):]
                for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            raw = self.processor.batch_decode(
                trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0].strip()

            return _extract_json(raw)
        finally:
            os.unlink(tmp_path)
|
|
|