"""Modal app: runs MiniCPM-V-4.6 captioning on a remote GPU.""" import modal app = modal.App("photographers-archive") image = ( modal.Image.debian_slim(python_version="3.12") .pip_install( "transformers[torch]>=5.7.0", "torchvision", "av", "Pillow", "torch>=2.1.0", "accelerate", ) ) MODEL_ID = "openbmb/MiniCPM-V-4.6" CAPTION_PROMPT = """You are a wedding and portrait photo archivist. Analyze this image and return ONLY a valid JSON object — no markdown, no explanation, no code fences. Crucial Prompting Guidelines: 1. Be highly specific with textures, fabrics, and patterns in "attire" (e.g., "lace wedding gown", "black velvet tuxedo", "pinstripe suit"). 2. Avoid generic descriptions in "summary". Focus on explicit visual facts (who, what, where, explicit actions). 3. Under "primary_subjects", explicitly label roles if evident (e.g., "bride", "groom", "bridesmaid", "groomsman", "mother of the bride"). 4. For "depth_of_field", specify features like "shallow depth of field", "bokeh background", or "deep focus". Use this exact schema: { "summary": "2-3 sentence description of the scene", "subjects": { "people_count": 0, "primary_subjects": [], "relationships": [], "attire": [] }, "scene": { "location_type": "", "environment": "", "setting_details": [] }, "actions": { "primary_action": "", "body_language": [] }, "lighting": { "lighting_style": "", "time_of_day_estimate": "" }, "composition": { "shot_type": "", "camera_angle": "" }, "mood": { "primary_emotions": [], "atmosphere": "" }, "technical_cues": { "color_palette": [], "depth_of_field": "" }, "search_tags": [], "archive_keywords": [] }""" # Cache model weights in a Modal Volume so they persist across cold starts model_volume = modal.Volume.from_name("minicpm-weights", create_if_missing=True) @app.cls( image=image, gpu="A10G", volumes={"/model-cache": model_volume}, timeout=300, ) class Captioner: @modal.enter() def load(self): import torch from transformers import AutoModelForImageTextToText, AutoProcessor self.processor = AutoProcessor.from_pretrained( MODEL_ID, cache_dir="/model-cache" ) self.model = AutoModelForImageTextToText.from_pretrained( MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto", cache_dir="/model-cache", ) @modal.method() def caption(self, image_bytes: bytes, filename: str = "image.jpg") -> str: import json import os import re import tempfile import torch suffix = os.path.splitext(filename)[-1] or ".jpg" with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f: f.write(image_bytes) tmp_path = f.name try: messages = [ { "role": "user", "content": [ {"type": "image", "url": tmp_path}, {"type": "text", "text": CAPTION_PROMPT}, ], } ] inputs = self.processor.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_dict=True, return_tensors="pt", downsample_mode="16x", max_slice_nums=9, ).to(self.model.device) with torch.inference_mode(): generated_ids = self.model.generate( **inputs, downsample_mode="16x", max_new_tokens=2048, do_sample=False, ) trimmed = [ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) ] raw = self.processor.batch_decode( trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False, )[0].strip() # Strip markdown code fences if model added them anyway raw = re.sub(r"^```(?:json)?\s*", "", raw) raw = re.sub(r"\s*```$", "", raw) # Validate JSON — if broken, return as plain text so it still gets stored try: json.loads(raw) except json.JSONDecodeError: # Attempt to salvage by extracting the outermost {...} block match = re.search(r"\{.*\}", raw, re.DOTALL) if match: candidate = match.group(0) try: json.loads(candidate) return candidate except json.JSONDecodeError: pass # Give up and return raw — caption_store will treat it as plain text return raw return raw finally: os.unlink(tmp_path)