ShutterSearch / modal_caption.py
SwikarG's picture
upload files
4a02afe verified
Raw
History Blame Contribute Delete
5.28 kB
"""Modal app: runs MiniCPM-V-4.6 captioning on a remote GPU."""
import modal
# Modal application under which the captioning class below is registered.
app = modal.App("photographers-archive")

# Python packages pip-installed into the remote container image.
_PIP_PACKAGES = (
    "transformers[torch]>=5.7.0",
    "torchvision",
    "av",
    "Pillow",
    "torch>=2.1.0",
    "accelerate",
)

# Container image: Debian slim with Python 3.12 plus the packages above.
_base_image = modal.Image.debian_slim(python_version="3.12")
image = _base_image.pip_install(*_PIP_PACKAGES)

# Model identifier passed to the `from_pretrained` loaders.
MODEL_ID = "openbmb/MiniCPM-V-4.6"
# Text instruction sent to the model together with each image. It asks for a
# bare JSON object (no markdown, no code fences) that follows the schema
# spelled out at the end of the prompt. This string is runtime input to the
# model, so any edit to it changes captioning behavior.
CAPTION_PROMPT = """You are a wedding and portrait photo archivist. Analyze this image and return ONLY a valid JSON object — no markdown, no explanation, no code fences.
Crucial Prompting Guidelines:
1. Be highly specific with textures, fabrics, and patterns in "attire" (e.g., "lace wedding gown", "black velvet tuxedo", "pinstripe suit").
2. Avoid generic descriptions in "summary". Focus on explicit visual facts (who, what, where, explicit actions).
3. Under "primary_subjects", explicitly label roles if evident (e.g., "bride", "groom", "bridesmaid", "groomsman", "mother of the bride").
4. For "depth_of_field", specify features like "shallow depth of field", "bokeh background", or "deep focus".
Use this exact schema:
{
"summary": "2-3 sentence description of the scene",
"subjects": {
"people_count": 0,
"primary_subjects": [],
"relationships": [],
"attire": []
},
"scene": {
"location_type": "",
"environment": "",
"setting_details": []
},
"actions": {
"primary_action": "",
"body_language": []
},
"lighting": {
"lighting_style": "",
"time_of_day_estimate": ""
},
"composition": {
"shot_type": "",
"camera_angle": ""
},
"mood": {
"primary_emotions": [],
"atmosphere": ""
},
"technical_cues": {
"color_palette": [],
"depth_of_field": ""
},
"search_tags": [],
"archive_keywords": []
}"""
# Cache model weights in a Modal Volume so they persist across cold starts.
# The volume is looked up by name and created on first use
# (create_if_missing=True); Captioner mounts it at /model-cache and passes
# that path as `cache_dir` when loading the processor and model.
model_volume = modal.Volume.from_name("minicpm-weights", create_if_missing=True)
def _extract_json_text(raw: str) -> str:
    """Return the JSON portion of a model reply, or the cleaned reply itself.

    Steps:
      1. Strip a leading/trailing markdown code fence if present.
      2. If the remaining text parses as JSON, return it unchanged.
      3. Otherwise try to parse a single JSON value starting at the first
         ``{`` and return exactly that substring. Anchoring at the first
         brace (instead of a greedy ``{...}`` match up to the last ``}``)
         still recovers the object when the model appends trailing chatter
         that itself contains braces.
      4. If nothing parses, return the fence-stripped text so the caller can
         still store it as plain text.

    Args:
        raw: Decoded model output, already whitespace-stripped by the caller.

    Returns:
        A string that is valid JSON when salvage succeeded, otherwise the
        fence-stripped reply.
    """
    import json
    import re

    # Strip markdown code fences if the model added them anyway.
    text = re.sub(r"^```(?:json)?\s*", "", raw)
    text = re.sub(r"\s*```$", "", text)

    try:
        json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        return text

    start = text.find("{")
    if start != -1:
        try:
            # raw_decode returns (value, end_index); only the span matters
            # because the caller wants the JSON *text*, not the parsed value.
            _, end = json.JSONDecoder().raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            return text[start:end]

    # Give up and return the text as-is; it is treated as plain text downstream.
    return text


@app.cls(
    image=image,
    gpu="A10G",
    volumes={"/model-cache": model_volume},
    timeout=300,
)
class Captioner:
    """Modal class that captions images with the model named by MODEL_ID.

    Configured to run on an A10G GPU with the weights volume mounted at
    ``/model-cache`` and ``timeout=300``.
    """

    @modal.enter()
    def load(self):
        """Load the processor and model into ``self.processor`` / ``self.model``.

        Registered with ``@modal.enter()`` so it runs as a container-lifecycle
        hook rather than per call. Both loaders use ``/model-cache`` as their
        cache directory, which is the mounted volume.
        """
        # Imported here because these packages are installed in the remote
        # image, not necessarily in the local environment.
        import torch
        from transformers import AutoModelForImageTextToText, AutoProcessor

        self.processor = AutoProcessor.from_pretrained(
            MODEL_ID, cache_dir="/model-cache"
        )
        self.model = AutoModelForImageTextToText.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            cache_dir="/model-cache",
        )

    @modal.method()
    def caption(self, image_bytes: bytes, filename: str = "image.jpg") -> str:
        """Generate a caption for one image and return it as text.

        Args:
            image_bytes: Raw bytes of the image file.
            filename: Original file name; only its extension is used, as the
                suffix of the temporary file (defaults to ``.jpg`` when the
                name has no extension).

        Returns:
            The model's reply after cleanup by ``_extract_json_text``: a JSON
            string when the reply could be parsed or salvaged, otherwise the
            fence-stripped reply text.
        """
        import os
        import tempfile

        import torch

        suffix = os.path.splitext(filename)[-1] or ".jpg"

        # delete=False because the file is handed to the processor by path
        # after it has been closed; removal happens in the `finally` below.
        tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        tmp_path = tmp.name
        try:
            # Writing inside the try ensures the file is removed even if the
            # write itself fails.
            with tmp:
                tmp.write(image_bytes)

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "url": tmp_path},
                        {"type": "text", "text": CAPTION_PROMPT},
                    ],
                }
            ]
            # downsample_mode / max_slice_nums are passed through unchanged;
            # NOTE(review): presumably model-specific image-tiling options,
            # confirm against the model card before tuning.
            inputs = self.processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
                downsample_mode="16x",
                max_slice_nums=9,
            ).to(self.model.device)

            with torch.inference_mode():
                generated_ids = self.model.generate(
                    **inputs,
                    downsample_mode="16x",
                    max_new_tokens=2048,
                    do_sample=False,  # greedy decoding
                )

            # Drop the prompt tokens so only newly generated tokens are decoded.
            trimmed = [
                out_ids[len(in_ids):]
                for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            raw = self.processor.batch_decode(
                trimmed,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )[0].strip()

            return _extract_json_text(raw)
        finally:
            os.unlink(tmp_path)