Spaces:

build-small-hackathon
/

ShutterSearch

Running

App Files Files Community

ShutterSearch / modal_caption.py

SwikarG

upload files

4a02afe verified 12 days ago

Raw

History Blame Contribute Delete

5.28 kB

	"""Modal app: runs MiniCPM-V-4.6 captioning on a remote GPU."""

	import modal

	# Modal application object; remote classes are registered on it via `app.cls`.
	app = modal.App("photographers-archive")

	# pip requirement specifiers installed into the container image.
	_PIP_REQUIREMENTS = (
	    "transformers[torch]>=5.7.0",
	    "torchvision",
	    "av",
	    "Pillow",
	    "torch>=2.1.0",
	    "accelerate",
	)

	# Debian-slim base image with Python 3.12 plus the requirements above.
	image = modal.Image.debian_slim(python_version="3.12").pip_install(
	    *_PIP_REQUIREMENTS
	)

	# Model identifier passed to `from_pretrained` for both the processor and the
	# model in Captioner.load().
	MODEL_ID = "openbmb/MiniCPM-V-4.6"

	# Text instruction sent alongside every image in Captioner.caption(). It asks
	# the model to reply with a single JSON object following the schema spelled
	# out below, without markdown or code fences. This string is sent verbatim,
	# so any edit to it changes what the model is asked to produce.
	CAPTION_PROMPT = """You are a wedding and portrait photo archivist. Analyze this image and return ONLY a valid JSON object — no markdown, no explanation, no code fences.

	Crucial Prompting Guidelines:
	1. Be highly specific with textures, fabrics, and patterns in "attire" (e.g., "lace wedding gown", "black velvet tuxedo", "pinstripe suit").
	2. Avoid generic descriptions in "summary". Focus on explicit visual facts (who, what, where, explicit actions).
	3. Under "primary_subjects", explicitly label roles if evident (e.g., "bride", "groom", "bridesmaid", "groomsman", "mother of the bride").
	4. For "depth_of_field", specify features like "shallow depth of field", "bokeh background", or "deep focus".

	Use this exact schema:
	{
	"summary": "2-3 sentence description of the scene",
	"subjects": {
	"people_count": 0,
	"primary_subjects": [],
	"relationships": [],
	"attire": []
	},
	"scene": {
	"location_type": "",
	"environment": "",
	"setting_details": []
	},
	"actions": {
	"primary_action": "",
	"body_language": []
	},
	"lighting": {
	"lighting_style": "",
	"time_of_day_estimate": ""
	},
	"composition": {
	"shot_type": "",
	"camera_angle": ""
	},
	"mood": {
	"primary_emotions": [],
	"atmosphere": ""
	},
	"technical_cues": {
	"color_palette": [],
	"depth_of_field": ""
	},
	"search_tags": [],
	"archive_keywords": []
	}"""

	# Cache model weights in a Modal Volume so they persist across cold starts.
	# The volume is looked up by name ("minicpm-weights") and created if it does
	# not exist yet; Captioner mounts it at /model-cache and points the
	# `from_pretrained` cache_dir at that path.
	model_volume = modal.Volume.from_name("minicpm-weights", create_if_missing=True)


	def _extract_caption_json(raw: str) -> str:
	    """Return the JSON portion of a model reply, or the reply itself.

	    Steps, in order:
	      1. Remove a leading ``` / ```json fence and a trailing ``` fence.
	      2. If the remaining text parses as JSON, return it.
	      3. Otherwise take the span from the first "{" to the last "}" and
	         return that span if it parses as JSON.
	      4. Otherwise return the fence-stripped text unchanged.

	    Args:
	        raw: Decoded model output, already stripped of outer whitespace.

	    Returns:
	        A JSON string when one could be validated, else the cleaned text.
	    """
	    import json
	    import re

	    # Strip markdown code fences if the model added them anyway.
	    text = re.sub(r"^```(?:json)?\s*", "", raw)
	    text = re.sub(r"\s*```$", "", text)

	    try:
	        json.loads(text)
	    except json.JSONDecodeError:
	        pass
	    else:
	        return text

	    # Salvage attempt: greedy match grabs the outermost {...} block, which
	    # drops any chatter the model put before or after the object.
	    match = re.search(r"\{.*\}", text, re.DOTALL)
	    if match:
	        candidate = match.group(0)
	        try:
	            json.loads(candidate)
	        except json.JSONDecodeError:
	            pass
	        else:
	            return candidate

	    # Give up and return the text as-is so it can still be stored as plain
	    # text instead of being lost.
	    return text


	@app.cls(
	    image=image,
	    gpu="A10G",
	    volumes={"/model-cache": model_volume},
	    timeout=300,
	)
	class Captioner:
	    """Remote image captioner backed by the MODEL_ID vision-language model.

	    ``load`` is registered with ``modal.enter()`` and stores the processor
	    and model on the instance; ``caption`` uses them to describe one image.
	    """

	    @modal.enter()
	    def load(self):
	        """Load the processor and model, caching their files in /model-cache."""
	        import torch
	        from transformers import AutoModelForImageTextToText, AutoProcessor

	        self.processor = AutoProcessor.from_pretrained(
	            MODEL_ID, cache_dir="/model-cache"
	        )
	        self.model = AutoModelForImageTextToText.from_pretrained(
	            MODEL_ID,
	            torch_dtype=torch.bfloat16,
	            device_map="auto",
	            cache_dir="/model-cache",
	        )

	    @modal.method()
	    def caption(self, image_bytes: bytes, filename: str = "image.jpg") -> str:
	        """Caption one image with CAPTION_PROMPT and return the model's reply.

	        Args:
	            image_bytes: Contents of the image file; written unchanged to a
	                temporary file that is handed to the processor.
	            filename: Only its extension is used, as the temporary file's
	                suffix (".jpg" when the name has no extension).

	        Returns:
	            The reply as a JSON string when it validates (possibly after
	            stripping code fences or surrounding text); otherwise the
	            fence-stripped reply text.
	        """
	        import os
	        import tempfile
	        import torch

	        suffix = os.path.splitext(filename)[-1] or ".jpg"
	        tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
	        tmp_path = tmp.name

	        # The file is created with delete=False, so it must be removed by
	        # hand. The write sits inside the try so that a failed write does not
	        # leave the temporary file behind.
	        try:
	            with tmp:
	                tmp.write(image_bytes)

	            messages = [
	                {
	                    "role": "user",
	                    "content": [
	                        {"type": "image", "url": tmp_path},
	                        {"type": "text", "text": CAPTION_PROMPT},
	                    ],
	                }
	            ]

	            inputs = self.processor.apply_chat_template(
	                messages,
	                tokenize=True,
	                add_generation_prompt=True,
	                return_dict=True,
	                return_tensors="pt",
	                downsample_mode="16x",
	                max_slice_nums=9,
	            ).to(self.model.device)

	            # Greedy decoding (do_sample=False) with gradients disabled.
	            with torch.inference_mode():
	                generated_ids = self.model.generate(
	                    **inputs,
	                    downsample_mode="16x",
	                    max_new_tokens=2048,
	                    do_sample=False,
	                )

	            # Drop the prompt tokens from the front of each output sequence
	            # so only the newly generated tokens are decoded.
	            trimmed = [
	                out_ids[len(in_ids):]
	                for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	            ]
	            raw = self.processor.batch_decode(
	                trimmed,
	                skip_special_tokens=True,
	                clean_up_tokenization_spaces=False,
	            )[0].strip()

	            return _extract_caption_json(raw)
	        finally:
	            os.unlink(tmp_path)