Spaces:

SaniaE
/

Image_Captioning_Ensemble_API

Sleeping

App Files Files Community

SaniaE committed 15 days ago

Commit

4debe0a

verified ·

1 Parent(s): 0129fbe

revamped entire API logic

Browse files

Files changed (1) hide show

app.py +87 -147

app.py CHANGED Viewed

@@ -2,29 +2,27 @@ import os
 import torch
 import random
 import asyncio
 from PIL import Image, ImageFilter
 from fastapi import FastAPI, UploadFile, File, Query
-from fastapi.middleware.cors import CORSMiddleware
 from huggingface_hub import snapshot_download, login
 from transformers import (
     BlipProcessor, BlipForConditionalGeneration,
     ViTImageProcessor, AutoProcessor, AutoModelForCausalLM
 )
-import torch.nn.functional as F
-import numpy as np
-import io
-from fastapi.responses import StreamingResponse
-import matplotlib.pyplot as plt
-app = FastAPI()
-# Configuration
 REPO_ID = "SaniaE/Image_Captioning_Ensemble"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODELS = {}
-# Removed GIT, kept BLIP and ViT
 MODEL_SETTINGS = {
     "blip": {
         "subfolder": "blip",
@@ -53,10 +51,8 @@ async def startup_event():
         ckpt_path = os.path.join(local_dir, cfg["subfolder"])
         print(f"Loading {name} from {ckpt_path}...")
-        # Load Model
         model = cfg["inference_model"].from_pretrained(ckpt_path).to(DEVICE)
-        # Load Processor
         if name == "vit":
             i_proc = cfg["processor"][0].from_pretrained(cfg["pretrained_path"][0])
             t_proc = cfg["processor"][1].from_pretrained(cfg["pretrained_path"][1])
@@ -65,183 +61,127 @@ async def startup_event():
             processor = cfg["processor"].from_pretrained(cfg["pretrained_path"])
         MODELS[name] = {"model": model, "processor": processor}
-    print("Optimization Complete: GIT and Search removed. Ensemble is live!")
-# --- Helper for Parallel Inference ---
def _generate_sync(m_name, image, temp, top_k, top_p):
    """Sample one caption for `image` from the model registered as `m_name`.

    This is a blocking call.

    Args:
        m_name: Key into the module-level `MODELS` registry.
        image: Image handed to the processor as `images=`.
        temp: Sampling temperature forwarded to `model.generate`.
        top_k: Top-k cutoff forwarded to `model.generate`.
        top_p: Top-p cutoff forwarded to `model.generate`.

    Returns:
        The first decoded sequence with special tokens skipped and
        surrounding whitespace stripped.
    """
    entry = MODELS[m_name]
    net = entry["model"]

    # The "vit" entry stores a pair (image processor, text decoder); every
    # other entry uses one processor object for both steps.
    if m_name == "vit":
        encode, decoder = entry["processor"]
    else:
        encode = decoder = entry["processor"]

    batch = encode(images=image, return_tensors="pt").to(DEVICE)
    token_ids = net.generate(
        **batch,
        max_length=300,
        do_sample=True,
        temperature=temp,
        top_k=top_k,
        top_p=top_p,
    )
    decoded = decoder.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()
@app.post("/generate")
async def generate_endpoint(
    file: UploadFile = File(...),
    temp: float = Query(0.8),
    top_k: int = Query(100),
    top_p: float = Query(0.9)
):
    """Return five sampled captions for the uploaded image.

    Five model names are drawn with replacement from the keys of `MODELS`,
    and one `_generate_sync` call per draw is run in a worker thread.

    Args:
        file: Uploaded image; it is opened with PIL and converted to RGB.
        temp: Sampling temperature passed to every generation call.
        top_k: Top-k cutoff passed to every generation call.
        top_p: Top-p cutoff passed to every generation call.

    Returns:
        A dict with "captions" (the five results, in draw order) and "mix"
        (the model name used for each slot).
    """
    picture = Image.open(file.file).convert("RGB")

    # Sampling is with replacement, so the same model may fill several slots.
    model_selection = random.choices(list(MODELS), k=5)

    captions = await asyncio.gather(
        *(
            asyncio.to_thread(_generate_sync, chosen, picture, temp, top_k, top_p)
            for chosen in model_selection
        )
    )
    return {"captions": captions, "mix": model_selection}
@app.post("/ui-tester")
async def ui_tester(file: UploadFile = File(...), description: str = Query(...)):
    """Score a user description against BLIP's own caption of the image.

    The score compares two losses computed by the BLIP model on the same
    image: the loss of the caption BLIP generates itself (baseline) and the
    loss of the user's `description`.

    Args:
        file: Uploaded image; it is opened with PIL and converted to RGB.
        description: Text to evaluate against the image.

    Returns:
        A dict with the percentage string "confidence_score", BLIP's
        "model_perceived_caption", the rounded "raw_metrics", a textual
        "status" and the boolean "is_valid" (score above 55).
    """
    picture = Image.open(file.file).convert("RGB")
    blip_data = MODELS["blip"]
    processor = blip_data["processor"]
    captioner = blip_data["model"]

    def caption_loss(text):
        # Loss the model assigns to `text` for this image, using the
        # tokenised text itself as the labels.
        encoded = processor(images=picture, text=text, return_tensors="pt").to(DEVICE)
        with torch.no_grad():
            return captioner(**encoded, labels=encoded["input_ids"]).loss.item()

    # 1. Baseline: beam-search caption and the loss of that caption.
    pixel_batch = processor(images=picture, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        beam_ids = captioner.generate(
            **pixel_batch,
            max_length=50,
            num_beams=5,
            temperature=1.0,
        )
    baseline_caption = processor.decode(beam_ids[0], skip_special_tokens=True)
    baseline_loss = caption_loss(baseline_caption)

    # 2. Loss of the user's description on the same image.
    user_loss = caption_loss(description)

    # 3. Relative score: (baseline / user) ** 1.5 as a percentage, capped
    #    at 100.
    relative_ratio = baseline_loss / user_loss
    confidence_score = min(100.0, round((relative_ratio ** 1.5) * 100, 2))

    is_valid = confidence_score > 55
    if is_valid:
        status = "Match Found"
    elif confidence_score > 30:
        status = "Partial Match"
    else:
        status = "No Match"

    return {
        "confidence_score": f"{confidence_score}%",
        "model_perceived_caption": baseline_caption,
        "raw_metrics": {
            "user_loss": round(user_loss, 4),
            "baseline_loss": round(baseline_loss, 4),
            "delta": round(user_loss - baseline_loss, 4)
        },
        "status": status,
        "is_valid": is_valid
    }
@app.post("/concept-ensemble")
async def concept_ensemble(file: UploadFile = File(...), user_prompt: str = Query(...)):
    """Compare a user prompt with BLIP's caption of the uploaded image.

    Args:
        file: Uploaded image; it is opened with PIL and converted to RGB.
        user_prompt: The user's own description of the image.

    Returns:
        A dict with both captions, three rounded similarity figures and an
        "interpretation" label ("Perspective Divergence" when the calibrated
        overlap is below 0.6, otherwise "Strong Agreement").
    """
    picture = Image.open(file.file).convert("RGB")
    blip = MODELS["blip"]
    processor = blip["processor"]
    net = blip["model"]

    # BLIP's own caption for the image.
    pixel_batch = processor(images=picture, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        caption_ids = net.generate(**pixel_batch, max_length=40)
        model_caption = processor.decode(caption_ids[0], skip_special_tokens=True)

    def get_clean_embedding(text):
        # Mean of the text decoder's last hidden states over dim 1,
        # L2-normalised along the last dimension.
        tokens = processor(text=text, return_tensors="pt", padding=True).to(DEVICE)
        with torch.no_grad():
            hidden = net.text_decoder.bert(**tokens).last_hidden_state
            return F.normalize(hidden.mean(dim=1), p=2, dim=-1)

    user_embed = get_clean_embedding(user_prompt)
    model_embed = get_clean_embedding(model_caption)

    # Word-level calibration: Jaccard overlap of the lower-cased,
    # whitespace-split word sets, so shared phrasing alone cannot push the
    # score up when the actual words differ.
    user_words = set(user_prompt.lower().split())
    model_words = set(model_caption.lower().split())
    all_words = user_words | model_words
    jaccard_sim = len(user_words & model_words) / len(all_words) if all_words else 0

    # Embedding similarity, then the 40/60 blend with the word overlap.
    raw_sim = torch.matmul(user_embed, model_embed.T).item()
    calibrated_overlap = (raw_sim * 0.4) + (jaccard_sim * 0.6)

    # Visual alignment: position 0 of the vision model's last hidden state
    # against the user-prompt embedding.
    # NOTE(review): presumably position 0 is the [CLS] token, and this
    # assumes vision and text-decoder states are directly comparable --
    # TODO confirm.
    with torch.no_grad():
        vision_outputs = net.vision_model(pixel_batch["pixel_values"])
        image_embed = F.normalize(vision_outputs.last_hidden_state[:, 0, :], p=2, dim=-1)
        sim_image_user = torch.matmul(image_embed, user_embed.T).item()

    if calibrated_overlap < 0.6:
        interpretation = "Perspective Divergence"
    else:
        interpretation = "Strong Agreement"

    return {
        "captions": {"user": user_prompt, "model": model_caption},
        "similarity_scores": {
            "semantic_overlap": round(calibrated_overlap, 4),
            "visual_alignment": round(sim_image_user, 4),
            "word_match_penalty": round(1 - jaccard_sim, 2)
        },
        "interpretation": interpretation
    }
@app.post("/saliency-explorer/image")
async def get_saliency_heatmap(file: UploadFile = File(...), query_text: str = Query(...)):
    """Return the image blended with a cross-attention heatmap for `query_text`.

    Args:
        file: Uploaded image; its bytes are read and decoded as RGB.
        query_text: Text fed to BLIP alongside the image.

    Returns:
        A `StreamingResponse` carrying a PNG of the original image blended
        50/50 with a 'jet'-coloured, blurred attention map.
    """
    payload = await file.read()
    source_img = Image.open(io.BytesIO(payload)).convert("RGB")

    blip = MODELS["blip"]
    net = blip["model"]
    encoded = blip["processor"](images=source_img, text=query_text, return_tensors="pt").to(DEVICE)

    # The text decoder is where image features and text tokens interact, so
    # its cross-attention is what gets visualised.
    with torch.no_grad():
        image_states = net.vision_model(encoded.pixel_values).last_hidden_state
        decoded = net.text_decoder(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            encoder_hidden_states=image_states,
            output_attentions=True,
        )

        # Last layer only. The slice takes batch item 0, drops the first and
        # last text positions and the first image position, then averages
        # over the two leading axes.
        # NOTE(review): assumes layout (batch, heads, text tokens, image
        # positions) with special tokens at the dropped positions -- TODO
        # confirm.
        last_layer = decoded.cross_attentions[-1]
        per_patch = last_layer[0, :, 1:-1, 1:].mean(dim=(0, 1))

        # Assumes the remaining position count is a perfect square.
        side = int(np.sqrt(per_patch.shape[-1]))
        grid = per_patch.view(side, side).cpu().numpy()

    # Min-max normalise; the epsilon avoids dividing by zero on a flat map.
    low, high = grid.min(), grid.max()
    grid = (grid - low) / (high - low + 1e-8)

    # Upscale to the image size and blur to soften the patch edges.
    gray = Image.fromarray((grid * 255).astype('uint8'))
    gray = gray.resize(source_img.size, resample=Image.BICUBIC)
    gray = gray.filter(ImageFilter.GaussianBlur(radius=12))

    # Colour-map, drop the alpha channel and blend over the original.
    rgba = plt.get_cmap('jet')(np.array(gray) / 255.0)
    overlay = Image.fromarray((rgba[:, :, :3] * 255).astype('uint8')).convert("RGB")
    blended = Image.blend(source_img, overlay, alpha=0.5)

    png_buffer = io.BytesIO()
    blended.save(png_buffer, format="PNG")
    png_buffer.seek(0)
    return StreamingResponse(png_buffer, media_type="image/png")

 import torch
 import random
 import asyncio
+import io
+import numpy as np
+import matplotlib.pyplot as plt
 from PIL import Image, ImageFilter
 from fastapi import FastAPI, UploadFile, File, Query
+from fastapi.responses import StreamingResponse
 from huggingface_hub import snapshot_download, login
+import torch.nn.functional as F
 from transformers import (
     BlipProcessor, BlipForConditionalGeneration,
     ViTImageProcessor, AutoProcessor, AutoModelForCausalLM
 )
+app = FastAPI(title="XAI Auditor Ensemble")
+# --- Configuration & State ---
 REPO_ID = "SaniaE/Image_Captioning_Ensemble"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODELS = {}
 MODEL_SETTINGS = {
     "blip": {
         "subfolder": "blip",
         ckpt_path = os.path.join(local_dir, cfg["subfolder"])
         print(f"Loading {name} from {ckpt_path}...")
         model = cfg["inference_model"].from_pretrained(ckpt_path).to(DEVICE)
         if name == "vit":
             i_proc = cfg["processor"][0].from_pretrained(cfg["pretrained_path"][0])
             t_proc = cfg["processor"][1].from_pretrained(cfg["pretrained_path"][1])
             processor = cfg["processor"].from_pretrained(cfg["pretrained_path"])
         MODELS[name] = {"model": model, "processor": processor}
+    print("Optimization Complete: Ensemble is live!")
+# --- Core Logic Helpers ---
def _generate_sync(m_name, image, temp=0.7):
    """Sample one caption for `image` from the model registered as `m_name`.

    This is a blocking call; the endpoints run it through
    `asyncio.to_thread`.

    Args:
        m_name: Key into the module-level `MODELS` registry.
        image: Image handed to the processor as `images=`.
        temp: Sampling temperature forwarded to `model.generate`.

    Returns:
        The first decoded sequence with special tokens skipped and
        surrounding whitespace stripped.
    """
    bundle = MODELS[m_name]
    captioner = bundle["model"]

    # "vit" stores a pair (image processor, text decoder); any other entry
    # uses a single processor for both encoding and decoding.
    if m_name == "vit":
        image_proc, text_proc = bundle["processor"]
    else:
        image_proc = text_proc = bundle["processor"]

    pixel_batch = image_proc(images=image, return_tensors="pt").to(DEVICE)
    sampled_ids = captioner.generate(
        **pixel_batch,
        max_length=50,
        do_sample=True,
        temperature=temp,
    )
    texts = text_proc.batch_decode(sampled_ids, skip_special_tokens=True)
    return texts[0].strip()
+# --- Endpoint 1: The Multi-Perspective Generator ---
@app.post("/generate-caption")
async def generate_caption(file: UploadFile = File(...), temp: float = Query(0.7, gt=0.0)):
    """Caption the uploaded image with both models and return both results.

    Args:
        file: Uploaded image; its bytes are read and decoded as RGB.
        temp: Sampling temperature for both models. It must be strictly
            positive: generation samples with `do_sample=True`, and a
            non-positive temperature is rejected there, which would
            otherwise surface as a server error. The `gt=0.0` constraint
            lets FastAPI reject such requests up front.

    Returns:
        A dict with "blip_caption" and "vit_git_caption".
    """
    image_bytes = await file.read()
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

    # Generation is blocking, so each model runs in its own worker thread;
    # gather preserves argument order in its result.
    blip_caption, vit_caption = await asyncio.gather(
        asyncio.to_thread(_generate_sync, "blip", image, temp),
        asyncio.to_thread(_generate_sync, "vit", image, temp),
    )

    return {
        "blip_caption": blip_caption,
        "vit_git_caption": vit_caption
    }
+# --- Endpoint 2: The Saliency Explorer (XAI Glow) ---
@app.post("/saliency-explorer")
async def get_saliency_map(file: UploadFile = File(...), query_text: str = Query(...)):
    """Return the image blended with a cross-attention heatmap for `query_text`.

    Args:
        file: Uploaded image; its bytes are read and decoded as RGB.
        query_text: Text fed to BLIP alongside the image.

    Returns:
        A `StreamingResponse` carrying a PNG of the original image blended
        50/50 with a 'jet'-coloured, blurred attention map.
    """
    raw_bytes = await file.read()
    base_img = Image.open(io.BytesIO(raw_bytes)).convert("RGB")

    blip_entry = MODELS["blip"]
    blip_model = blip_entry["model"]
    model_inputs = blip_entry["processor"](
        images=base_img, text=query_text, return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():
        vision_hidden = blip_model.vision_model(model_inputs.pixel_values).last_hidden_state
        decoder_out = blip_model.text_decoder(
            input_ids=model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            encoder_hidden_states=vision_hidden,
            output_attentions=True,
        )

        # Last layer's cross-attention for batch item 0. The slice drops the
        # first and last text positions and the first image position, then
        # averages over the two leading axes.
        # NOTE(review): assumes layout (batch, heads, text tokens, image
        # positions) with [CLS] at image position 0 -- TODO confirm.
        final_layer_attn = decoder_out.cross_attentions[-1]
        patch_weights = final_layer_attn[0, :, 1:-1, 1:].mean(dim=(0, 1))

        # Assumes the remaining position count is a perfect square.
        edge = int(np.sqrt(patch_weights.shape[-1]))
        heat = patch_weights.view(edge, edge).cpu().numpy()

    # Min-max normalise; the epsilon avoids dividing by zero on a flat map.
    floor, ceiling = heat.min(), heat.max()
    heat = (heat - floor) / (ceiling - floor + 1e-8)

    # Upscale to the image size, then blur to produce the soft glow.
    glow = Image.fromarray((heat * 255).astype('uint8'))
    glow = glow.resize(base_img.size, resample=Image.BICUBIC)
    glow = glow.filter(ImageFilter.GaussianBlur(radius=12))

    # Colour-map, drop the alpha channel and blend over the original.
    coloured = plt.get_cmap('jet')(np.array(glow) / 255.0)
    heatmap_img = Image.fromarray((coloured[:, :, :3] * 255).astype('uint8')).convert("RGB")
    composite = Image.blend(base_img, heatmap_img, alpha=0.5)

    stream = io.BytesIO()
    composite.save(stream, format="PNG")
    stream.seek(0)
    return StreamingResponse(stream, media_type="image/png")
+# --- Endpoint 3: Internal Debate (Audit Mode) ---
@app.post("/internal-debate")
async def internal_debate(file: UploadFile = File(...), user_prompt: str = Query(...)):
    """Compare the user's description with both models' captions.

    Each pair of texts is scored with a blend of embedding cosine similarity
    (weight 0.4) and word-set Jaccard overlap (weight 0.6).

    Args:
        file: Uploaded image; its bytes are read and decoded as RGB.
        user_prompt: The user's own description of the image.

    Returns:
        A dict with the three "perspectives", the rounded "audit_metrics"
        and a "verdict" ("Consensus" when the two model captions score above
        0.65 against each other, otherwise "Perspective Divergence").
    """
    image_bytes = await file.read()
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

    # 1. Gather model perceptions. The two generations are independent, so
    #    they run concurrently in worker threads rather than one after the
    #    other.
    blip_caption, vit_caption = await asyncio.gather(
        asyncio.to_thread(_generate_sync, "blip", image),
        asyncio.to_thread(_generate_sync, "vit", image),
    )

    # 2. Semantic embedding: every text goes through BLIP's text decoder so
    #    all three embeddings come from the same encoder.
    blip_data = MODELS["blip"]

    def get_emb(text):
        # Mean of the last hidden states over dim 1, L2-normalised along the
        # last dimension so the dot product below is a cosine similarity.
        inputs = blip_data["processor"](text=text, return_tensors="pt", padding=True).to(DEVICE)
        with torch.no_grad():
            hidden = blip_data["model"].text_decoder.bert(**inputs).last_hidden_state
            return F.normalize(hidden.mean(dim=1), p=2, dim=-1)

    u_emb = get_emb(user_prompt)
    b_emb = get_emb(blip_caption)
    v_emb = get_emb(vit_caption)

    # 3. Calibration: blend cosine similarity with word overlap so that
    #    texts sharing few words are pulled down.
    def calibrate(emb1, emb2, t1, t2):
        s1, s2 = set(t1.lower().split()), set(t2.lower().split())
        union = s1 | s2
        # Two texts with no words at all score 0 overlap.
        jaccard = len(s1 & s2) / len(union) if union else 0
        cosine = torch.matmul(emb1, emb2.T).item()
        return (cosine * 0.4) + (jaccard * 0.6)

    score_blip = calibrate(u_emb, b_emb, user_prompt, blip_caption)
    score_vit = calibrate(u_emb, v_emb, user_prompt, vit_caption)
    consensus = calibrate(b_emb, v_emb, blip_caption, vit_caption)

    return {
        "perspectives": {
            "user_intent": user_prompt,
            "blip_view": blip_caption,
            "vit_git_view": vit_caption
        },
        "audit_metrics": {
            "user_vs_blip": round(score_blip, 4),
            "user_vs_vit": round(score_vit, 4),
            "inter_model_consensus": round(consensus, 4)
        },
        "verdict": "Consensus" if consensus > 0.65 else "Perspective Divergence"
    }