Spaces:

SaniaE
/

Image_Captioning_Ensemble_API

Sleeping

App Files Files Community

SaniaE committed 14 days ago

Commit

b5397cf

verified ·

1 Parent(s): c441112

updated endpoint logic

Browse files

Files changed (1) hide show

app.py +49 -58

app.py CHANGED Viewed

@@ -159,93 +159,84 @@ async def ui_tester(file: UploadFile = File(...), description: str = Query(...))
 async def concept_ensemble(file: UploadFile = File(...), user_prompt: str = Query(...)):
     image = Image.open(file.file).convert("RGB")
     blip = MODELS["blip"]
-    # 1. Model Baseline (Generating its own perception)
     inputs_gen = blip["processor"](images=image, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
         generated_ids = blip["model"].generate(**inputs_gen, max_length=40)
         model_caption = blip["processor"].decode(generated_ids[0], skip_special_tokens=True)
-    # 2. Embedding Calculation
-    texts = [user_prompt, model_caption]
-    inputs_text = blip["processor"](text=texts, return_tensors="pt", padding=True).to(DEVICE)
     with torch.no_grad():
-        # 1. Get Image Embeddings from the vision_model
         vision_outputs = blip["model"].vision_model(inputs_gen["pixel_values"])
-        image_embeds = vision_outputs.last_hidden_state[:, 0, :] # Use [CLS] token
-        # 2. Get Text Embeddings using the text_decoder's bert model
-        # BLIP's text_decoder typically wraps a BERT-like architecture
-        text_outputs = blip["model"].text_decoder.bert(**inputs_text)
-        text_embeds = text_outputs.last_hidden_state[:, 0, :] # Use [CLS] token
-        # Normalize
-        image_embeds = F.normalize(image_embeds, p=2, dim=-1)
-        text_embeds = F.normalize(text_embeds, p=2, dim=-1)
-        # Similarity Matrix calculation
-        sim_image_user = torch.matmul(image_embeds, text_embeds[0].T).item()
-        sim_image_model = torch.matmul(image_embeds, text_embeds[1].T).item()
-        sim_user_model = torch.matmul(text_embeds[0], text_embeds[1].T).item()
     return {
-        "captions": {
-            "user": user_prompt,
-            "model_best_guess": model_caption
-        },
         "similarity_scores": {
-            "visual_alignment_user": round(float(sim_image_user), 4),
-            "visual_alignment_model": round(float(sim_image_model), 4),
-            "semantic_overlap": round(float(sim_user_model), 4)
         },
-        "interpretation": "Strong Agreement" if sim_user_model > 0.85 else "Diverse Perspectives"
     }
 @app.post("/saliency-explorer/image")
 async def get_saliency_heatmap(file: UploadFile = File(...), query_text: str = Query(...)):
-    # 1. Load Image
     image_bytes = await file.read()
     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     blip = MODELS["blip"]
-    # Ensure pixel_values can track gradients
     inputs = blip["processor"](images=orig_img, text=query_text, return_tensors="pt").to(DEVICE)
-    inputs.pixel_values.requires_grad = True
-    # 2. Extract Gradients for Saliency
-    outputs = blip["model"](**inputs, labels=inputs["input_ids"])
-    loss = outputs.loss
-    loss.backward()
-    # Generate Saliency from gradients of pixel values
-    # We take the maximum absolute gradient across the RGB channels
-    grad = inputs.pixel_values.grad.abs().max(dim=1)[0][0].cpu().numpy()
-    # 3. Create Heatmap with "Glow" Effect (XAI Style)
-    # Normalize to [0, 1]
-    grad = (grad - grad.min()) / (grad.max() - grad.min() + 1e-8)
-    # Apply Gaussian Blur to smooth tiny speckles into a professional heatmap
-    grad_pill = Image.fromarray((grad * 255).astype('uint8'))
-    grad_pill = grad_pill.filter(ImageFilter.GaussianBlur(radius=8))
-    grad_smoothed = np.array(grad_pill) / 255.0
-    # Apply colormap (jet)
     cm = plt.get_cmap('jet')
-    heatmap_rgba = cm(grad_smoothed)
-    # Convert heatmap to PIL and resize to original image dimensions
     heatmap_img = Image.fromarray((heatmap_rgba[:, :, :3] * 255).astype('uint8')).convert("RGB")
-    heatmap_img = heatmap_img.resize(orig_img.size, resample=Image.BILINEAR)
-    # 4. Blend Original + Heatmap (Adjust alpha for visibility on dark/light UIs)
-    # 0.5 alpha provides a strong clear highlight for the "Rorompok" sofa
     blended_img = Image.blend(orig_img, heatmap_img, alpha=0.5)
-    # 5. Stream back
     buf = io.BytesIO()
     blended_img.save(buf, format="PNG")
     buf.seek(0)
     return StreamingResponse(buf, media_type="image/png")

 async def concept_ensemble(file: UploadFile = File(...), user_prompt: str = Query(...)):
     """Compare a user-supplied caption with BLIP's own caption for an image.

     The handler generates a caption for the uploaded image, embeds both the
     user prompt and the generated caption with the text decoder's BERT
     stack (mean over all token states, L2-normalised), embeds the image
     with the first token state of the vision encoder (L2-normalised), and
     reports the three pairwise dot products.

     Args:
         file: Uploaded image; converted to RGB before processing.
         user_prompt: Caption proposed by the caller.

     Returns:
         A dict with the two captions, the three similarity scores rounded
         to four decimals, and an "interpretation" label derived from the
         unrounded user/model caption similarity (threshold 0.8).
     """
     rgb_image = Image.open(file.file).convert("RGB")
     blip = MODELS["blip"]
     processor = blip["processor"]
     model = blip["model"]

     def _mean_pooled_text_embedding(text):
         # Every token state contributes equally; no keyword filtering is
         # applied, the sentence vector is the plain mean over the sequence.
         encoded = processor(text=text, return_tensors="pt", padding=True).to(DEVICE)
         with torch.no_grad():
             hidden = model.text_decoder.bert(**encoded).last_hidden_state
             pooled = hidden.mean(dim=1)
             return F.normalize(pooled, p=2, dim=-1)

     # BLIP's own description of the image.
     inputs_gen = processor(images=rgb_image, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
         caption_ids = model.generate(**inputs_gen, max_length=40)
         model_caption = processor.decode(caption_ids[0], skip_special_tokens=True)

     user_vec = _mean_pooled_text_embedding(user_prompt)
     model_vec = _mean_pooled_text_embedding(model_caption)

     # NOTE(review): the image vector comes from the vision encoder and the
     # text vectors from the text decoder, with no projection into a shared
     # space visible here — confirm the cross-modal scores are meaningful.
     with torch.no_grad():
         vision_states = model.vision_model(inputs_gen["pixel_values"]).last_hidden_state
         image_vec = F.normalize(vision_states[:, 0, :], p=2, dim=-1)

     # All vectors are unit-length, so each dot product is a cosine similarity.
     image_vs_user = (image_vec @ user_vec.T).item()
     image_vs_model = (image_vec @ model_vec.T).item()
     user_vs_model = (user_vec @ model_vec.T).item()

     if user_vs_model > 0.8:
         verdict = "Strong Agreement"
     else:
         verdict = "Perspective Divergence"

     scores = {
         "visual_alignment_user": round(image_vs_user, 4),
         "visual_alignment_model": round(image_vs_model, 4),
         "semantic_overlap": round(user_vs_model, 4),
     }
     return {
         "captions": {"user": user_prompt, "model": model_caption},
         "similarity_scores": scores,
         "interpretation": verdict,
     }
 @app.post("/saliency-explorer/image")
 async def get_saliency_heatmap(file: UploadFile = File(...), query_text: str = Query(...)):
     """Return the uploaded image blended with a vision-attention heatmap.

     The heatmap is the last-layer self-attention of the vision encoder,
     taken from the first ([CLS]) token to every patch token and averaged
     over heads. It is min-max normalised, upscaled to the image size,
     blurred, coloured with the 'jet' colormap and blended 50/50 with the
     original image.

     Args:
         file: Uploaded image; converted to RGB before processing.
         query_text: Required by the endpoint for API compatibility. The map
             is computed from pixel values only, so this text does not
             influence the result.

     Returns:
         A StreamingResponse carrying the blended image as PNG.
     """
     image_bytes = await file.read()
     orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
     blip = MODELS["blip"]

     inputs = blip["processor"](images=orig_img, return_tensors="pt").to(DEVICE)
     with torch.no_grad():
         # Query the vision encoder directly. The wrapper model's output does
         # not reliably expose a `vision_model_output` field (the captioning
         # head returns the vision attentions under `attentions` instead), so
         # going through the sub-module avoids an AttributeError and skips an
         # unnecessary text-decoder pass.
         vision_outputs = blip["model"].vision_model(
             pixel_values=inputs["pixel_values"],
             output_attentions=True,
         )

     # Last layer, first batch item: (heads, tokens, tokens).
     last_layer = vision_outputs.attentions[-1][0]
     # Attention from the first token to the patch tokens, averaged over heads.
     cls_to_patches = last_layer[:, 0, 1:].mean(dim=0)
     # Patch tokens are laid out as a square grid.
     grid_size = int(np.sqrt(cls_to_patches.shape[-1]))
     # .float() keeps the NumPy conversion valid for half/bfloat16 weights.
     mask = cls_to_patches.reshape(grid_size, grid_size).float().cpu().numpy()

     # 1. Normalize to [0, 1] (epsilon guards a constant map) and upscale.
     mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-8)
     mask_pill = Image.fromarray((mask * 255).astype('uint8')).resize(
         orig_img.size, resample=Image.BICUBIC
     )

     # 2. Blur so the coarse patch grid reads as a smooth heatmap.
     mask_pill = mask_pill.filter(ImageFilter.GaussianBlur(radius=15))
     mask_final = np.array(mask_pill) / 255.0

     # 3. Apply the colormap (dropping its alpha channel) and blend.
     cm = plt.get_cmap('jet')
     heatmap_rgba = cm(mask_final)
     heatmap_img = Image.fromarray((heatmap_rgba[:, :, :3] * 255).astype('uint8')).convert("RGB")
     blended_img = Image.blend(orig_img, heatmap_img, alpha=0.5)

     buf = io.BytesIO()
     blended_img.save(buf, format="PNG")
     buf.seek(0)
     return StreamingResponse(buf, media_type="image/png")