import os
import io
import asyncio
import random
import numpy as np
import torch
import matplotlib.pyplot as plt
from PIL import Image, ImageFilter
from fastapi import FastAPI, UploadFile, File, Query
from fastapi.responses import StreamingResponse
from huggingface_hub import snapshot_download, login

from transformers import (
    BlipProcessor, BlipForConditionalGeneration,
    ViTImageProcessor, AutoProcessor, AutoModelForCausalLM,
    CLIPModel, CLIPProcessor
)


app = FastAPI(title="XAI Auditor Ensemble with CLIP Jury")

REPO_ID = "SaniaE/Image_Captioning_Ensemble"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODELS = {}

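# Registry of ensemble members: fine-tuned weights are fetched from subfolders of
# REPO_ID at startup, while processors are rebuilt from the base hub checkpoints
# listed here. The "vit" entry lists two processors (image preprocessing and text
# decoding), which the startup loader stores as a tuple.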
MODEL_CONFIGS = {
    "blip": {
        "subfolder": "blip",
        "proc_class": BlipProcessor,
        "model_class": BlipForConditionalGeneration,
        "base_path": "Salesforce/blip-image-captioning-large"
    },
    "vit": {
        "subfolder": "vit",
        "proc_classes": [ViTImageProcessor, AutoProcessor],
        "model_class": AutoModelForCausalLM,
        "base_paths": ["nlpconnect/vit-gpt2-image-captioning", "microsoft/git-large"]
    },
    "clip": {
        "model_subfolder": "clip/clip_model",
        "proc_subfolder": "clip/clip_processor"
    }
}


@app.on_event("startup")
async def startup_event():
    global MODELS
    token = os.getenv("HF_TOKEN")
    if token:
        login(token=token)

    print(f"Syncing weights from {REPO_ID}...")
    local_dir = snapshot_download(repo_id=REPO_ID, token=token, local_dir="weights")

    cfg_b = MODEL_CONFIGS["blip"]
    MODELS["blip"] = {
        "model": cfg_b["model_class"].from_pretrained(os.path.join(local_dir, cfg_b["subfolder"])).to(DEVICE),
        "processor": cfg_b["proc_class"].from_pretrained(cfg_b["base_path"])
    }

    cfg_v = MODEL_CONFIGS["vit"]
    MODELS["vit"] = {
        "model": cfg_v["model_class"].from_pretrained(os.path.join(local_dir, cfg_v["subfolder"])).to(DEVICE),
        "processor": (
            cfg_v["proc_classes"][0].from_pretrained(cfg_v["base_paths"][0]),
            cfg_v["proc_classes"][1].from_pretrained(cfg_v["base_paths"][1])
        )
    }

    cfg_c = MODEL_CONFIGS["clip"]
    MODELS["clip"] = {
        "model": CLIPModel.from_pretrained(os.path.join(local_dir, cfg_c["model_subfolder"])).to(DEVICE),
        "processor": CLIPProcessor.from_pretrained(os.path.join(local_dir, cfg_c["proc_subfolder"]))
    }

    print("All models synchronized. Auditor is active.")

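# To launch the service (a minimal sketch; assumes this module is saved as
# app.py and that uvicorn is installed; adjust the module path to your layout):
#
#   uvicorn app:app --host 0.0.0.0 --port 8000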


def _generate_sync(m_name, image, temp, top_k, top_p):
    m_data = MODELS[m_name]
    if m_name == "vit":
        i_proc, t_proc = m_data["processor"]
    else:
        i_proc = t_proc = m_data["processor"]
    inputs = i_proc(images=image, return_tensors="pt").to(DEVICE)
    ids = m_data["model"].generate(
        **inputs, max_length=80, do_sample=True,
        temperature=temp, top_k=top_k, top_p=top_p
    )
    return t_proc.batch_decode(ids, skip_special_tokens=True)[0].strip()
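
# Minimal local smoke test for _generate_sync (a sketch; assumes startup_event has
# already populated MODELS, and "sample.jpg" is a hypothetical image path):
#
#   img = Image.open("sample.jpg").convert("RGB")
#   print(_generate_sync("blip", img, temp=0.8, top_k=50, top_p=0.9))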


@app.post("/generate")
async def generate_captions(
    file: UploadFile = File(...),
    temp: float = Query(0.8),
    top_k: int = Query(50),
    top_p: float = Query(0.9)
):
    """Generates 5 diverse captions using the model ensemble."""
    image_bytes = await file.read()
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
    architectures = ["blip", "vit"]
    selection = random.choices(architectures, k=5)

    tasks = [asyncio.to_thread(_generate_sync, m, image, temp, top_k, top_p) for m in selection]
    captions = await asyncio.gather(*tasks)

    return {"captions": captions, "metadata": {"models_used": selection, "temp": temp}}
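
# Example request (a sketch; host, port, and file name are illustrative):
#
#   curl -X POST "http://localhost:8000/generate?temp=0.8&top_k=50&top_p=0.9" \
#        -F "file=@photo.jpg"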


@app.post("/saliency")
async def get_vision_saliency(file: UploadFile = File(...)):
    """Objective Saliency: Shows what the Vision Encoder focuses on (Self-Attention)."""
    image_bytes = await file.read()
    orig_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")

    blip = MODELS["blip"]
    inputs = blip["processor"](images=orig_img, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        outputs = blip["model"].vision_model(inputs.pixel_values, output_attentions=True)
        attentions = outputs.attentions[-1]

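    # Last-layer attention from the CLS token to the patch tokens, averaged across
    # heads, then reshaped to the square ViT patch grid (e.g. 24x24 for 576 patches).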
    mask_1d = attentions[0, :, 0, 1:].mean(dim=0)
    grid_size = int(np.sqrt(mask_1d.shape[-1]))
    mask = mask_1d.view(grid_size, grid_size).cpu().numpy()

    mask = (mask - mask.min()) / (mask.max() - mask.min() + 1e-8)
    mask_img = Image.fromarray((mask * 255).astype('uint8')).resize(orig_img.size, resample=Image.BICUBIC)
    mask_img = mask_img.filter(ImageFilter.GaussianBlur(radius=10))

    heatmap = plt.get_cmap('magma')(np.array(mask_img) / 255.0)
    heatmap_img = Image.fromarray((heatmap[:, :, :3] * 255).astype('uint8')).convert("RGB")
    blended = Image.blend(orig_img, heatmap_img, alpha=0.6)

    buf = io.BytesIO()
    blended.save(buf, format="PNG")
    buf.seek(0)
    return StreamingResponse(buf, media_type="image/png")
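
# Example request (illustrative host/port; saves the blended heatmap PNG):
#
#   curl -X POST "http://localhost:8000/saliency" -F "file=@photo.jpg" -o saliency.png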


@app.post("/audit")
async def internal_debate_audit(file: UploadFile = File(...), user_prompt: str = Query(...)):
    """The CLIP-Powered Jury: Compares User Intent vs. Model Perception."""
    image_bytes = await file.read()
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

    blip_caption = await asyncio.to_thread(_generate_sync, "blip", image, 0.7, 50, 0.9)

    clip_m = MODELS["clip"]["model"]
    clip_p = MODELS["clip"]["processor"]

    inputs = clip_p(text=[user_prompt, blip_caption], images=image, return_tensors="pt", padding=True).to(DEVICE)

    with torch.no_grad():
        outputs = clip_m(**inputs)
        probs = outputs.logits_per_image.softmax(dim=-1).cpu().numpy()[0]

    u_score, m_score = float(probs[0]), float(probs[1])

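    # The two-way softmax makes these scores relative (they sum to 1), so the
    # thresholds below are heuristic cut-offs, not absolute grounding measures.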
    if u_score < 0.35:
        verdict = "Perspective Divergence: Intent not grounded in image."
    elif abs(u_score - m_score) < 0.15:
        verdict = "Consensus: High Alignment."
    else:
        verdict = "Model Bias Detected."

    return {
        "perspectives": {"user": user_prompt, "ai": blip_caption},
        "audit_scores": {"intent_grounding": round(u_score, 4), "ai_grounding": round(m_score, 4)},
        "verdict": verdict
    }
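
# Example request (a sketch; host/port illustrative, and the prompt must be URL-encoded):
#
#   curl -X POST "http://localhost:8000/audit?user_prompt=a%20dog%20on%20a%20beach" \
#        -F "file=@photo.jpg"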