Spaces:

SaniaE
/

Image_Captioning_Ensemble_API

Sleeping

App Files Files Community

SaniaE committed on 15 days ago

Commit

ed741f2

verified ·

1 Parent(s): 8f6f2d9

added app.py

Browse files

Files changed (1) hide show

app.py +152 -0

app.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import os
+import torch
+import random
+from PIL import Image
+from fastapi import FastAPI, UploadFile, File, Query
+from fastapi.middleware.cors import CORSMiddleware
+from huggingface_hub import snapshot_download, login
+from transformers import (
+    BlipProcessor, BlipForConditionalGeneration,
+    ViTImageProcessor, AutoProcessor, AutoModelForCausalLM
+)
+from sentence_transformers import SentenceTransformer, util
+app = FastAPI()
+# Configuration
+REPO_ID = "SaniaE/Image_Captioning_Ensemble"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+MODELS = {}
+SEARCH_MODEL = None
+# We'll map your local folder names to the specific config
+MODEL_SETTINGS = {
+    "blip": {
+        "subfolder": "blip",
+        "processor": BlipProcessor,
+        "pretrained_path": "Salesforce/blip-image-captioning-large",
+        "inference_model": BlipForConditionalGeneration
+    },
+    "vit": {
+        "subfolder": "vit",
+        "processor": [ViTImageProcessor, AutoProcessor],
+        "pretrained_path": ["nlpconnect/vit-gpt2-image-captioning", "microsoft/git-large"],
+        "inference_model": AutoModelForCausalLM
+    },
+    "git": {
+        "subfolder": "git",
+        "processor": AutoProcessor,
+        "pretrained_path": "microsoft/git-base",
+        "inference_model": AutoModelForCausalLM
+    }
+}
+@app.on_event("startup")
+async def startup_event():
+    global MODELS, SEARCH_MODEL
+    # 1. Authenticate and Download from Private Repo
+    token = os.getenv("HF_TOKEN")
+    if token:
+        login(token=token)
+    print(f"Downloading ensemble models from {REPO_ID}...")
+    # This downloads the whole repo into a local 'weights' directory
+    local_dir = snapshot_download(repo_id=REPO_ID, token=token, local_dir="weights")
+    # 2. Load Models from the downloaded folders
+    for name, cfg in MODEL_SETTINGS.items():
+        ckpt_path = os.path.join(local_dir, cfg["subfolder"])
+        inf_model = cfg["inference_model"]
+        pretrained = cfg["pretrained_path"]
+        proc_class = cfg["processor"]
+        print(f"Loading {name} from {ckpt_path}...")
+        # from_pretrained handles .safetensors automatically
+        model = inf_model.from_pretrained(ckpt_path).to(DEVICE)
+        if name == "vit":
+            i_proc = proc_class[0].from_pretrained(pretrained[0])
+            t_proc = proc_class[1].from_pretrained(pretrained[1])
+            processor = (i_proc, t_proc)
+        else:
+            processor = proc_class.from_pretrained(pretrained)
+        MODELS[name] = {"model": model, "processor": processor}
+    SEARCH_MODEL = SentenceTransformer('clip-ViT-B-32')
+    print("Ensemble is live!")
+@app.post("/generate")
+async def generate_endpoint(
+    file: UploadFile = File(...),
+    temp: float = Query(0.8),
+    top_k: int = Query(100),
+    top_p: float = Query(0.9)
+):
+    image = Image.open(file.file).convert("RGB")
+    captions = []
+    # Randomly select which models to use for the 5 slots
+    available = list(MODELS.keys())
+    model_selection = random.choices(available, k=5)
+    for m_name in model_selection:
+        m_data = MODELS[m_name]
+        model = m_data["model"]
+        if m_name == "vit":
+            i_proc, t_proc = m_data["processor"]
+            pixel_values = i_proc(images=image, return_tensors="pt").pixel_values.to(DEVICE)
+            gen_ids = model.generate(
+                pixel_values=pixel_values, max_length=200, do_sample=True,
+                temperature=temp, top_k=top_k, top_p=top_p
+            )
+            cap = t_proc.batch_decode(gen_ids, skip_special_tokens=True)[0]
+        else:
+            proc = m_data["processor"]
+            pixel_values = proc(images=image, return_tensors="pt").pixel_values.to(DEVICE)
+            gen_ids = model.generate(
+                pixel_values=pixel_values, max_length=200, do_sample=True,
+                temperature=temp, top_k=top_k, top_p=top_p
+            )
+            cap = proc.batch_decode(gen_ids, skip_special_tokens=True)[0]
+        captions.append(cap.strip())
+    return {"captions": captions, "mix": model_selection}
+@app.post("/ui-tester")
+async def ui_tester(file: UploadFile = File(...), description: str = Query(...)):
+    """Matches a user description against an image using CLIP embeddings."""
+    image = Image.open(file.file).convert("RGB")
+    img_emb = SEARCH_MODEL.encode(image)
+    txt_emb = SEARCH_MODEL.encode(description)
+    # Calculate cosine similarity
+    score = util.cos_sim(img_emb, txt_emb).item()
+    return {
+        "match_score": round(score, 4),
+        "is_match": score > 0.25, # Threshold can be adjusted
+        "status": "High correlation" if score > 0.3 else "Low correlation"
+    }
+@app.get("/ui-search")
+async def ui_search(description: str = Query(...)):
+    """Returns top image matches from a gallery based on a text description."""
+    if not IMAGE_GALLERY_EMBEDDINGS:
+        return {"error": "Gallery not initialized"}
+    query_emb = SEARCH_MODEL.encode(description)
+    hits = util.semantic_search(query_emb, IMAGE_GALLERY_EMBEDDINGS, top_k=3)
+    results = []
+    for hit in hits[0]:
+        results.append({
+            "image_path": IMAGE_PATHS[hit['corpus_id']],
+            "score": round(hit['score'], 4)
+        })
+    return {"results": results}