import io
import os

import torch
import uvicorn
from fastapi import FastAPI, File, HTTPException, UploadFile
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
from ultralytics import YOLO
| |
|
| | |
# FastAPI application instance; endpoints are registered on it below.
app = FastAPI(title="YOLO + GIT Large: Final Visual Description API")

# Prefer GPU when available; both models are placed/run on this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Path to a custom-trained YOLO checkpoint; falls back to the stock
# yolov8n.pt below if this file is missing or fails to load.
MY_MODEL_PATH = 'best.pt'

print(f"🔄 جاري التحميل على جهاز: {device}...")

# Load the object-detection model. Loading can fail (missing file,
# incompatible checkpoint), so fall back to the default nano model
# rather than crashing at startup.
try:
    detection_model = YOLO(MY_MODEL_PATH)
    print("✅ YOLO Model: Loaded successfully")
except Exception as e:
    print(f"⚠️ YOLO Warning: Using default yolov8n.pt - {e}")
    detection_model = YOLO("yolov8n.pt")

# Image-captioning model (GIT-large) and its processor.
# NOTE(review): this load is not guarded like the YOLO load above — a
# network/cache failure here aborts startup; confirm that is intended.
model_name = "microsoft/git-large"
processor = AutoProcessor.from_pretrained(model_name)
caption_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
print(f"✅ Caption Model: {model_name} Loaded")
| |
|
@app.get("/")
def home():
    """Health-check / landing endpoint pointing callers at the docs UI."""
    status_payload = {
        "status": "Online",
        "instruction": "Use /docs to test the /analyze endpoint",
    }
    return status_payload
| |
|
| | |
| |
|
@app.post("/analyze")
async def analyze_image(file: UploadFile = File(...)):
    """Detect objects with YOLO, then caption each detection with GIT-large.

    Returns a JSON payload with one entry per detected object (label,
    confidence, generated visual description). When YOLO finds nothing,
    falls back to captioning the whole image instead.

    Raises:
        HTTPException(400): if the uploaded bytes cannot be decoded as an image.
    """
    data = await file.read()
    try:
        original_image = Image.open(io.BytesIO(data)).convert("RGB")
    except Exception as exc:
        # A bad upload (non-image or truncated file) is a client error,
        # not an unhandled 500.
        raise HTTPException(status_code=400, detail=f"Invalid image file: {exc}")

    # Object detection; conf=0.25 filters out low-confidence boxes.
    results = detection_model(original_image, conf=0.25)
    integrated_results = []

    # Inference only — disable autograd to avoid per-request graph
    # bookkeeping and extra memory use.
    with torch.no_grad():
        for r in results:
            for i, box in enumerate(r.boxes):
                label = r.names[int(box.cls)]
                coords = box.xyxy[0].tolist()

                # Pad the crop so the captioner sees some surrounding
                # context, clamped to the image bounds.
                pad = 20
                left = max(0, coords[0] - pad)
                top = max(0, coords[1] - pad)
                right = min(original_image.width, coords[2] + pad)
                bottom = min(original_image.height, coords[3] + pad)
                cropped_img = original_image.crop((left, top, right, bottom))

                # Caption the cropped region with beam search; the
                # repetition penalty curbs degenerate repeated phrases.
                inputs = processor(images=cropped_img, return_tensors="pt").to(device)
                generated_ids = caption_model.generate(
                    pixel_values=inputs.pixel_values,
                    max_length=60,
                    min_length=12,
                    num_beams=5,
                    repetition_penalty=1.5,
                    early_stopping=True,
                )
                description = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

                integrated_results.append({
                    "object_id": i + 1,
                    "label": label,
                    "confidence": f"{float(box.conf[0]):.2f}",
                    "visual_description": f"Detected {label}: {description.strip()}",
                })

        # No detections: describe the whole scene instead.
        if not integrated_results:
            inputs = processor(images=original_image, return_tensors="pt").to(device)
            out = caption_model.generate(pixel_values=inputs.pixel_values, max_length=50)
            general_desc = processor.batch_decode(out, skip_special_tokens=True)[0]
            return {
                "message": "No specific objects detected by YOLO.",
                "general_scene_description": general_desc,
            }

    return {
        "detected_count": len(integrated_results),
        "results": integrated_results,
    }
| |
|
| | |
if __name__ == "__main__":
    # Serve on all interfaces. NOTE(review): 7860 looks like the
    # Hugging Face Spaces convention — confirm deployment target.
    uvicorn.run(app, host="0.0.0.0", port=7860)