# Final_App / app.py
# ek-5's picture
# Update app.py
# f2df4ab verified
import os
import torch
import io
from fastapi import FastAPI, File, UploadFile
from transformers import AutoProcessor, AutoModelForCausalLM
from ultralytics import YOLO
from PIL import Image
import uvicorn
# --- 1. Application and model setup ---
app = FastAPI(title="YOLO + GIT Large: Final Visual Description API")

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

MY_MODEL_PATH = 'best.pt'
print(f"🔄 جاري التحميل على جهاز: {device}...")

# Load the custom YOLO weights. If anything goes wrong while loading them,
# report the error and use the stock yolov8n.pt weights instead.
try:
    detection_model = YOLO(MY_MODEL_PATH)
    print("✅ YOLO Model: Loaded successfully")
except Exception as err:
    print(f"⚠️ YOLO Warning: Using default yolov8n.pt - {err}")
    detection_model = YOLO("yolov8n.pt")

# Load the GIT-Large captioning model together with its processor, then move
# the model onto the device selected above.
model_name = "microsoft/git-large"
processor = AutoProcessor.from_pretrained(model_name)
caption_model = AutoModelForCausalLM.from_pretrained(model_name)
caption_model = caption_model.to(device)
print(f"✅ Caption Model: {model_name} Loaded")
@app.get("/")
def home():
    """Report that the service is online and point callers at /docs."""
    payload = {}
    payload["status"] = "Online"
    payload["instruction"] = "Use /docs to test the /analyze endpoint"
    return payload
# --- 2. Processing and analysis ---
def _caption_image(image, **generate_kwargs):
    """Return the caption the captioning model generates for *image*.

    No text prompt is supplied: only the pixel values produced by the
    processor are passed to ``generate``. Any keyword arguments are forwarded
    to ``caption_model.generate`` unchanged.
    """
    inputs = processor(images=image, return_tensors="pt").to(device)
    generated_ids = caption_model.generate(
        pixel_values=inputs.pixel_values,
        **generate_kwargs,
    )
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


@app.post("/analyze")
async def analyze_image(file: UploadFile = File(...)):
    """Detect objects in an uploaded image and caption each detection.

    The upload is decoded to RGB and passed to the detection model with a
    confidence threshold of 0.25. Every detected box is cropped with a
    20-pixel margin (clamped to the image bounds) and captioned separately.

    Returns:
        When at least one object is detected, a dict with ``detected_count``
        and ``results`` (one entry per box holding ``object_id``, ``label``,
        ``confidence`` as a two-decimal string, and ``visual_description``).
        Otherwise a dict with ``message`` and ``general_scene_description``,
        the latter being a caption of the whole image.

    Raises:
        HTTPException: with status 400 when the upload cannot be decoded as
            an image.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import list.
    from fastapi import HTTPException

    data = await file.read()
    try:
        # convert() forces the pixel data to load, so truncated or corrupt
        # files fail here instead of later inside the models.
        original_image = Image.open(io.BytesIO(data)).convert("RGB")
    except (OSError, Image.DecompressionBombError) as err:
        raise HTTPException(
            status_code=400,
            detail="Uploaded file is not a readable image.",
        ) from err

    # NOTE(review): detection and captioning run synchronously inside this
    # coroutine, so nothing else is awaited while they execute -- confirm
    # that this is acceptable for the expected request volume.
    results = detection_model(original_image, conf=0.25)

    # Margin, in pixels, added around each box so the crop keeps enough
    # context to show the object's shape and colour.
    pad = 20
    width, height = original_image.size

    integrated_results = []
    for r in results:
        for box in r.boxes:
            label = r.names[int(box.cls)]
            left, top, right, bottom = box.xyxy[0].tolist()

            # Expand the box by the margin while staying inside the image.
            crop_box = (
                max(0, left - pad),
                max(0, top - pad),
                min(width, right + pad),
                min(height, bottom + pad),
            )
            cropped_img = original_image.crop(crop_box)

            description = _caption_image(
                cropped_img,
                max_length=60,   # upper bound on the caption length
                min_length=12,   # lower bound, to avoid very short captions
                num_beams=5,     # beam search width
                repetition_penalty=1.5,
                early_stopping=True,
            )

            integrated_results.append({
                # Running count, so ids stay unique across every entry in
                # ``results`` rather than restarting for each one.
                "object_id": len(integrated_results) + 1,
                "label": label,
                "confidence": f"{float(box.conf[0]):.2f}",
                "visual_description": f"Detected {label}: {description.strip()}",
            })

    # Nothing was detected: caption the whole image instead.
    if not integrated_results:
        general_desc = _caption_image(original_image, max_length=50)
        return {
            "message": "No specific objects detected by YOLO.",
            "general_scene_description": general_desc,
        }

    return {
        "detected_count": len(integrated_results),
        "results": integrated_results,
    }
# --- 3. Run the server ---
if __name__ == "__main__":
    # When executed as a script, serve the app on every interface, port 7860.
    uvicorn.run(
        app,
        port=7860,
        host="0.0.0.0",
    )