import io
import os

import torch
from fastapi import FastAPI, File, UploadFile
from transformers import AutoProcessor, AutoModelForCausalLM
from ultralytics import YOLO
from PIL import Image
import uvicorn

# --- 1. Application and model setup ---
app = FastAPI(title="YOLO + GIT Large: Final Visual Description API")

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Path of the custom-trained YOLO weights.
MY_MODEL_PATH = "best.pt"

print(f"🔄 جاري التحميل على جهاز: {device}...")

# Load the custom YOLO detector; if that fails for any reason, report the
# error and fall back to the stock yolov8n.pt weights.
try:
    detection_model = YOLO(MY_MODEL_PATH)
    print("✅ YOLO Model: Loaded successfully")
except Exception as load_error:
    print(f"⚠️ YOLO Warning: Using default yolov8n.pt - {load_error}")
    detection_model = YOLO("yolov8n.pt")

# Load the GIT-Large captioning model and its processor.
model_name = "microsoft/git-large"
processor = AutoProcessor.from_pretrained(model_name)
caption_model = AutoModelForCausalLM.from_pretrained(model_name)
caption_model = caption_model.to(device)
print(f"✅ Caption Model: {model_name} Loaded")


# Root endpoint: reports that the service is up and points to /docs.
@app.get("/")
def home():
    status_payload = {
        "status": "Online",
        "instruction": "Use /docs to test the /analyze endpoint",
    }
    return status_payload
# --- 2. Image processing and analysis ---
def _caption_image(image, **generate_kwargs) -> str:
    """Return the caption the GIT model generates for a PIL image.

    Args:
        image: PIL image to describe (the callers pass RGB images).
        **generate_kwargs: Keyword arguments forwarded unchanged to
            ``caption_model.generate`` (e.g. ``max_length``, ``num_beams``).

    Returns:
        The first decoded sequence with special tokens removed.
    """
    # No text prompt is supplied: the model describes the image from the
    # pixel values alone.
    inputs = processor(images=image, return_tensors="pt").to(device)
    generated_ids = caption_model.generate(
        pixel_values=inputs.pixel_values,
        **generate_kwargs,
    )
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


@app.post("/analyze")
async def analyze_image(file: UploadFile = File(...)):
    """Detect objects in an uploaded image and caption each detection.

    Every YOLO detection is cropped (with padding) and captioned on its own.
    When nothing is detected, the whole image is captioned instead.

    Args:
        file: The uploaded image file.

    Returns:
        ``{"detected_count", "results"}`` when at least one object is
        detected, otherwise ``{"message", "general_scene_description"}``.

    Raises:
        HTTPException: 400 when the upload cannot be decoded as an image.
    """
    # Local import: HTTPException is not among the names imported at file level.
    from fastapi import HTTPException

    # Read and decode the upload. A non-image or corrupt payload is a client
    # error, so report it as 400 instead of letting it surface as a 500.
    data = await file.read()
    try:
        original_image = Image.open(io.BytesIO(data)).convert("RGB")
    except (OSError, ValueError) as exc:
        raise HTTPException(
            status_code=400,
            detail="Uploaded file is not a valid image.",
        ) from exc

    # NOTE: detection and captioning below are synchronous calls made
    # directly inside this coroutine.
    results = detection_model(original_image, conf=0.25)

    # Padding (in pixels) added around each box so the crop shows the
    # object's shape and color clearly.
    pad = 20
    integrated_results = []

    for r in results:
        for box in r.boxes:
            label = r.names[int(box.cls)]
            x1, y1, x2, y2 = box.xyxy[0].tolist()

            # Expand the box by the padding, clamped to the image bounds.
            crop_box = (
                max(0, x1 - pad),
                max(0, y1 - pad),
                min(original_image.width, x2 + pad),
                min(original_image.height, y2 + pad),
            )
            cropped_img = original_image.crop(crop_box)

            description = _caption_image(
                cropped_img,
                max_length=60,  # room to describe color and shape
                min_length=12,  # push the model toward a detailed caption
                num_beams=5,  # beam search for better wording
                repetition_penalty=1.5,
                early_stopping=True,
            )

            integrated_results.append({
                # Running counter, so IDs stay unique even if the detector
                # returns more than one result object.
                "object_id": len(integrated_results) + 1,
                "label": label,
                "confidence": f"{float(box.conf[0]):.2f}",
                "visual_description": f"Detected {label}: {description.strip()}",
            })

    # Nothing detected: describe the whole scene instead.
    if not integrated_results:
        general_desc = _caption_image(original_image, max_length=50)
        return {
            "message": "No specific objects detected by YOLO.",
            "general_scene_description": general_desc,
        }

    return {
        "detected_count": len(integrated_results),
        "results": integrated_results,
    }


# --- 3. Run the server ---
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)