ek-5 committed on
Commit
0cff19b
·
verified ·
1 Parent(s): 9daf1a5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -13
app.py CHANGED
@@ -7,7 +7,7 @@ from ultralytics import YOLO
7
  from PIL import Image
8
  import uvicorn
9
 
10
- app = FastAPI(title="YOLO + GIT Large: Color & Shape Edition")
11
 
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
  MY_MODEL_PATH = 'best.pt'
@@ -15,12 +15,18 @@ MY_MODEL_PATH = 'best.pt'
15
  # تحميل الموديلات
16
  try:
17
  detection_model = YOLO(MY_MODEL_PATH)
 
18
  except:
19
  detection_model = YOLO("yolov8n.pt")
 
20
 
21
  processor = AutoProcessor.from_pretrained("microsoft/git-large")
22
  caption_model = AutoModelForCausalLM.from_pretrained("microsoft/git-large").to(device)
23
 
 
 
 
 
24
  @app.post("/analyze")
25
  async def analyze_image(file: UploadFile = File(...)):
26
  data = await file.read()
@@ -34,35 +40,46 @@ async def analyze_image(file: UploadFile = File(...)):
34
  label = r.names[int(box.cls)]
35
  coords = box.xyxy[0].tolist()
36
 
37
- # قص العنصر مع هامش (Padding) لرؤية الشكل واللون بوضوح
38
- pad = 10
39
  cropped_img = original_image.crop((
40
  max(0, coords[0]-pad), max(0, coords[1]-pad),
41
  min(original_image.width, coords[2]+pad), min(original_image.height, coords[3]+pad)
42
  ))
43
 
44
- # --- التعديل الجوهري للوصف ---
45
- # نستخدم برومبت مفتوح ليقوم الموديل بالوصف التلقائي
46
- inputs = processor(images=cropped_img, return_tensors="pt").to(device)
 
 
47
 
48
  generated_ids = caption_model.generate(
49
  pixel_values=inputs.pixel_values,
50
- max_new_tokens=50,
 
51
  num_beams=5,
52
- do_sample=True, # تفعيل التنوع في الكلمات
53
- temperature=0.8, # درجة "إبداع" لوصف الألوان بدقة
54
- repetition_penalty=1.2
55
  )
56
 
57
- description = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
 
 
 
 
 
58
 
59
  integrated_results.append({
60
  "object_id": i + 1,
61
  "label": label,
62
- "visual_description": f"This {label} is {description}"
63
  })
64
 
 
 
 
65
  return {"results": integrated_results}
66
 
67
  if __name__ == "__main__":
68
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
7
  from PIL import Image
8
  import uvicorn
9
 
10
+ app = FastAPI(title="YOLO + GIT Large: Visual Analysis (Color & Shape)")
11
 
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
  MY_MODEL_PATH = 'best.pt'
 
15
  # تحميل الموديلات
16
  try:
17
  detection_model = YOLO(MY_MODEL_PATH)
18
+ print("✅ YOLO Model Loaded")
19
  except:
20
  detection_model = YOLO("yolov8n.pt")
21
+ print("⚠️ Using Default YOLOv8n")
22
 
23
  processor = AutoProcessor.from_pretrained("microsoft/git-large")
24
  caption_model = AutoModelForCausalLM.from_pretrained("microsoft/git-large").to(device)
25
 
26
+ @app.get("/")
27
+ def home():
28
+ return {"message": "Server is running. Use /docs to test /analyze endpoint."}
29
+
30
  @app.post("/analyze")
31
  async def analyze_image(file: UploadFile = File(...)):
32
  data = await file.read()
 
40
  label = r.names[int(box.cls)]
41
  coords = box.xyxy[0].tolist()
42
 
43
+ # قص العنصر مع هامش (Padding) 15 بكسل لرؤية الزوايا والأطراف بدقة
44
+ pad = 15
45
  cropped_img = original_image.crop((
46
  max(0, coords[0]-pad), max(0, coords[1]-pad),
47
  min(original_image.width, coords[2]+pad), min(original_image.height, coords[3]+pad)
48
  ))
49
 
50
+ # --- التعديل هنا: برومبت يركز على الصفات البصرية ---
51
+ # بدأنا الجملة بصفات "اللون والشكل" ليقوم الموديل بإكمال الوصف
52
+ prompt = f"a photo of a {label}. the specific color and shape of this {label} are"
53
+
54
+ inputs = processor(images=cropped_img, text=prompt, return_tensors="pt").to(device)
55
 
56
  generated_ids = caption_model.generate(
57
  pixel_values=inputs.pixel_values,
58
+ input_ids=inputs.input_ids,
59
+ max_new_tokens=40, # عدد كلمات كافٍ للوصف
60
  num_beams=5,
61
+ repetition_penalty=1.3,
62
+ do_sample=False # نستخدم Beam Search هنا لدقة أعلى في الألوان
 
63
  )
64
 
65
+ full_desc = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
66
+
67
+ # تنظيف النتيجة لاستخراج الوصف فقط بعد البرومبت
68
+ if prompt in full_desc:
69
+ visual_details = full_desc.split(prompt)[-1].strip()
70
+ else:
71
+ visual_details = full_desc.replace(f"a photo of a {label}", "").strip()
72
 
73
  integrated_results.append({
74
  "object_id": i + 1,
75
  "label": label,
76
+ "visual_description": f"The {label} has {visual_details}"
77
  })
78
 
79
+ if not integrated_results:
80
+ return {"message": "No objects detected."}
81
+
82
  return {"results": integrated_results}
83
 
84
  if __name__ == "__main__":
85
+ uvicorn.run(app, host="0.0.0.0", port=7860)