import os
import io
import json
import base64
import gradio as gr
import numpy as np
import torch
import subprocess
import sys
from PIL import Image
from huggingface_hub import hf_hub_download
from typing import Optional, Dict, Any, List, Union

# Setup OmniParser repository and models
def setup_omniparser():
    """Clone OmniParser repository and download model weights"""
    try:
        # Check if OmniParser repository exists
        if not os.path.exists("OmniParser"):
            print("Cloning OmniParser repository...")
            subprocess.run(["git", "clone", "https://github.com/microsoft/OmniParser.git"], check=True)
        
        # Add OmniParser to Python path
        omniparser_path = os.path.abspath("OmniParser")
        if omniparser_path not in sys.path:
            sys.path.append(omniparser_path)
            print(f"Added {omniparser_path} to Python path")
        
        # Create weights directory
        os.makedirs("OmniParser/weights/icon_detect", exist_ok=True)
        os.makedirs("OmniParser/weights/icon_caption_florence", exist_ok=True)
        
        # Download model weights if they don't exist
        if not os.path.exists("OmniParser/weights/icon_detect/model.pt") or not os.path.exists("OmniParser/weights/icon_caption_florence/model.safetensors"):
            print("Downloading model weights...")
            
            # Download detection model files
            for f in ["train_args.yaml", "model.pt", "model.yaml"]:
                hf_hub_download(
                    repo_id="microsoft/OmniParser-v2.0",
                    filename=f"icon_detect/{f}",
                    local_dir="OmniParser/weights"
                )
            
            # Download caption model files
            for f in ["config.json", "generation_config.json", "model.safetensors"]:
                hf_hub_download(
                    repo_id="microsoft/OmniParser-v2.0",
                    filename=f"icon_caption/{f}",
                    local_dir="OmniParser/weights"
                )
                
            # Rename the caption folder to match expected path
            if os.path.exists("OmniParser/weights/icon_caption") and not os.path.exists("OmniParser/weights/icon_caption_florence"):
                os.rename("OmniParser/weights/icon_caption", "OmniParser/weights/icon_caption_florence")
        
        # Patch PaddleOCR initialization in utils.py to fix compatibility issue
        utils_path = os.path.join(omniparser_path, "util", "utils.py")
        if os.path.exists(utils_path):
            print("Patching utils.py to fix compatibility issues...")
            
            # Create a simplified version of utils.py with essential functions.
            # The outer literal uses ''' so the nested """ docstrings do not
            # terminate it prematurely.
            simplified_utils = '''import io
import base64
import numpy as np
import torch
from PIL import Image, ImageDraw


def check_ocr_box(image, display_img=False, output_bb_format='xyxy', goal_filtering=None,
                  easyocr_args=None, use_paddleocr=True):
    """Custom implementation of check_ocr_box that uses EasyOCR."""
    try:
        import easyocr

        # Convert PIL Image to numpy array
        img_np = np.array(image)

        # Initialize EasyOCR
        reader = easyocr.Reader(['en'])

        # Run OCR
        results = reader.readtext(img_np)

        # Extract text and bounding boxes
        texts = []
        boxes = []
        for result in results:
            box, text, _ = result
            texts.append(text)

            # Convert box format if needed
            if output_bb_format == 'xyxy':
                # Convert from [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] to [x1,y1,x3,y3]
                x1, y1 = box[0]
                x3, y3 = box[2]
                boxes.append([x1, y1, x3, y3])
            else:
                boxes.append(box)

        return (texts, boxes), False
    except Exception as e:
        print(f"Error in OCR: {str(e)}")
        return ([], []), False


def get_yolo_model(model_path):
    """Load YOLO model for icon detection."""
    try:
        from ultralytics import YOLO
        model = YOLO(model_path)
        return model
    except Exception as e:
        print(f"Error loading YOLO model: {str(e)}")
        return None


def get_caption_model_processor(model_name, model_name_or_path):
    """Load caption model and processor."""
    try:
        from transformers import AutoProcessor, AutoModelForCausalLM

        processor = AutoProcessor.from_pretrained(model_name_or_path)
        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        return (model, processor)
    except Exception as e:
        print(f"Error loading caption model: {str(e)}")
        return None


def get_som_labeled_img(image, yolo_model, BOX_TRESHOLD=0.05, output_coord_in_ratio=True,
                        ocr_bbox=None, draw_bbox_config=None, caption_model_processor=None,
                        ocr_text=None, iou_threshold=0.1, imgsz=640):
    """Simplified implementation of get_som_labeled_img."""
    try:
        # Create a copy of the image for visualization
        vis_img = image.copy()
        draw = ImageDraw.Draw(vis_img)

        # Run YOLO detection
        results = yolo_model(image, imgsz=imgsz)

        # Process results
        elements = []
        for i, det in enumerate(results[0].boxes.data):
            # Cast tensor values to plain Python floats before drawing
            x1, y1, x2, y2, conf, cls = (float(v) for v in det)

            if conf < BOX_TRESHOLD:
                continue

            # Draw bounding box
            draw.rectangle([x1, y1, x2, y2], outline="red", width=2)

            # Generate caption
            caption = f"UI Element {i}"

            # Add to elements list
            elements.append({
                "id": i,
                "text": "",
                "caption": caption,
                "coordinates": [x1/image.width, y1/image.height, x2/image.width, y2/image.height],
                "is_interactable": True,
                "confidence": conf
            })

        # Convert to base64
        buffered = io.BytesIO()
        vis_img.save(buffered, format="PNG")
        img_str = "data:image/png;base64," + base64.b64encode(buffered.getvalue()).decode()

        return img_str, [], elements
    except Exception as e:
        print(f"Error in get_som_labeled_img: {str(e)}")
        return "Error processing image", [], []
'''
            
            # Write the simplified utils.py
            with open(utils_path, 'w') as f:
                f.write(simplified_utils)
            print("Created simplified utils.py with essential functions")
        
        print("OmniParser setup completed successfully!")
        return True
    except Exception as e:
        print(f"Error setting up OmniParser: {str(e)}")
        return False

# Setup OmniParser
setup_success = setup_omniparser()

# Create our own implementation of check_ocr_box to avoid PaddleOCR issues
def custom_check_ocr_box(image, display_img=False, output_bb_format='xyxy', goal_filtering=None,
                         easyocr_args=None, use_paddleocr=True):
    """Custom implementation of check_ocr_box that doesn't rely on PaddleOCR."""
    print("Using custom OCR implementation (EasyOCR only)")
    try:
        import easyocr
        import numpy as np
        
        # Convert PIL Image to numpy array
        img_np = np.array(image)
        
        # Initialize EasyOCR
        reader = easyocr.Reader(['en'])
        
        # Run OCR
        results = reader.readtext(img_np)
        
        # Extract text and bounding boxes
        texts = []
        boxes = []
        
        for result in results:
            box, text, _ = result
            texts.append(text)
            
            # Convert box format if needed
            if output_bb_format == 'xyxy':
                # Convert from [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] to [x1,y1,x3,y3]
                x1, y1 = box[0]
                x3, y3 = box[2]
                boxes.append([x1, y1, x3, y3])
            else:
                boxes.append(box)
        
        return (texts, boxes), False
    except Exception as e:
        print(f"Error in custom OCR: {str(e)}")
        return ([], []), False
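
# Example of custom_check_ocr_box's return shape (illustrative only; actual
# values depend on the screenshot and on EasyOCR's output):
#   (texts, boxes), _ = custom_check_ocr_box(img)
#   texts -> ["Submit", "Cancel", ...]
#   boxes -> [[x1, y1, x3, y3], ...]   # xyxy corners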

# Import OmniParser utilities
if setup_success:
    try:
        # First try to import the patched version
        from OmniParser.util.utils import get_yolo_model, get_caption_model_processor, get_som_labeled_img
        
        # Try to import check_ocr_box, but use our custom version if it fails
        try:
            from OmniParser.util.utils import check_ocr_box
            print("Successfully imported all OmniParser utilities")
        except (ImportError, ValueError) as e:
            print(f"Using custom OCR implementation due to error: {str(e)}")
            check_ocr_box = custom_check_ocr_box
    except ImportError as e:
        print(f"Error importing OmniParser utilities: {str(e)}")
        # Fallback to a simple error message
        def error_message(*args, **kwargs):
            return "Error: OmniParser utilities could not be imported. Please check the logs."
        
        # Create dummy functions that return error messages
        check_ocr_box = get_yolo_model = get_caption_model_processor = get_som_labeled_img = error_message
else:
    print("Using dummy functions due to setup failure")
    # Create dummy functions that return error messages
    def error_message(*args, **kwargs):
        return "Error: OmniParser setup failed. Please check the logs."
    
    check_ocr_box = get_yolo_model = get_caption_model_processor = get_som_labeled_img = error_message

# Initialize models
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Initialize models with correct paths
try:
    # YOLO model for object detection
    yolo_model = get_yolo_model(model_path='OmniParser/weights/icon_detect/model.pt')
    
    # VLM (Vision Language Model) for captioning
    caption_model_processor = get_caption_model_processor(
        model_name="florence2", 
        model_name_or_path="OmniParser/weights/icon_caption_florence"
    )
    
    print("Models initialized successfully")
    models_initialized = True
    
    # ENHANCEMENT OPPORTUNITY: Data Fusion
    # The current implementation uses YOLO for detection and VLM for captioning separately.
    # A more integrated approach could:
    # 1. Use YOLO for initial detection of UI elements
    # 2. Use VLM to refine the detections and provide more context
    # 3. Implement a confidence-based merging strategy for overlapping detections
    # 4. Use SAM (Segment Anything Model) for more precise segmentation of UI elements
    #
    # Example implementation:
    # ```
    # def enhanced_detection(image, yolo_model, vlm_model, sam_model):
    #     # Get YOLO detections
    #     yolo_boxes = yolo_model(image)
    #     
    #     # Use VLM to analyze the entire image for context
    #     global_context = vlm_model.analyze_image(image)
    #     
    #     # For each YOLO box, use VLM to get more detailed information
    #     refined_detections = []
    #     for box in yolo_boxes:
    #         # Crop the region
    #         region = crop_image(image, box)
    #         
    #         # Get VLM description
    #         description = vlm_model.describe_region(region, context=global_context)
    #         
    #         # Use SAM for precise segmentation
    #         mask = sam_model.segment(image, box)
    #         
    #         refined_detections.append({
    #             "box": box,
    #             "description": description,
    #             "mask": mask,
    #             "confidence": combine_confidence(box.conf, description.conf)
    #         })
    #     
    #     return refined_detections
    # ```
    
except Exception as e:
    print(f"Error initializing models: {str(e)}")
    # Create dummy models for graceful failure
    yolo_model = None
    caption_model_processor = None
    models_initialized = False
    
# Fallback implementation for when OmniParser fails
def fallback_process_image(image):
    """

    Fallback implementation that simulates OmniParser functionality

    for when the actual models fail to load

    """
    from PIL import Image, ImageDraw, ImageFont
    import random
    
    # Create a copy of the image for visualization
    vis_img = image.copy()
    draw = ImageDraw.Draw(vis_img)
    
    # Define some mock UI element types
    element_types = ["Button", "Text Field", "Checkbox", "Dropdown", "Menu Item", "Icon", "Link"]
    
    # Generate some random elements
    elements = []
    num_elements = min(10, int(image.width * image.height / 50000))  # Scale with image size
    
    for i in range(num_elements):
        # Generate random position and size
        x1 = random.randint(0, image.width - 100)
        y1 = random.randint(0, image.height - 50)
        width = random.randint(50, 200)
        height = random.randint(30, 80)
        x2 = min(x1 + width, image.width)
        y2 = min(y1 + height, image.height)
        
        # Generate random element type and caption
        element_type = random.choice(element_types)
        captions = {
            "Button": ["Submit", "Cancel", "OK", "Apply", "Save"],
            "Text Field": ["Enter text", "Username", "Password", "Search", "Email"],
            "Checkbox": ["Select option", "Enable feature", "Remember me", "Agree to terms"],
            "Dropdown": ["Select item", "Choose option", "Select country", "Language"],
            "Menu Item": ["File", "Edit", "View", "Help", "Tools", "Settings"],
            "Icon": ["Home", "Settings", "Profile", "Notification", "Search"],
            "Link": ["Learn more", "Click here", "Details", "Documentation", "Help"]
        }
        text = random.choice(captions[element_type])
        caption = f"{element_type}: {text}"
        
        # Add to elements list
        elements.append({
            "id": i,
            "text": text,
            "caption": caption,
            "coordinates": [x1/image.width, y1/image.height, x2/image.width, y2/image.height],
            "is_interactable": element_type in ["Button", "Checkbox", "Dropdown", "Link", "Text Field"],
            "confidence": random.uniform(0.7, 0.95)
        })
        
        # Draw on visualization
        draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
        draw.text((x1, y1 - 10), f"{i}: {text}", fill="red")
    
    return {
        "elements": elements,
        "visualization": vis_img,
        "note": "This is a fallback visualization as OmniParser models could not be loaded."
    }
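
# Hypothetical helper referenced by the enhancement sketches inside
# process_image below: a minimal IoU (intersection-over-union) implementation
# assuming boxes in xyxy format [x1, y1, x2, y2]. Not used by the current
# pipeline; included as a sketch only.
def calculate_iou(box_a, box_b):
    # Intersection rectangle
    ix1 = max(box_a[0], box_b[0])
    iy1 = max(box_a[1], box_b[1])
    ix2 = min(box_a[2], box_b[2])
    iy2 = min(box_a[3], box_b[3])
    intersection = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    # Union = sum of both areas minus the intersection
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - intersection
    return intersection / union if union > 0 else 0.0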

def process_image(
    image: Image.Image,
    box_threshold: float = 0.05,
    iou_threshold: float = 0.1,
    use_paddleocr: bool = True,
    imgsz: int = 640
) -> Dict[str, Any]:
    """
    Process an image with OmniParser and return structured data.

    Args:
        image: PIL Image to process
        box_threshold: Threshold for bounding box confidence
        iou_threshold: Threshold for IOU overlap
        use_paddleocr: Whether to use PaddleOCR for text detection
        imgsz: Image size for icon detection

    Returns:
        Dictionary with parsed elements and visualization
    """
    # Check if models are initialized
    if not models_initialized or yolo_model is None or caption_model_processor is None:
        print("Models not initialized properly, using fallback implementation")
        return fallback_process_image(image)
    
    try:
        # Calculate overlay ratio based on image size
        box_overlay_ratio = image.size[0] / 3200
        
        # Configure drawing parameters
        draw_bbox_config = {
            'text_scale': 0.8 * box_overlay_ratio,
            'text_thickness': max(int(2 * box_overlay_ratio), 1),
            'text_padding': max(int(3 * box_overlay_ratio), 1),
            'thickness': max(int(3 * box_overlay_ratio), 1),
        }
        
        # Run OCR to detect text
        try:
            # ENHANCEMENT OPPORTUNITY: OCR Integration
            # The current implementation uses OCR separately from YOLO detection.
            # A more integrated approach could:
            # 1. Use OCR results to refine YOLO detections
            # 2. Merge overlapping text and UI element detections
            # 3. Use text content to improve element classification
            #
            # Example implementation:
            # ```
            # def integrated_ocr_detection(image, ocr_results, yolo_detections):
            #     merged_detections = []
            #     
            #     # For each YOLO detection
            #     for yolo_box in yolo_detections:
            #         # Find overlapping OCR text
            #         overlapping_text = []
            #         for text, text_box in ocr_results:
            #             if calculate_iou(yolo_box, text_box) > threshold:
            #                 overlapping_text.append(text)
            #         
            #         # Use text content to refine element classification
            #         element_type = classify_element_with_text(yolo_box, overlapping_text)
            #         
            #         merged_detections.append({
            #             "box": yolo_box,
            #             "text": " ".join(overlapping_text),
            #             "type": element_type
            #         })
            #     
            #     return merged_detections
            # ```
            
            ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
                image, 
                display_img=False, 
                output_bb_format='xyxy', 
                goal_filtering=None, 
                easyocr_args={'paragraph': False, 'text_threshold': 0.9}, 
                use_paddleocr=use_paddleocr
            )
            
            # Check if OCR returned an error message (string)
            if isinstance(ocr_bbox_rslt, str):
                print(f"OCR error: {ocr_bbox_rslt}, using fallback implementation")
                return fallback_process_image(image)
                
            text, ocr_bbox = ocr_bbox_rslt
        except Exception as e:
            print(f"OCR error: {str(e)}, using fallback implementation")
            return fallback_process_image(image)
        
        # Process image with OmniParser
        try:
            # ENHANCEMENT OPPORTUNITY: SAM Integration
            # The current implementation doesn't use SAM (Segment Anything Model).
            # Integrating SAM could:
            # 1. Provide more precise segmentation of UI elements
            # 2. Better handle complex UI layouts with overlapping elements
            # 3. Improve detection of irregular-shaped elements
            #
            # Example implementation:
            # ```
            # def integrate_sam(image, boxes, sam_model):
            #     # Initialize SAM predictor
            #     predictor = SamPredictor(sam_model)
            #     predictor.set_image(np.array(image))
            #     
            #     refined_elements = []
            #     for box in boxes:
            #         # Convert box to SAM input format
            #         input_box = np.array([box[0], box[1], box[2], box[3]])
            #         
            #         # Get SAM mask
            #         masks, scores, _ = predictor.predict(
            #             box=input_box,
            #             multimask_output=False
            #         )
            #         
            #         # Use the mask to refine the element boundaries
            #         refined_elements.append({
            #             "box": box,
            #             "mask": masks[0],
            #             "mask_confidence": scores[0]
            #         })
            #     
            #     return refined_elements
            # ```
            
            dino_labeled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
                image,
                yolo_model,
                BOX_TRESHOLD=box_threshold,
                output_coord_in_ratio=True,
                ocr_bbox=ocr_bbox,
                draw_bbox_config=draw_bbox_config,
                caption_model_processor=caption_model_processor,
                ocr_text=text,
                iou_threshold=iou_threshold,
                imgsz=imgsz
            )
            
            # Check if get_som_labeled_img returned an error message (string)
            if isinstance(dino_labeled_img, str) and not dino_labeled_img.startswith("data:"):
                print(f"OmniParser error: {dino_labeled_img}, using fallback implementation")
                return fallback_process_image(image)
            
            # Convert base64 image to PIL Image (strip the data-URI prefix first,
            # since base64.b64decode cannot handle "data:image/png;base64,")
            b64_data = dino_labeled_img.split(",", 1)[1] if dino_labeled_img.startswith("data:") else dino_labeled_img
            visualization = Image.open(io.BytesIO(base64.b64decode(b64_data)))
            
            # Create structured output
            elements = []
            for i, element in enumerate(parsed_content_list):
                # ENHANCEMENT OPPORTUNITY: Confidence Scoring
                # The current implementation uses a simple confidence score.
                # A more sophisticated approach could:
                # 1. Combine confidence scores from multiple models (YOLO, VLM, OCR)
                # 2. Consider element context and relationships
                # 3. Use historical data to improve confidence scoring
                #
                # Example implementation:
                # ```
                # def calculate_confidence(yolo_conf, vlm_conf, ocr_conf, element_type):
                #     # Base confidence from YOLO
                #     base_conf = yolo_conf
                #     
                #     # Adjust based on VLM confidence
                #     if vlm_conf > 0.8:
                #         base_conf = (base_conf + vlm_conf) / 2
                #     
                #     # Adjust based on element type
                #     if element_type == "button" and ocr_conf > 0.9:
                #         base_conf = (base_conf + ocr_conf) / 2
                #     
                #     # Normalize to 0-1 range
                #     return min(1.0, base_conf)
                # ```
                
                elements.append({
                    "id": i,
                    "text": element.get("text", ""),
                    "caption": element.get("caption", ""),
                    "coordinates": element.get("coordinates", []),
                    "is_interactable": element.get("is_interactable", False),
                    "confidence": element.get("confidence", 0.0)
                })
            
            # ENHANCEMENT OPPORTUNITY: Predictive Monitoring
            # The current implementation doesn't include predictive monitoring.
            # Adding this could:
            # 1. Verify that detected elements make sense in the UI context
            # 2. Identify missing or incorrectly detected elements
            # 3. Provide feedback for improving detection accuracy
            #
            # Example implementation:
            # ```
            # def verify_detections(elements, image, vlm_model):
            #     # Use VLM to analyze the entire image
            #     global_description = vlm_model.describe_image(image)
            #     
            #     # Check if detected elements match the global description
            #     expected_elements = extract_expected_elements(global_description)
            #     
            #     # Compare detected vs expected
            #     missing_elements = [e for e in expected_elements if not any(
            #         similar_element(e, detected) for detected in elements
            #     )]
            #     
            #     # Provide feedback
            #     return {
            #         "verified_elements": elements,
            #         "missing_elements": missing_elements,
            #         "confidence": calculate_overall_confidence(elements, expected_elements)
            #     }
            # ```
            
            # Return structured data and visualization
            return {
                "elements": elements,
                "visualization": visualization
            }
        except Exception as e:
            print(f"OmniParser error: {str(e)}, using fallback implementation")
            return fallback_process_image(image)
    except Exception as e:
        print(f"Error processing image: {str(e)}, using fallback implementation")
        # Use fallback implementation
        return fallback_process_image(image)

# API endpoint function
def api_endpoint(image):
    """

    API endpoint that accepts an image and returns parsed elements

    

    Args:

        image: Uploaded image file

        

    Returns:

        JSON with parsed elements

    """
    if image is None:
        return json.dumps({"error": "No image provided"})
    
    try:
        # Process the image
        result = process_image(image)
        
        # Check if there was an error
        if "error" in result:
            return json.dumps({
                "status": "error",
                "error": result["error"],
                "elements": []
            })
        
        # Convert visualization to base64 for JSON response
        buffered = io.BytesIO()
        result["visualization"].save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode()
        
        # Create response
        response = {
            "status": "success",
            "elements": result["elements"],
            "visualization": img_str
        }
        
        return json.dumps(response)
    except Exception as e:
        print(f"API endpoint error: {str(e)}")
        return json.dumps({
            "status": "error",
            "error": f"API processing error: {str(e)}",
            "elements": []
        })
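
# Convenience sketch for consumers of api_endpoint's JSON response: decodes
# the base64 "visualization" field back into a PIL image. Assumes the field
# layout produced above; not wired into the Gradio UI.
def decode_visualization(b64_str: str) -> Image.Image:
    # Tolerate an optional data-URI prefix
    if b64_str.startswith("data:"):
        b64_str = b64_str.split(",", 1)[1]
    return Image.open(io.BytesIO(base64.b64decode(b64_str)))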

# Function to handle UI submission
def handle_submission(image, box_threshold=0.05, iou_threshold=0.1, use_paddleocr=True, imgsz=640):
    """Handle UI submission and provide appropriate feedback"""
    if image is None:
        return {"error": "No image provided"}, None
    
    # Process the image
    result = process_image(
        image, 
        box_threshold=box_threshold, 
        iou_threshold=iou_threshold, 
        use_paddleocr=use_paddleocr, 
        imgsz=imgsz
    )
    
    # Return the result
    if "error" in result:
        return {"error": result["error"]}, result.get("visualization", None)
    elif "note" in result:
        # This is from the fallback implementation
        return {
            "note": result["note"],
            "elements": result["elements"]
        }, result["visualization"]
    else:
        return {"elements": result["elements"]}, result["visualization"]

# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("""

    # OmniParser v2.0 API

    

    Upload an image to parse UI elements and get structured data.

    

    ## Quick Start

    

    You can use the [test UI image](/file=static/test_ui.png) to try out the API, or upload your own UI screenshot.

    

    ## API Usage

    

    You can use this API by sending a POST request with a file upload to this URL.

    

    ```python

    import requests

    

    # Replace with your actual API URL after deployment

    OMNIPARSER_API_URL = "https://your-username-omniparser-api.hf.space/api/parse"

    

    # Upload a file

    files = {'image': open('screenshot.png', 'rb')}

    

    # Send request

    response = requests.post(OMNIPARSER_API_URL, files=files)

    

    # Get JSON result

    result = response.json()

    ```

    """)
    
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type='pil', label='Upload image')
            
            # Function to load test image
            def load_test_image():
                if os.path.exists("static/test_ui.png"):
                    return Image.open("static/test_ui.png")
                return None
            
            test_image_button = gr.Button(value='Load Test Image')
            test_image_button.click(fn=load_test_image, inputs=[], outputs=[image_input])
            
            with gr.Accordion("Advanced Options", open=False):
                box_threshold = gr.Slider(
                    label='Box Threshold', 
                    minimum=0.01, 
                    maximum=1.0, 
                    step=0.01, 
                    value=0.05
                )
                iou_threshold = gr.Slider(
                    label='IOU Threshold', 
                    minimum=0.01, 
                    maximum=1.0, 
                    step=0.01, 
                    value=0.1
                )
                use_paddleocr = gr.Checkbox(
                    label='Use PaddleOCR', 
                    value=True
                )
                imgsz = gr.Slider(
                    label='Icon Detect Image Size', 
                    minimum=640, 
                    maximum=1920, 
                    step=32, 
                    value=640
                )
            
            submit_button = gr.Button(value='Parse Image', variant='primary')
            
            # Status message
            status = gr.Markdown("Ready to parse images")
        
        with gr.Column():
            json_output = gr.JSON(label='Parsed Elements (JSON)')
            image_output = gr.Image(type='pil', label='Visualization')
    
    # Connect the interface
    submit_button.click(
        fn=handle_submission,
        inputs=[image_input, box_threshold, iou_threshold, use_paddleocr, imgsz],
        outputs=[json_output, image_output],
        api_name="parse"  # This creates the /api/parse endpoint
    )
    
    # Function to get status
    def get_status():
        if models_initialized:
            return f"✅ OmniParser v2.0 API - Running on {'GPU' if torch.cuda.is_available() else 'CPU'}"
        else:
            return "⚠️ OmniParser v2.0 API - Running in fallback mode (models not loaded)"
    
    # Update status on load
    demo.load(
        fn=get_status,
        outputs=status
    )

# Create test image if it doesn't exist
try:
    if not os.path.exists("static/test_ui.png"):
        print("Creating test UI image...")
        from create_test_image import create_test_ui_image
        test_image_path = create_test_ui_image()
        print(f"Test image created at {test_image_path}")
except Exception as e:
    print(f"Error creating test image: {str(e)}")

# Launch the app
demo.launch()