diff --git a/.gitattributes b/.gitattributes index 34832b75648fde4a03b5ee7b1db774ffe3495a9a..88bae767982309feab8c7c214841a7eac4d4c527 100644 --- a/.gitattributes +++ b/.gitattributes @@ -52,3 +52,153 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text 10samples/sample_0007/overlays/overlay_accepted.png filter=lfs diff=lfs merge=lfs -text 10samples/sample_0007/overlays/overlay_intended.png filter=lfs diff=lfs merge=lfs -text 10samples/sample_0007/overlays/overlay_measured.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_metal_barrier.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_parked_dark_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_pedestrian.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_metal_barrier.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_parked_dark_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_pedestrian.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_metal_barrier.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_parked_dark_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_pedestrian.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_street_trees.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_twilight_sky.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_vehicle_dashboard.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_double_yellow_lines.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_street_trees.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_twilight_sky.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_city_buildings.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_dark_car_left.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_dark_suv_driving.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_double_yellow_lines.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_parked_suv_right.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pedestrian_left.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pedestrian_right.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pink_scooter.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_storefront_sign.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_light.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_signs.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_trees.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_twilight_sky.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_vehicle_dashboard.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_white_car_ahead.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/detect_refine_black_sedan.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/detect_refine_silver_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_black_sedan.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_shopper.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_silver_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_black_sedan.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_shopper.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_silver_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_dark_parked_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_plain_delivery_truck.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_street_lines.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_dark_parked_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_plain_delivery_truck.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_street_lines.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_dark_parked_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_pedestrian_walker.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_plain_delivery_truck.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_red_traffic_light.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_street_lines.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_silver_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_street_trees.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_silver_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_man_in_suit.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_person_yellow_top.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_silver_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_street_trees.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_traffic_light.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_young_girl.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_blooming_trees.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_double_solid_white_line.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_white_panel_van.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_blooming_trees.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_double_solid_white_line.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_white_panel_van.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_blooming_trees.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_curbside_trash_can.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_double_solid_white_line.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_emergency_vehicle.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_green_street_sign.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_museum_banner.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_pedestrian.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_white_panel_van.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_dashboard.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_multi_story_building_left.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_overhead_wires.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_red_brick_building.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_street_light_pole.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_dashboard_reflection.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_passenger.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_dashboard_reflection.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_passenger.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_pedestrian_crossing.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_shopper_waiting.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_businessman.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_dashboard_reflection.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_passenger.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_pedestrian_crossing.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_pedestrian_walking_away.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_protester.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_shopper_standing.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_shopper_waiting.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_street_lamp.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_young_man.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_overpass.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_yellow_building.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_yellow_lane_line.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_overpass.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_yellow_building.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_yellow_lane_line.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_overpass.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_pedestrian_in_suit.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_street_light.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_yellow_building.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_yellow_lane_line.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_black_suv.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_black_suv.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_black_jacket.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_black_suv.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_distant_pedestrian.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_man_pink_shirt.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_backpack.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_black_jacket.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_light_blue_shirt.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_light_jacket.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_red_jacket.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_striped_shirt.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_ego_car_dashboard.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_ego_car_dashboard.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_windshield_mount.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/main_image.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_background_street_light.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_blonde_woman.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_ego_car_dashboard.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_metal_structure.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_oncoming_white_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_overhead_streetlights.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_silver_car.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_vintage_street_lamp.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_white_suv.png filter=lfs diff=lfs merge=lfs -text +samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_windshield_mount.png filter=lfs diff=lfs merge=lfs -text diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/README.md b/samples_v8/driving/BDD100K_CrowdHuman_samples/README.md new file mode 100644 index 0000000000000000000000000000000000000000..874cae8d08f4d741479240ec6bd426b1e974d281 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/README.md @@ -0,0 +1,16 @@ +# samples_v8 + +Generated with `data_recipe_v8.md`: vocabulary-first planning, adaptive canvas selection, structured JSON compose prompts, no identity verification, no gate, SAM white-background reference postprocessing, and strict reference-completeness verification with regenerate-until-pass behavior. + +- chat model: `gcp/google/gemini-3.1-pro-preview` +- image model: `gcp/google/gemini-3-pro-image-preview` +- people references: `white_bg_full_body_front` +- non-person references: `white_bg_encyclopedia_photo` +- SAM postprocess: every generated reference is segmented with `sam_vit_b` and pasted onto pure `#ffffff` background +- reference verify max attempts per subject: `10` +- allowed canvases: `[{"aspect_ratio": "1:1", "size": [1024, 1024], "style": "photorealistic"}, {"aspect_ratio": "4:3", "size": [1152, 864], "style": "photorealistic"}, {"aspect_ratio": "3:4", "size": [864, 1152], "style": "photorealistic"}, {"aspect_ratio": "3:2", "size": [1248, 832], "style": "photorealistic"}, {"aspect_ratio": "2:3", "size": [832, 1248], "style": "photorealistic"}, {"aspect_ratio": "16:9", "size": [1280, 720], "style": "photorealistic"}, {"aspect_ratio": "9:16", "size": [720, 1280], "style": "photorealistic"}]` +- scenario mode: `driving` +- pools: `vocab_task_pool`, `plan_pool`, `scene_pool`, `detection_pool`, `reference_pool` +- bbox overlay: `bbox_overlay.png` draws every planned subject bbox; a sample is rejected and regenerated if any planned subject is still missing after VLM detection retries +- detection max attempts per subject: `3` +- launch args: `{"compose_workers": 3, "detect_max_attempts": 3, "detect_workers": 3, "emit_workers": 4, "idle_sleep": 1.0, "image_inflight": 32, "image_interval": 0.05, "image_max_retries": 8, "max_retries": 3, "no_topup": false, "plan_workers": 6, "ref_verify_max_attempts": 10, "reference_workers": 6, "requeue_in_progress": true, "seed": 1781927993, "status_interval": 30.0, "subject_detect_workers": 24, "target_samples": 10}` diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/dataset.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/dataset.json new file mode 100644 index 0000000000000000000000000000000000000000..3aa1f8473aa27eaa6f873039e8f741363565ffda --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/dataset.json @@ -0,0 +1,3666 @@ +[ + { + "sample_id": "sample_000001", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 3, + "n_detected": 3, + "n_subjects": 3, + "subjects": [ + { + "name": "pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", + "source_name": "pedestrian", + "source_description": "A person wearing a dark coat and trousers. Source dataset: CrowdHuman. Scene context: A bustling city street lined with trees showcasing vibrant yellow autumn foliage, with many pedestrians walking in both directions.", + "sub_caption": "pedestrian: A person wearing a dark coat and trousers.. Scene role: Walking on the sidewalk alongside the street.", + "measured_bbox": [ + 0.177, + 0.0, + 0.3091, + 0.4552 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian.png", + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_pedestrian_attempt_01.png", + "output": "references/ref_pedestrian.png", + "mask": "references/sam_mask_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 40.0, + 693.0, + 999.0 + ], + "mask_score": 3.438137, + "mask_area_ratio": 0.157722, + "elapsed_seconds": 8.8351 + } + }, + { + "name": "parked_dark_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9", + "source_name": "parked dark car", + "source_description": "A dark-colored sedan parked on the right side of the street, behind the silver car. Source dataset: BDD100K. Scene context: A large white New York City bus is driving down a city street on an overcast day with other cars parked and driving around it.", + "sub_caption": "parked dark car: A dark-colored sedan.. Scene role: Parked on the right side of the street next to the curb.", + "measured_bbox": [ + 0.5856, + 0.0522, + 0.9973, + 0.6586 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_parked_dark_car.png", + "raw_ref_image": "references/raw_ref_parked_dark_car_attempt_02.png", + "reference_verify": "references/reference_verify_parked_dark_car.json", + "reference_verify_passed": true, + "reference_attempts": 2, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_parked_dark_car_attempt_02.png", + "output": "references/ref_parked_dark_car.png", + "mask": "references/sam_mask_parked_dark_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 17.0, + 427.0, + 1006.0, + 796.0 + ], + "mask_score": 3.312519, + "mask_area_ratio": 0.186911, + "elapsed_seconds": 8.4991 + } + }, + { + "name": "metal_barrier", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", + "source_name": "metal barrier", + "source_description": "silver metal barricade placed near the entrance Source dataset: CrowdHuman. Scene context: People are gathered outside the entrance of a stone building with arched doorways and large windows.", + "sub_caption": "metal barrier: A silver metal barricade.. Scene role: Placed along the edge of the sidewalk near the parked car, separating the walkway from the street.", + "measured_bbox": [ + 0.0, + 0.1355, + 0.6068, + 0.558 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_metal_barrier.png", + "raw_ref_image": "references/raw_ref_metal_barrier_attempt_01.png", + "reference_verify": "references/reference_verify_metal_barrier.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_metal_barrier_attempt_01.png", + "output": "references/ref_metal_barrier.png", + "mask": "references/sam_mask_metal_barrier.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 37.0, + 201.0, + 1011.0, + 889.0 + ], + "mask_score": 2.936982, + "mask_area_ratio": 0.305722, + "elapsed_seconds": 9.8709 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000002", + "target_total": 15, + "target_people": 2, + "target_objects": 13, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 15, + "n_detected": 15, + "n_subjects": 15, + "subjects": [ + { + "name": "pedestrian_right", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2", + "source_name": "pedestrian", + "source_description": "Person walking away from the camera on the right sidewalk, wearing dark clothing. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a wet city street lined with tall buildings, with pedestrians on the sidewalk and several cars and taxis ahead in the rain.", + "sub_caption": "pedestrian: Person walking away from the camera on the right sidewalk, wearing dark clothing.. Scene role: Walking along the right-hand sidewalk under the trees.", + "measured_bbox": [ + 0.8872, + 0.491, + 0.9451, + 0.6701 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_right.png", + "raw_ref_image": "references/raw_ref_pedestrian_right_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_right.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_right_attempt_01.png", + "output": "references/ref_pedestrian_right.png", + "mask": "references/sam_mask_pedestrian_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 345.0, + 55.0, + 678.0, + 982.0 + ], + "mask_score": 3.462354, + "mask_area_ratio": 0.14014, + "elapsed_seconds": 8.2387 + } + }, + { + "name": "pedestrian_left", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:b714a088-861a043b:person:2", + "source_name": "pedestrian", + "source_description": "another person near the shop entrance on the left, partially obscured Source dataset: BDD100K. Scene context: a city street scene from the perspective of a vehicle, showing multiple cars in motion, buildings lining the road, and pedestrians on the sidewalks during dusk or early evening", + "sub_caption": "pedestrian: A person near a shop entrance on the left, partially obscured by shadows.. Scene role: Standing near a building entrance on the left side of the street.", + "measured_bbox": [ + 0.1301, + 0.5154, + 0.1517, + 0.611 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_left.png", + "raw_ref_image": "references/raw_ref_pedestrian_left_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_left.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_left_attempt_01.png", + "output": "references/ref_pedestrian_left.png", + "mask": "references/sam_mask_pedestrian_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 67.0, + 681.0, + 996.0 + ], + "mask_score": 3.481605, + "mask_area_ratio": 0.150858, + "elapsed_seconds": 8.1403 + } + }, + { + "name": "city_buildings", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", + "source_name": "building", + "source_description": "Various city buildings of different heights forming the skyline in the background. Source dataset: CrowdHuman. Scene context: People are walking and resting on the wooden walkway of a large suspension bridge with a city skyline in the background.", + "sub_caption": "building: Various city buildings of different heights forming the urban landscape along the street.. Scene role: Lining the street and forming the architectural background on both sides.", + "measured_bbox": [ + 0.3358, + 0.3425, + 0.4929, + 0.5277 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_city_buildings.png", + "raw_ref_image": "references/raw_ref_city_buildings_attempt_01.png", + "reference_verify": "references/reference_verify_city_buildings.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_city_buildings_attempt_01.png", + "output": "references/ref_city_buildings.png", + "mask": "references/sam_mask_city_buildings.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 27.0, + 178.0, + 996.0, + 865.0 + ], + "mask_score": 3.420089, + "mask_area_ratio": 0.463421, + "elapsed_seconds": 8.2735 + } + }, + { + "name": "pink_scooter", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_66/283991,a60c0001c08f63b.jpg:object:3", + "source_name": "pink scooter", + "source_description": "A prominent pink step-through style motor scooter. Source dataset: CrowdHuman. Scene context: A busy city intersection with many people riding scooters and some cars in the background.", + "sub_caption": "pink scooter: A prominent pink step-through style motor scooter.. Scene role: Parked on the right sidewalk near the street signs.", + "measured_bbox": [ + 0.677, + 0.56, + 0.7935, + 0.7095 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_pink_scooter.png", + "raw_ref_image": "references/raw_ref_pink_scooter_attempt_01.png", + "reference_verify": "references/reference_verify_pink_scooter.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pink_scooter_attempt_01.png", + "output": "references/ref_pink_scooter.png", + "mask": "references/sam_mask_pink_scooter.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 41.0, + 120.0, + 982.0, + 920.0 + ], + "mask_score": 3.414017, + "mask_area_ratio": 0.259921, + "elapsed_seconds": 8.1841 + } + }, + { + "name": "street_signs", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8", + "source_name": "street signs", + "source_description": "Various street signs attached to a pole on the right side of the street, including a speed limit sign. Source dataset: BDD100K. Scene context: A narrow city street with several parked and moving box trucks and vans, bordered by multi-story buildings and construction barriers.", + "sub_caption": "street signs: Various street signs attached to a metal pole on the right side of the street.. Scene role: Mounted on a pole alongside the road on the right.", + "measured_bbox": [ + 0.8162, + 0.2869, + 0.8575, + 0.4063 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_signs.png", + "raw_ref_image": "references/raw_ref_street_signs_attempt_01.png", + "reference_verify": "references/reference_verify_street_signs.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_signs_attempt_01.png", + "output": "references/ref_street_signs.png", + "mask": "references/sam_mask_street_signs.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 226.0, + 71.0, + 707.0, + 967.0 + ], + "mask_score": 3.475593, + "mask_area_ratio": 0.25818, + "elapsed_seconds": 9.8621 + } + }, + { + "name": "storefront_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", + "source_name": "storefront sign", + "source_description": "A dark sign with white text 'DELI' and a smaller red 'ATM' sign beneath it on the left. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking towards the camera, while emergency vehicles with flashing lights are visible in the background traffic.", + "sub_caption": "storefront sign: A dark, illuminated sign structure above a shop entrance.. Scene role: Hanging above a shop entrance on the left side of the street, illuminating the adjacent pedestrian.", + "measured_bbox": [ + 0.1052, + 0.4218, + 0.186, + 0.4781 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_storefront_sign.png", + "raw_ref_image": "references/raw_ref_storefront_sign_attempt_01.png", + "reference_verify": "references/reference_verify_storefront_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_storefront_sign_attempt_01.png", + "output": "references/ref_storefront_sign.png", + "mask": "references/sam_mask_storefront_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 107.0, + 274.0, + 900.0, + 749.0 + ], + "mask_score": 3.354337, + "mask_area_ratio": 0.167885, + "elapsed_seconds": 8.1782 + } + }, + { + "name": "parked_suv_right", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c54441e6-400c221e:object:4", + "source_name": "parked SUV", + "source_description": "Dark-colored SUV parked ahead of the sedan on the right side of the road. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with green traffic lights and parked cars on the right.", + "sub_caption": "parked SUV: Dark-colored SUV parked on the right side of the road.. Scene role: Parked parallel to the curb on the right side of the street.", + "measured_bbox": [ + 0.6057, + 0.5099, + 0.7451, + 0.6703 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_parked_suv_right.png", + "raw_ref_image": "references/raw_ref_parked_suv_right_attempt_01.png", + "reference_verify": "references/reference_verify_parked_suv_right.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_suv_right_attempt_01.png", + "output": "references/ref_parked_suv_right.png", + "mask": "references/sam_mask_parked_suv_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 52.0, + 216.0, + 993.0, + 835.0 + ], + "mask_score": 3.459027, + "mask_area_ratio": 0.361156, + "elapsed_seconds": 10.1865 + } + }, + { + "name": "dark_car_left", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", + "source_name": "car", + "source_description": "A dark car parked along the left curb further ahead. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a multi-lane road on a partly cloudy day, with a few other cars and pedestrians on the sidewalk.", + "sub_caption": "car: A dark car parked along the left curb further ahead.. Scene role: Parked alongside the left curb.", + "measured_bbox": [ + 0.2139, + 0.5323, + 0.3044, + 0.6201 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_car_left.png", + "raw_ref_image": "references/raw_ref_dark_car_left_attempt_01.png", + "reference_verify": "references/reference_verify_dark_car_left.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_car_left_attempt_01.png", + "output": "references/ref_dark_car_left.png", + "mask": "references/sam_mask_dark_car_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 42.0, + 237.0, + 982.0, + 794.0 + ], + "mask_score": 3.479099, + "mask_area_ratio": 0.30617, + "elapsed_seconds": 8.2274 + } + }, + { + "name": "dark_suv_driving", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c889c950-865ca5b6:object:0", + "source_name": "dark SUV", + "source_description": "A dark SUV traveling in the left lane, with visible red taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a highway with several cars visible ahead, illuminated mainly by taillights and headlights.", + "sub_caption": "dark SUV: A dark SUV traveling in the left lane, with visible glowing red taillights.. Scene role: Driving ahead of the camera vehicle in the opposing or adjacent left lane.", + "measured_bbox": [ + 0.3005, + 0.5101, + 0.4179, + 0.6508 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_suv_driving.png", + "raw_ref_image": "references/raw_ref_dark_suv_driving_attempt_01.png", + "reference_verify": "references/reference_verify_dark_suv_driving.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_suv_driving_attempt_01.png", + "output": "references/ref_dark_suv_driving.png", + "mask": "references/sam_mask_dark_suv_driving.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 96.0, + 294.0, + 928.0, + 812.0 + ], + "mask_score": 3.455576, + "mask_area_ratio": 0.251452, + "elapsed_seconds": 9.8494 + } + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2", + "source_name": "street light", + "source_description": "Tall pole with a bright light on top, illuminating the road from the right side. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane highway with streetlights illuminating the dark road ahead.", + "sub_caption": "street light: Tall pole with a bright light on top, illuminating the road.. Scene role: Casting warm light onto the street from the right-hand sidewalk.", + "measured_bbox": [ + 0.8171, + 0.1755, + 0.8719, + 0.2202 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 71.0, + 277.0, + 913.0, + 727.0 + ], + "mask_score": 3.350243, + "mask_area_ratio": 0.068855, + "elapsed_seconds": 8.2963 + } + }, + { + "name": "vehicle_dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9", + "source_name": "dashboard", + "source_description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential city street lined with trees, parked cars, and multi-story brick buildings.", + "sub_caption": "dashboard: The dark, reflective dashboard and lower windshield area of the camera vehicle.. Scene role: Occupying the bottom foreground of the image, establishing the perspective from inside the car.", + "measured_bbox": [ + 0.0, + 0.9261, + 1.0, + 1.0 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_vehicle_dashboard.png", + "raw_ref_image": "references/raw_ref_vehicle_dashboard_attempt_01.png", + "reference_verify": "references/reference_verify_vehicle_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_vehicle_dashboard_attempt_01.png", + "output": "references/ref_vehicle_dashboard.png", + "mask": "references/sam_mask_vehicle_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 24.0, + 317.0, + 1001.0, + 706.0 + ], + "mask_score": 2.942001, + "mask_area_ratio": 0.133658, + "elapsed_seconds": 8.3645 + } + }, + { + "name": "white_car_ahead", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", + "source_name": "white car", + "source_description": "A white car visible further down the road in the right lane. Source dataset: BDD100K. Scene context: Nighttime driving scene in a city with cars stopped in traffic and a construction site on the right.", + "sub_caption": "white car: A white car visible further down the road.. Scene role: Driving away in the right lane, further in the distance.", + "measured_bbox": [ + 0.4811, + 0.5382, + 0.5174, + 0.5915 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_car_ahead.png", + "raw_ref_image": "references/raw_ref_white_car_ahead_attempt_01.png", + "reference_verify": "references/reference_verify_white_car_ahead.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_white_car_ahead_attempt_01.png", + "output": "references/ref_white_car_ahead.png", + "mask": "references/sam_mask_white_car_ahead.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 112.0, + 168.0, + 912.0, + 855.0 + ], + "mask_score": 3.412999, + "mask_area_ratio": 0.338258, + "elapsed_seconds": 8.3339 + } + }, + { + "name": "double_yellow_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c417a291-7802692d:object:8", + "source_name": "yellow lines", + "source_description": "Double yellow painted lines separating opposite directions of traffic. Source dataset: BDD100K. Scene context: A pedestrian crossing a street at a crosswalk with several cars stopped around them, with storefronts visible in the background.", + "sub_caption": "yellow lines: Double yellow painted lines separating opposite directions of traffic.. Scene role: Running down the center of the road, receding into the distance.", + "measured_bbox": [ + 0.3008, + 0.5732, + 0.4776, + 0.8029 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_double_yellow_lines.png", + "raw_ref_image": "references/raw_ref_double_yellow_lines_attempt_01.png", + "reference_verify": "references/reference_verify_double_yellow_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_double_yellow_lines_attempt_01.png", + "output": "references/ref_double_yellow_lines.png", + "mask": "references/sam_mask_double_yellow_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 0.0, + 1023.0, + 1023.0 + ], + "mask_score": 2.141169, + "mask_area_ratio": 0.667065, + "elapsed_seconds": 8.2719 + } + }, + { + "name": "street_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c4891df0-24371ae1:object:3", + "source_name": "trees", + "source_description": "Numerous trees with dense green and yellowish foliage lining both sides of the road. Source dataset: BDD100K. Scene context: A dark SUV drives ahead on a wet road lined with trees and a fence under a cloudy sky.", + "sub_caption": "trees: Numerous trees with dense foliage lining both sides of the road.. Scene role: Planted along the sidewalks, softening the urban environment and framing the street.", + "measured_bbox": [ + 0.001, + 0.002, + 0.375, + 0.63 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_trees.png", + "raw_ref_image": "references/raw_ref_street_trees_attempt_01.png", + "reference_verify": "references/reference_verify_street_trees.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_trees_attempt_01.png", + "output": "references/ref_street_trees.png", + "mask": "references/sam_mask_street_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 65.0, + 64.0, + 958.0, + 969.0 + ], + "mask_score": 3.478968, + "mask_area_ratio": 0.365667, + "elapsed_seconds": 8.231 + } + }, + { + "name": "twilight_sky", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9", + "source_name": "sky", + "source_description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top. Source dataset: BDD100K. Scene context: View from a moving vehicle looking down a multi-lane city street lined with parked cars and large trees under a clear twilight sky.", + "sub_caption": "sky: Clear twilight sky transitioning from bright near the horizon to dark blue at the top.. Scene role: Providing the atmospheric backdrop above the buildings and street.", + "measured_bbox": [ + 0.116, + 0.0, + 0.714, + 0.4742 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_twilight_sky.png", + "raw_ref_image": "references/raw_ref_twilight_sky_attempt_01.png", + "reference_verify": "references/reference_verify_twilight_sky.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_twilight_sky_attempt_01.png", + "output": "references/ref_twilight_sky.png", + "mask": "references/sam_mask_twilight_sky.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 208.0, + 1023.0, + 814.0 + ], + "mask_score": 2.437955, + "mask_area_ratio": 0.529621, + "elapsed_seconds": 9.8292 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000003", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 3, + "n_detected": 3, + "n_subjects": 3, + "subjects": [ + { + "name": "shopper", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing on the top level, wearing a dark top and dark pants. Source dataset: CrowdHuman. Scene context: A multi-level outdoor shopping mall with various people walking and shopping.", + "sub_caption": "shopper: A person standing and waiting, wearing a dark top and dark pants. Scene role: waiting at the crosswalk curb", + "measured_bbox": [ + 0.7364, + 0.2825, + 0.8267, + 0.7222 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_shopper.png", + "raw_ref_image": "references/raw_ref_shopper_attempt_01.png", + "reference_verify": "references/reference_verify_shopper.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_shopper_attempt_01.png", + "output": "references/ref_shopper.png", + "mask": "references/sam_mask_shopper.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 348.0, + 80.0, + 678.0, + 995.0 + ], + "mask_score": 3.467753, + "mask_area_ratio": 0.132874, + "elapsed_seconds": 49.4008 + } + }, + { + "name": "black_sedan", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7", + "source_name": "black sedan", + "source_description": "A black sedan driving in the right lane ahead of the red hatchback. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a multi-lane city street flanked by tall buildings, with various cars moving in the same direction and parked along the side.", + "sub_caption": "black sedan: A dark, modern black sedan. Scene role: driving in the nearest lane on the street", + "measured_bbox": [ + 0.0883, + 0.2514, + 0.5002, + 0.449 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_black_sedan.png", + "raw_ref_image": "references/raw_ref_black_sedan_attempt_01.png", + "reference_verify": "references/reference_verify_black_sedan.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_black_sedan_attempt_01.png", + "output": "references/ref_black_sedan.png", + "mask": "references/sam_mask_black_sedan.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 321.0, + 1023.0, + 700.0 + ], + "mask_score": 2.52477, + "mask_area_ratio": 0.559944, + "elapsed_seconds": 8.5091 + } + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2", + "source_name": "silver car", + "source_description": "A silver car driving in the adjacent right lane, slightly ahead of the white car. Source dataset: BDD100K. Scene context: View from inside a vehicle driving in heavy traffic on a multi-lane road under a clear sky.", + "sub_caption": "silver car: A sleek silver car. Scene role: driving in the adjacent lane slightly ahead of the black sedan", + "measured_bbox": [ + 0.3669, + 0.2463, + 0.7048, + 0.409 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 369.0, + 1006.0, + 693.0 + ], + "mask_score": 3.457475, + "mask_area_ratio": 0.178123, + "elapsed_seconds": 9.7472 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000004", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 5, + "n_detected": 5, + "n_subjects": 5, + "subjects": [ + { + "name": "pedestrian_walker", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", + "source_name": "walker", + "source_description": "A man walking, wearing a blue hoodie with 'EMO' written on it, grey pants, and dark shoes. Source dataset: CrowdHuman. Scene context: A group of people, some wearing matching white t-shirts, are walking and jogging along a paved path next to a road, with a grey SUV parked on a grassy hill in the background.", + "sub_caption": "walker: A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.. Scene role: Walking near the crosswalk on the side of the street.", + "measured_bbox": [ + 0.5948, + 0.3939, + 0.6378, + 0.5698 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walker.png", + "raw_ref_image": "references/raw_ref_pedestrian_walker_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walker.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_pedestrian_walker_attempt_01.png", + "output": "references/ref_pedestrian_walker.png", + "mask": "references/sam_mask_pedestrian_walker.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 334.0, + 56.0, + 706.0, + 996.0 + ], + "mask_score": 3.43302, + "mask_area_ratio": 0.160827, + "elapsed_seconds": 9.8914 + } + }, + { + "name": "red_traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b6df605f-51c158b8:object:6", + "source_name": "traffic light", + "source_description": "A set of traffic lights suspended over the intersection, showing a red light. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a city street on a sunny day with a U-Haul truck in the opposite lane, parked cars along the right curb, and pedestrians crossing a crosswalk.", + "sub_caption": "traffic light: A traffic signal suspended over the intersection, illuminated with a bright red light.. Scene role: Hanging high above the center of the intersection in the driver's line of sight.", + "measured_bbox": [ + 0.4668, + 0.0722, + 0.5093, + 0.1896 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_red_traffic_light.png", + "raw_ref_image": "references/raw_ref_red_traffic_light_attempt_01.png", + "reference_verify": "references/reference_verify_red_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_red_traffic_light_attempt_01.png", + "output": "references/ref_red_traffic_light.png", + "mask": "references/sam_mask_red_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 286.0, + 103.0, + 1023.0, + 893.0 + ], + "mask_score": 3.25218, + "mask_area_ratio": 0.200515, + "elapsed_seconds": 8.1927 + } + }, + { + "name": "plain_delivery_truck", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", + "source_name": "delivery truck", + "source_description": "A large white box truck parked behind the fence on the right, with visible green and purple logos. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a wide, paved road flanked by an industrial area with fences and delivery trucks under a cloudy sky.", + "sub_caption": "delivery truck: A large, plain white box truck without any visible markings or graphics.. Scene role: Parked alongside the right edge of the street curb.", + "measured_bbox": [ + 0.6504, + 0.2022, + 0.966, + 0.6212 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_plain_delivery_truck.png", + "raw_ref_image": "references/raw_ref_plain_delivery_truck_attempt_01.png", + "reference_verify": "references/reference_verify_plain_delivery_truck.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_plain_delivery_truck_attempt_01.png", + "output": "references/ref_plain_delivery_truck.png", + "mask": "references/sam_mask_plain_delivery_truck.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 9.0, + 166.0, + 1017.0, + 852.0 + ], + "mask_score": 3.45107, + "mask_area_ratio": 0.437578, + "elapsed_seconds": 10.0386 + } + }, + { + "name": "dark_parked_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5", + "source_name": "dark parked car", + "source_description": "Another dark-colored car parked on the right side of the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving down a multi-lane city street with parked cars on the right and oncoming traffic on the left.", + "sub_caption": "dark parked car: A dark-colored passenger vehicle.. Scene role: Parked parallel to the curb directly behind the delivery truck.", + "measured_bbox": [ + 0.8339, + 0.4566, + 0.9965, + 0.7781 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_parked_car.png", + "raw_ref_image": "references/raw_ref_dark_parked_car_attempt_01.png", + "reference_verify": "references/reference_verify_dark_parked_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_dark_parked_car_attempt_01.png", + "output": "references/ref_dark_parked_car.png", + "mask": "references/sam_mask_dark_parked_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 301.0, + 1023.0, + 694.0 + ], + "mask_score": 3.113868, + "mask_area_ratio": 0.207836, + "elapsed_seconds": 8.5697 + } + }, + { + "name": "street_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", + "source_name": "street lines", + "source_description": "Double yellow lines separating traffic directions and white painted lines indicating lanes and crosswalks. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching an intersection underneath an elevated railway structure.", + "sub_caption": "street lines: Double yellow center lines separating traffic directions, and solid white painted lines forming a distinct crosswalk.. Scene role: Painted on the asphalt, guiding traffic and defining the pedestrian crossing area in the foreground.", + "measured_bbox": [ + 0.003, + 0.432, + 0.971, + 0.794 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_lines.png", + "raw_ref_image": "references/raw_ref_street_lines_attempt_01.png", + "reference_verify": "references/reference_verify_street_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_street_lines_attempt_01.png", + "output": "references/ref_street_lines.png", + "mask": "references/sam_mask_street_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 384.0, + 98.0, + 639.0, + 925.0 + ], + "mask_score": 3.44596, + "mask_area_ratio": 0.067441, + "elapsed_seconds": 8.1646 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000005", + "target_total": 6, + "target_people": 3, + "target_objects": 3, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 6, + "n_detected": 6, + "n_subjects": 6, + "subjects": [ + { + "name": "person_yellow_top", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_24/282555,49a4e000d44469c6.jpg:person:49", + "source_name": "person", + "source_description": "Standing, wearing a bright yellow top Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered in front of the Louvre museum and its iconic glass pyramid on a sunny day.", + "sub_caption": "person: Standing, wearing a bright yellow top. Scene role: Crossing the street on the crosswalk in front of the stopped silver car", + "measured_bbox": [ + 0.5309, + 0.4516, + 0.5607, + 0.6301 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_person_yellow_top.png", + "raw_ref_image": "references/raw_ref_person_yellow_top_attempt_01.png", + "reference_verify": "references/reference_verify_person_yellow_top.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_person_yellow_top_attempt_01.png", + "output": "references/ref_person_yellow_top.png", + "mask": "references/sam_mask_person_yellow_top.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 329.0, + 42.0, + 701.0, + 1012.0 + ], + "mask_score": 3.348943, + "mask_area_ratio": 0.150169, + "elapsed_seconds": 8.25 + } + }, + { + "name": "man_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", + "source_name": "crowd member", + "source_description": "A person far right in the background wearing a suit. Source dataset: CrowdHuman. Scene context: A crowd of people gathers in front of an old stone building with a prominent arched doorway and two large animal statues.", + "sub_caption": "crowd member: A person wearing a professional suit.. Scene role: Walking alongside the other pedestrians across the crosswalk", + "measured_bbox": [ + 0.5767, + 0.4388, + 0.6397, + 0.6278 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_man_in_suit.png", + "raw_ref_image": "references/raw_ref_man_in_suit_attempt_01.png", + "reference_verify": "references/reference_verify_man_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_man_in_suit_attempt_01.png", + "output": "references/ref_man_in_suit.png", + "mask": "references/sam_mask_man_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 351.0, + 27.0, + 671.0, + 1004.0 + ], + "mask_score": 3.48496, + "mask_area_ratio": 0.144686, + "elapsed_seconds": 9.7885 + } + }, + { + "name": "young_girl", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_51/282555,57b9f000182048e6.jpg:person:13", + "source_name": "pedestrian", + "source_description": "Young girl with brown hair, wearing a blue patterned top. Source dataset: CrowdHuman. Scene context: A sunny outdoor scene featuring the red entrance arch to Navy Pier Beer Garden and a tall brick tower, with a diverse crowd of people walking along the waterfront promenade.", + "sub_caption": "pedestrian: Young girl with brown hair, wearing a blue patterned top.. Scene role: Walking across the intersection near the person in the yellow top", + "measured_bbox": [ + 0.6354, + 0.4889, + 0.6677, + 0.6337 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_young_girl.png", + "raw_ref_image": "references/raw_ref_young_girl_attempt_01.png", + "reference_verify": "references/reference_verify_young_girl.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_young_girl_attempt_01.png", + "output": "references/ref_young_girl.png", + "mask": "references/sam_mask_young_girl.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 369.0, + 52.0, + 661.0, + 1003.0 + ], + "mask_score": 3.482282, + "mask_area_ratio": 0.133298, + "elapsed_seconds": 8.3216 + } + }, + { + "name": "traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", + "source_name": "traffic light", + "source_description": "A black multi-lens traffic light fixture mounted on a pole above the street. Source dataset: CrowdHuman. Scene context: A male tour guide is speaking to a group of people standing on a city sidewalk next to a road crossing.", + "sub_caption": "traffic light: A black multi-lens traffic light fixture mounted on a pole above the street.. Scene role: Suspended over the intersection, showing a red light to halt the vehicles", + "measured_bbox": [ + 0.5513, + 0.0408, + 0.6462, + 0.1518 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_traffic_light.png", + "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", + "reference_verify": "references/reference_verify_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_traffic_light_attempt_01.png", + "output": "references/ref_traffic_light.png", + "mask": "references/sam_mask_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 113.0, + 201.0, + 923.0, + 826.0 + ], + "mask_score": 3.467034, + "mask_area_ratio": 0.289252, + "elapsed_seconds": 9.874 + } + }, + { + "name": "street_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_37/282555,a4aef000d9c10933.jpg:object:3", + "source_name": "trees", + "source_description": "Various green trees and shrubs lining the pathway and visible in the background gardens. Source dataset: CrowdHuman. Scene context: A large crowd of tourists walks along the pathway towards the Taj Mahal on a clear day.", + "sub_caption": "trees: Various green trees and shrubs lining the pathway and visible in the background gardens.. Scene role: Planted along the sidewalks on both sides of the street, providing urban greenery", + "measured_bbox": [ + 0.542, + 0.2363, + 0.636, + 0.493 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_trees.png", + "raw_ref_image": "references/raw_ref_street_trees_attempt_03.png", + "reference_verify": "references/reference_verify_street_trees.json", + "reference_verify_passed": true, + "reference_attempts": 3, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_street_trees_attempt_03.png", + "output": "references/ref_street_trees.png", + "mask": "references/sam_mask_street_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 22.0, + 60.0, + 1003.0, + 968.0 + ], + "mask_score": 3.301958, + "mask_area_ratio": 0.393952, + "elapsed_seconds": 8.2223 + } + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be3d3a81-326a032d:object:0", + "source_name": "silver car", + "source_description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on. Source dataset: BDD100K. Scene context: A rainy street scene showing cars waiting at an intersection surrounded by tall buildings and urban infrastructure.", + "sub_caption": "silver car: A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on.. Scene role: Stopped in the traffic lane in the foreground, waiting for the pedestrians to cross", + "measured_bbox": [ + 0.3062, + 0.4281, + 0.5436, + 0.7674 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 80.0, + 167.0, + 957.0, + 937.0 + ], + "mask_score": 3.434142, + "mask_area_ratio": 0.414005, + "elapsed_seconds": 8.3073 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000006", + "target_total": 8, + "target_people": 1, + "target_objects": 7, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 8, + "n_detected": 8, + "n_subjects": 8, + "subjects": [ + { + "name": "pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", + "source_name": "pedestrian", + "source_description": "Another person in the distant background near the green structure. Source dataset: CrowdHuman. Scene context: A large crowd of people walking across a street with trees, streetlamps, and classic architecture in the background.", + "sub_caption": "pedestrian: A person walking across the street in the distance.. Scene role: Crossing the street near the left background.", + "measured_bbox": [ + 0.2151, + 0.4819, + 0.2507, + 0.5947 + ], + "detection_confidence": "high", + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian.png", + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_pedestrian_attempt_01.png", + "output": "references/ref_pedestrian.png", + "mask": "references/sam_mask_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 353.0, + 44.0, + 672.0, + 1013.0 + ], + "mask_score": 3.456561, + "mask_area_ratio": 0.147466, + "elapsed_seconds": 8.2841 + } + }, + { + "name": "emergency_vehicle", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b99f250d-886111c5:object:5", + "source_name": "vehicle", + "source_description": "A dark-colored vehicle partially visible in the left background with blue emergency lights flashing. Source dataset: BDD100K. Scene context: A nighttime city street intersection showing a crosswalk, construction barriers, and illuminated traffic signals.", + "sub_caption": "vehicle: A dark-colored vehicle with blue emergency lights flashing.. Scene role: Stopped or parked in the distant left background.", + "measured_bbox": [ + 0.325, + 0.4787, + 0.3786, + 0.5486 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_emergency_vehicle.png", + "raw_ref_image": "references/raw_ref_emergency_vehicle_attempt_01.png", + "reference_verify": "references/reference_verify_emergency_vehicle.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_emergency_vehicle_attempt_01.png", + "output": "references/ref_emergency_vehicle.png", + "mask": "references/sam_mask_emergency_vehicle.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 261.0, + 1023.0, + 782.0 + ], + "mask_score": 3.339466, + "mask_area_ratio": 0.300308, + "elapsed_seconds": 8.2719 + } + }, + { + "name": "curbside_trash_can", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_10/273275,5da3d000f5358c3c.jpg:object:9", + "source_name": "trash can", + "source_description": "dark blue cylindrical bin partially visible in the foreground Source dataset: CrowdHuman. Scene context: A group of children and an adult pose for a photo in front of a roller coaster at an amusement park.", + "sub_caption": "trash can: A dark blue cylindrical bin.. Scene role: Placed on the sidewalk curb in the lower right foreground.", + "measured_bbox": [ + 0.8371, + 0.5448, + 0.9204, + 0.7599 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_curbside_trash_can.png", + "raw_ref_image": "references/raw_ref_curbside_trash_can_attempt_01.png", + "reference_verify": "references/reference_verify_curbside_trash_can.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_curbside_trash_can_attempt_01.png", + "output": "references/ref_curbside_trash_can.png", + "mask": "references/sam_mask_curbside_trash_can.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 175.0, + 42.0, + 837.0, + 982.0 + ], + "mask_score": 3.480803, + "mask_area_ratio": 0.406976, + "elapsed_seconds": 8.7724 + } + }, + { + "name": "museum_banner", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_24/282555,14d900042c7b9a4.jpg:object:3", + "source_name": "exhibition banner", + "source_description": "A large, rectangular banner hanging on the building's facade, featuring a maroon background and text. Source dataset: CrowdHuman. Scene context: A large, classical building, likely a museum, with many people sitting and standing on its wide front steps, while a yellow taxi speeds past in the foreground.", + "sub_caption": "exhibition banner: A large, rectangular maroon banner hanging on a building's facade.. Scene role: Hanging from the classical architecture on the right side of the street.", + "measured_bbox": [ + 0.8013, + 0.0164, + 0.8543, + 0.2771 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_museum_banner.png", + "raw_ref_image": "references/raw_ref_museum_banner_attempt_01.png", + "reference_verify": "references/reference_verify_museum_banner.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_museum_banner_attempt_01.png", + "output": "references/ref_museum_banner.png", + "mask": "references/sam_mask_museum_banner.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 317.0, + 20.0, + 703.0, + 981.0 + ], + "mask_score": 3.453619, + "mask_area_ratio": 0.268547, + "elapsed_seconds": 8.1747 + } + }, + { + "name": "white_panel_van", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5047c50-e1facff6:object:2", + "source_name": "white van", + "source_description": "A large white panel van with red taillights illuminated, driving in the right lane ahead. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a multi-lane city street on a sunny day with moderate traffic.", + "sub_caption": "white van: A large white panel van with illuminated red taillights.. Scene role: Driving ahead in the right lane of the road.", + "measured_bbox": [ + 0.5122, + 0.3306, + 0.6867, + 0.6936 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_panel_van.png", + "raw_ref_image": "references/raw_ref_white_panel_van_attempt_01.png", + "reference_verify": "references/reference_verify_white_panel_van.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_white_panel_van_attempt_01.png", + "output": "references/ref_white_panel_van.png", + "mask": "references/sam_mask_white_panel_van.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 102.0, + 91.0, + 877.0, + 937.0 + ], + "mask_score": 3.457159, + "mask_area_ratio": 0.429852, + "elapsed_seconds": 10.1474 + } + }, + { + "name": "double_solid_white_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bc886d37-5b22c313:object:7", + "source_name": "double solid white line", + "source_description": "Two continuous white painted lines on the dark asphalt road surface, separating the two lanes of traffic. Source dataset: BDD100K. Scene context: A view from inside a car driving through a brightly lit tunnel with tiled walls, following a silver SUV and a dark compact car.", + "sub_caption": "double solid white line: Two continuous white painted lines on dark asphalt.. Scene role: Dividing the traffic lanes down the center of the street.", + "measured_bbox": [ + 0.2383, + 0.6095, + 0.4221, + 0.8925 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_double_solid_white_line.png", + "raw_ref_image": "references/raw_ref_double_solid_white_line_attempt_01.png", + "reference_verify": "references/reference_verify_double_solid_white_line.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_double_solid_white_line_attempt_01.png", + "output": "references/ref_double_solid_white_line.png", + "mask": "references/sam_mask_double_solid_white_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 229.0, + 72.0, + 795.0, + 951.0 + ], + "mask_score": 3.470715, + "mask_area_ratio": 0.39155, + "elapsed_seconds": 9.6388 + } + }, + { + "name": "blooming_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_26/283991,91fa000cda68156.jpg:object:5", + "source_name": "blooming trees", + "source_description": "Trees with pink blossoms in the background park area. Source dataset: CrowdHuman. Scene context: People are walking along a sidewalk next to a street lined with trees, some in bloom, with a police officer standing near a parked car.", + "sub_caption": "blooming trees: Trees featuring vibrant pink blossoms.. Scene role: Lining the sidewalk and park area on the right side of the street.", + "measured_bbox": [ + 0.6279, + 0.2153, + 0.8163, + 0.5163 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_blooming_trees.png", + "raw_ref_image": "references/raw_ref_blooming_trees_attempt_01.png", + "reference_verify": "references/reference_verify_blooming_trees.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_blooming_trees_attempt_01.png", + "output": "references/ref_blooming_trees.png", + "mask": "references/sam_mask_blooming_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 93.0, + 1023.0, + 967.0 + ], + "mask_score": 3.479366, + "mask_area_ratio": 0.423422, + "elapsed_seconds": 8.1597 + } + }, + { + "name": "green_street_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", + "source_name": "street sign", + "source_description": "A green street sign with white text visible on the left side. Source dataset: CrowdHuman. Scene context: Several pedestrians are crossing a street at a crosswalk on a sunny day.", + "sub_caption": "street sign: A green rectangular street sign.. Scene role: Mounted on a pole on the left side of the street near the crosswalk.", + "measured_bbox": [ + 0.1915, + 0.2239, + 0.2775, + 0.2533 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_green_street_sign.png", + "raw_ref_image": "references/raw_ref_green_street_sign_attempt_01.png", + "reference_verify": "references/reference_verify_green_street_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_green_street_sign_attempt_01.png", + "output": "references/ref_green_street_sign.png", + "mask": "references/sam_mask_green_street_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 26.0, + 378.0, + 998.0, + 645.0 + ], + "mask_score": 3.473778, + "mask_area_ratio": 0.191363, + "elapsed_seconds": 9.6707 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000008", + "target_total": 10, + "target_people": 8, + "target_objects": 2, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 10, + "n_detected": 10, + "n_subjects": 10, + "subjects": [ + { + "name": "passenger", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", + "source_name": "passenger", + "source_description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people waiting at a train platform next to a stationary train.", + "sub_caption": "passenger: A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera.. Scene role: Walking away on the sidewalk ahead.", + "measured_bbox": [ + 0.8938, + 0.1807, + 0.9951, + 0.7157 + ], + "detection_confidence": "high", + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_passenger.png", + "raw_ref_image": "references/raw_ref_passenger_attempt_01.png", + "reference_verify": "references/reference_verify_passenger.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_passenger_attempt_01.png", + "output": "references/ref_passenger.png", + "mask": "references/sam_mask_passenger.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 31.0, + 682.0, + 985.0 + ], + "mask_score": 3.454991, + "mask_area_ratio": 0.146239, + "elapsed_seconds": 8.0907 + } + }, + { + "name": "shopper_waiting", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", + "source_name": "shopper", + "source_description": "A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag. Source dataset: CrowdHuman. Scene context: A bustling cobblestone street in a town lined with brick buildings, decorated with colorful bunting flags, and populated with pedestrians walking, shopping, and dining outdoors.", + "sub_caption": "shopper: A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag.. Scene role: Standing on the corner curb, waiting to cross the street.", + "measured_bbox": [ + 0.632, + 0.1691, + 0.7153, + 0.6522 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_shopper_waiting.png", + "raw_ref_image": "references/raw_ref_shopper_waiting_attempt_01.png", + "reference_verify": "references/reference_verify_shopper_waiting.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_shopper_waiting_attempt_01.png", + "output": "references/ref_shopper_waiting.png", + "mask": "references/sam_mask_shopper_waiting.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 365.0, + 69.0, + 720.0, + 1006.0 + ], + "mask_score": 3.169183, + "mask_area_ratio": 0.111197, + "elapsed_seconds": 8.0622 + } + }, + { + "name": "shopper_standing", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing, wearing a blue jacket and blue jeans. Source dataset: CrowdHuman. Scene context: A bustling street market with many pedestrians browsing stalls and goods displayed on the ground.", + "sub_caption": "shopper: A person standing, wearing a blue jacket and blue jeans.. Scene role: Standing near the crosswalk edge amidst the crowd.", + "measured_bbox": [ + 0.5209, + 0.1793, + 0.5735, + 0.5325 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_shopper_standing.png", + "raw_ref_image": "references/raw_ref_shopper_standing_attempt_01.png", + "reference_verify": "references/reference_verify_shopper_standing.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_shopper_standing_attempt_01.png", + "output": "references/ref_shopper_standing.png", + "mask": "references/sam_mask_shopper_standing.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 326.0, + 43.0, + 698.0, + 998.0 + ], + "mask_score": 3.440171, + "mask_area_ratio": 0.161293, + "elapsed_seconds": 8.0811 + } + }, + { + "name": "protester", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", + "source_name": "protester holding sign in back", + "source_description": "A person walking in the background on the right, holding up a large white sign. Source dataset: CrowdHuman. Scene context: A large crowd of people, possibly students, are walking down a city street, some holding signs, suggesting a protest or march.", + "sub_caption": "protester holding sign in back: A person walking in the background on the right, holding up a large white sign.. Scene role: Walking further down the right sidewalk carrying a sign.", + "measured_bbox": [ + 0.8193, + 0.1216, + 0.8875, + 0.4511 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_protester.png", + "raw_ref_image": "references/raw_ref_protester_attempt_01.png", + "reference_verify": "references/reference_verify_protester.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_protester_attempt_01.png", + "output": "references/ref_protester.png", + "mask": "references/sam_mask_protester.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 337.0, + 19.0, + 694.0, + 1013.0 + ], + "mask_score": 3.465365, + "mask_area_ratio": 0.161731, + "elapsed_seconds": 8.3408 + } + }, + { + "name": "pedestrian_crossing", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", + "source_name": "pedestrian", + "source_description": "A person in a white top and dark pants, walking towards the right. Source dataset: CrowdHuman. Scene context: A bustling city street scene with a mix of old brick buildings and modern glass structures, connected by an elevated glass walkway, with a large crowd of pedestrians moving along the wide sidewalk.", + "sub_caption": "pedestrian: A person in a white top and dark pants, walking towards the right.. Scene role: Actively walking across the crosswalk in front of the vehicle.", + "measured_bbox": [ + 0.2322, + 0.1993, + 0.3165, + 0.4965 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_crossing.png", + "raw_ref_image": "references/raw_ref_pedestrian_crossing_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_crossing.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_crossing_attempt_01.png", + "output": "references/ref_pedestrian_crossing.png", + "mask": "references/sam_mask_pedestrian_crossing.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 355.0, + 20.0, + 674.0, + 1012.0 + ], + "mask_score": 3.482863, + "mask_area_ratio": 0.15384, + "elapsed_seconds": 8.0791 + } + }, + { + "name": "pedestrian_walking_away", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", + "source_name": "pedestrian", + "source_description": "A person wearing a white top and dark pants, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people walking on a city street in front of a grand, multi-story commercial building with signs for Virgin Megastore and Monoprix.", + "sub_caption": "pedestrian: A person wearing a white top and dark pants, walking away from the camera.. Scene role: Walking away on the left side of the street.", + "measured_bbox": [ + 0.013, + 0.2139, + 0.0908, + 0.494 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walking_away.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walking_away.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_walking_away_attempt_01.png", + "output": "references/ref_pedestrian_walking_away.png", + "mask": "references/sam_mask_pedestrian_walking_away.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 350.0, + 50.0, + 671.0, + 987.0 + ], + "mask_score": 3.476273, + "mask_area_ratio": 0.142721, + "elapsed_seconds": 8.2428 + } + }, + { + "name": "young_man", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", + "source_name": "young man", + "source_description": "Standing back row, wearing a dark blue hoodie. Source dataset: CrowdHuman. Scene context: A group of young people, mostly wearing college apparel, are posing for a photograph on bleachers in front of a red building.", + "sub_caption": "young man: Standing back row, wearing a dark blue hoodie.. Scene role: Waiting in the crowd at the corner intersection.", + "measured_bbox": [ + 0.5568, + 0.1246, + 0.6032, + 0.5033 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_young_man.png", + "raw_ref_image": "references/raw_ref_young_man_attempt_01.png", + "reference_verify": "references/reference_verify_young_man.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_young_man_attempt_01.png", + "output": "references/ref_young_man.png", + "mask": "references/sam_mask_young_man.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 352.0, + 65.0, + 671.0, + 928.0 + ], + "mask_score": 3.483394, + "mask_area_ratio": 0.132506, + "elapsed_seconds": 8.2271 + } + }, + { + "name": "businessman", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", + "source_name": "adult in dark suit", + "source_description": "Standing in back rows, wearing dark suit and tie. Source dataset: CrowdHuman. Scene context: A large group of people gathered for a celebration, many wearing bright traditional Chinese costumes, holding props like fans and a lion dance head, posing for a group photo behind a banner.", + "sub_caption": "adult in dark suit: Standing in back rows, wearing dark suit and tie.. Scene role: Standing on the sidewalk behind other pedestrians.", + "measured_bbox": [ + 0.5976, + 0.1322, + 0.6413, + 0.4385 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_businessman.png", + "raw_ref_image": "references/raw_ref_businessman_attempt_01.png", + "reference_verify": "references/reference_verify_businessman.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_businessman_attempt_01.png", + "output": "references/ref_businessman.png", + "mask": "references/sam_mask_businessman.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 336.0, + 19.0, + 690.0, + 1013.0 + ], + "mask_score": 2.970801, + "mask_area_ratio": 0.135565, + "elapsed_seconds": 8.2448 + } + }, + { + "name": "street_lamp", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", + "source_name": "street lamp", + "source_description": "A tall street lamp pole partially visible on the right side of the street. Source dataset: CrowdHuman. Scene context: A crowded city street at night, illuminated by numerous bright, colorful neon signs and shop displays.", + "sub_caption": "street lamp: A tall street lamp pole partially visible on the right side of the street.. Scene role: Towering above the right sidewalk corner, serving as city infrastructure.", + "measured_bbox": [ + 0.014, + 0.03, + 0.109, + 0.254 + ], + "detection_confidence": 0.8, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_lamp.png", + "raw_ref_image": "references/raw_ref_street_lamp_attempt_01.png", + "reference_verify": "references/reference_verify_street_lamp.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_street_lamp_attempt_01.png", + "output": "references/ref_street_lamp.png", + "mask": "references/sam_mask_street_lamp.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 257.0, + 5.0, + 840.0, + 1019.0 + ], + "mask_score": 3.134794, + "mask_area_ratio": 0.049316, + "elapsed_seconds": 8.2643 + } + }, + { + "name": "dashboard_reflection", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c24e7c72-f53291ea:object:8", + "source_name": "vehicle dashboard reflection", + "source_description": "A reflection on the windshield showing the interior dashboard and a document or object with large blue text. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a city street on an overcast day, passing a large brick building on the right.", + "sub_caption": "vehicle dashboard reflection: A reflection on the windshield showing the interior dashboard and a document or object with large blue text.. Scene role: Visible along the bottom edge of the frame, establishing the camera's perspective from inside a car.", + "measured_bbox": [ + 0.143, + 0.6854, + 0.461, + 0.8934 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dashboard_reflection.png", + "raw_ref_image": "references/raw_ref_dashboard_reflection_attempt_01.png", + "reference_verify": "references/reference_verify_dashboard_reflection.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_dashboard_reflection_attempt_01.png", + "output": "references/ref_dashboard_reflection.png", + "mask": "references/sam_mask_dashboard_reflection.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 336.0, + 1023.0, + 1023.0 + ], + "mask_score": 1.211741, + "mask_area_ratio": 0.687541, + "elapsed_seconds": 9.9949 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000009", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 5, + "n_detected": 5, + "n_subjects": 5, + "subjects": [ + { + "name": "pedestrian_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", + "source_name": "pedestrian in suit", + "source_description": "A person walking, wearing a dark suit. Source dataset: CrowdHuman. Scene context: A crowd of tourists gathers outside a grand palace, with some sitting on a stone balustrade and others walking along a wide paved path.", + "sub_caption": "pedestrian in suit: A person walking, wearing a dark suit.. Scene role: walking along the right side of the street on the sidewalk", + "measured_bbox": [ + 0.7498, + 0.407, + 0.8062, + 0.6382 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_in_suit.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_suit_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_pedestrian_in_suit_attempt_01.png", + "output": "references/ref_pedestrian_in_suit.png", + "mask": "references/sam_mask_pedestrian_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 347.0, + 40.0, + 695.0, + 1018.0 + ], + "mask_score": 3.473173, + "mask_area_ratio": 0.152202, + "elapsed_seconds": 9.5571 + } + }, + { + "name": "yellow_building", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_32/273275,7595c000fa55d9d7.jpg:object:5", + "source_name": "yellow building", + "source_description": "A multi-story building with a yellow ochre facade and numerous shuttered windows visible in the background on the far left. Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered around a monumental, ornate stone fountain with large statues and cascading water, situated in a city square surrounded by buildings.", + "sub_caption": "yellow building: A multi-story building with a yellow ochre facade and numerous shuttered windows.. Scene role: providing a backdrop on the right side of the street scene", + "measured_bbox": [ + 0.6651, + 0.0, + 0.9968, + 0.6296 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_yellow_building.png", + "raw_ref_image": "references/raw_ref_yellow_building_attempt_01.png", + "reference_verify": "references/reference_verify_yellow_building.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_building_attempt_01.png", + "output": "references/ref_yellow_building.png", + "mask": "references/sam_mask_yellow_building.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 4.0, + 0.0, + 1023.0, + 995.0 + ], + "mask_score": 2.131685, + "mask_area_ratio": 0.742735, + "elapsed_seconds": 8.5184 + } + }, + { + "name": "yellow_lane_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c9195e43-b2fdd978:object:2", + "source_name": "yellow lane line", + "source_description": "A solid yellow line painted on the road surface indicating the edge of the lane. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a highway, with a white car ahead in the next lane and lush green bushes along the right side barrier.", + "sub_caption": "yellow lane line: A solid yellow line painted on the road surface indicating the edge of the lane.. Scene role: extending along the road surface towards the distance", + "measured_bbox": [ + 0.4487, + 0.5308, + 0.5367, + 1.0 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_yellow_lane_line.png", + "raw_ref_image": "references/raw_ref_yellow_lane_line_attempt_01.png", + "reference_verify": "references/reference_verify_yellow_lane_line.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_lane_line_attempt_01.png", + "output": "references/ref_yellow_lane_line.png", + "mask": "references/sam_mask_yellow_lane_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 12.0, + 55.0, + 1018.0, + 969.0 + ], + "mask_score": 0.925602, + "mask_area_ratio": 0.952688, + "elapsed_seconds": 8.3819 + } + }, + { + "name": "overpass", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd32109-8b0b1eef:object:6", + "source_name": "overpass", + "source_description": "A concrete bridge structure spanning across the highway ahead, casting a shadow over the road. Source dataset: BDD100K. Scene context: View from a moving vehicle on a multi-lane highway, approaching an underpass with several cars ahead.", + "sub_caption": "overpass: A concrete bridge structure spanning across the street ahead.. Scene role: arching over the road in the midground", + "measured_bbox": [ + 0.0031, + 0.1925, + 0.6919, + 0.5364 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_overpass.png", + "raw_ref_image": "references/raw_ref_overpass_attempt_01.png", + "reference_verify": "references/reference_verify_overpass.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_overpass_attempt_01.png", + "output": "references/ref_overpass.png", + "mask": "references/sam_mask_overpass.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 28.0, + 284.0, + 965.0, + 771.0 + ], + "mask_score": 3.406555, + "mask_area_ratio": 0.166775, + "elapsed_seconds": 8.3597 + } + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", + "source_name": "street light", + "source_description": "Bright, glowing street lights illuminating the road. Source dataset: BDD100K. Scene context: Nighttime driving scene on a city street with multiple vehicles, overhead structures, and traffic signals.", + "sub_caption": "street light: Bright, glowing street lights illuminating the road.. Scene role: overhead fixtures providing illumination over the roadway and sidewalk", + "measured_bbox": [ + 0.5545, + 0.0, + 0.5804, + 0.0625 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 205.0, + 0.0, + 824.0, + 857.0 + ], + "mask_score": 3.391373, + "mask_area_ratio": 0.189186, + "elapsed_seconds": 8.2179 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000010", + "target_total": 9, + "target_people": 8, + "target_objects": 1, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 9, + "n_detected": 9, + "n_subjects": 9, + "subjects": [ + { + "name": "pedestrian_black_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", + "source_name": "pedestrian", + "source_description": "Person walking away, wearing a black jacket and dark pants. Source dataset: CrowdHuman. Scene context: People are walking on a street in front of a large, ornate cathedral.", + "sub_caption": "pedestrian: Person walking away, wearing a black jacket and dark pants.. Scene role: walking away on the sidewalk to the right", + "measured_bbox": [ + 0.8947, + 0.408, + 0.9768, + 0.8 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_black_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_black_jacket_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_black_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_black_jacket_attempt_01.png", + "output": "references/ref_pedestrian_black_jacket.png", + "mask": "references/sam_mask_pedestrian_black_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 25.0, + 691.0, + 1014.0 + ], + "mask_score": 3.419812, + "mask_area_ratio": 0.160983, + "elapsed_seconds": 8.2172 + } + }, + { + "name": "pedestrian_backpack", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", + "source_name": "pedestrian", + "source_description": "A man partially visible behind the man in the black polo shirt, wearing a blue t-shirt and a backpack. Source dataset: CrowdHuman. Scene context: A group of people standing on a paved street in front of stone buildings, seemingly listening to a tour guide.", + "sub_caption": "pedestrian: A man wearing a blue t-shirt and a backpack.. Scene role: standing near the curb looking toward the road", + "measured_bbox": [ + 0.6953, + 0.4394, + 0.7156, + 0.5151 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_backpack.png", + "raw_ref_image": "references/raw_ref_pedestrian_backpack_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_backpack.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_backpack_attempt_01.png", + "output": "references/ref_pedestrian_backpack.png", + "mask": "references/sam_mask_pedestrian_backpack.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 344.0, + 30.0, + 689.0, + 1018.0 + ], + "mask_score": 3.422455, + "mask_area_ratio": 0.157988, + "elapsed_seconds": 8.1451 + } + }, + { + "name": "pedestrian_red_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", + "source_name": "pedestrian standing", + "source_description": "A person standing near a kiosk, wearing a bright red jacket. Source dataset: CrowdHuman. Scene context: A large outdoor public square with a fountain, surrounded by buildings and trees, with many pedestrians walking around and climbing a wide set of marble stairs in the foreground.", + "sub_caption": "pedestrian standing: A person standing, wearing a bright red jacket.. Scene role: waiting at a crosswalk edge", + "measured_bbox": [ + 0.4504, + 0.4033, + 0.474, + 0.5253 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_red_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_red_jacket_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_red_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_red_jacket_attempt_01.png", + "output": "references/ref_pedestrian_red_jacket.png", + "mask": "references/sam_mask_pedestrian_red_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 346.0, + 92.0, + 677.0, + 984.0 + ], + "mask_score": 3.472322, + "mask_area_ratio": 0.129704, + "elapsed_seconds": 9.5973 + } + }, + { + "name": "pedestrian_striped_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", + "source_name": "pedestrian", + "source_description": "Person in the background wearing a striped shirt and dark pants. Source dataset: CrowdHuman. Scene context: A busy city square with pedestrians, bicycles, and a yellow tram on a sunny day.", + "sub_caption": "pedestrian: Person wearing a striped shirt and dark pants.. Scene role: strolling along the sidewalk", + "measured_bbox": [ + 0.7269, + 0.3947, + 0.7711, + 0.5853 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_striped_shirt.png", + "raw_ref_image": "references/raw_ref_pedestrian_striped_shirt_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_striped_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_striped_shirt_attempt_01.png", + "output": "references/ref_pedestrian_striped_shirt.png", + "mask": "references/sam_mask_pedestrian_striped_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 338.0, + 11.0, + 687.0, + 1018.0 + ], + "mask_score": 3.206288, + "mask_area_ratio": 0.147885, + "elapsed_seconds": 8.1875 + } + }, + { + "name": "man_pink_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", + "source_name": "man talking to young man", + "source_description": "Standing next to the young man with the backpack, wearing a pink shirt and dark shorts. Source dataset: CrowdHuman. Scene context: A bustling city street corner with people walking and standing around, surrounded by buildings with large digital billboards and store signs, with a few vehicles including a prominent red SUV passing through.", + "sub_caption": "man talking to young man: Man wearing a pink shirt and dark shorts.. Scene role: conversing near a storefront on the sidewalk", + "measured_bbox": [ + 0.8332, + 0.3734, + 0.8735, + 0.5918 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_man_pink_shirt.png", + "raw_ref_image": "references/raw_ref_man_pink_shirt_attempt_01.png", + "reference_verify": "references/reference_verify_man_pink_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_pink_shirt_attempt_01.png", + "output": "references/ref_man_pink_shirt.png", + "mask": "references/sam_mask_man_pink_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 340.0, + 42.0, + 689.0, + 995.0 + ], + "mask_score": 3.442738, + "mask_area_ratio": 0.146916, + "elapsed_seconds": 8.1734 + } + }, + { + "name": "pedestrian_light_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", + "source_name": "pedestrian", + "source_description": "Individual in the crowd near the intersection, wearing a light-colored jacket. Source dataset: CrowdHuman. Scene context: A bustling city street at night, brightly illuminated by numerous large electronic billboards on tall buildings, with heavy vehicle and pedestrian traffic below.", + "sub_caption": "pedestrian: Individual wearing a light-colored jacket.. Scene role: approaching the street intersection", + "measured_bbox": [ + 0.6065, + 0.3907, + 0.6375, + 0.4907 + ], + "detection_confidence": "high", + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_light_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_light_jacket_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_light_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_light_jacket_attempt_01.png", + "output": "references/ref_pedestrian_light_jacket.png", + "mask": "references/sam_mask_pedestrian_light_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 28.0, + 681.0, + 1013.0 + ], + "mask_score": 3.460161, + "mask_area_ratio": 0.163844, + "elapsed_seconds": 9.6744 + } + }, + { + "name": "pedestrian_light_blue_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", + "source_name": "pedestrian", + "source_description": "A person in a light blue shirt walking away from the camera. Source dataset: CrowdHuman. Scene context: A wide, tree-lined avenue crowded with many pedestrians walking in both directions, adorned with red banners on the trees under a clear blue sky.", + "sub_caption": "pedestrian: A person in a light blue shirt walking away from the camera.. Scene role: walking further down the sidewalk in the mid-ground", + "measured_bbox": [ + 0.9459, + 0.3895, + 0.9964, + 0.6538 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_light_blue_shirt.png", + "raw_ref_image": "references/raw_ref_pedestrian_light_blue_shirt_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_light_blue_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_light_blue_shirt_attempt_01.png", + "output": "references/ref_pedestrian_light_blue_shirt.png", + "mask": "references/sam_mask_pedestrian_light_blue_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 357.0, + 33.0, + 693.0, + 1012.0 + ], + "mask_score": 3.452806, + "mask_area_ratio": 0.153078, + "elapsed_seconds": 9.6513 + } + }, + { + "name": "distant_pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_21/283991,2145f000467f0a28.jpg:person:14", + "source_name": "distant pedestrian", + "source_description": "A person walking in the background. Source dataset: CrowdHuman. Scene context: People walk through an outdoor plaza area with modern architecture, an outdoor seating section with red chairs on the left, and planters with yellow and blue flowers on the right.", + "sub_caption": "distant pedestrian: A person walking in the background.. Scene role: walking in the far background down the street", + "measured_bbox": [ + 0.6066, + 0.3904, + 0.6375, + 0.489 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_distant_pedestrian.png", + "raw_ref_image": "references/raw_ref_distant_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_distant_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_distant_pedestrian_attempt_01.png", + "output": "references/ref_distant_pedestrian.png", + "mask": "references/sam_mask_distant_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 360.0, + 88.0, + 677.0, + 997.0 + ], + "mask_score": 3.489431, + "mask_area_ratio": 0.138401, + "elapsed_seconds": 8.1869 + } + }, + { + "name": "black_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b8fe1054-42625c45:object:1", + "source_name": "black suv", + "source_description": "A black SUV parked ahead of the silver SUV on the right. Source dataset: BDD100K. Scene context: View from a vehicle driving down a cracked city street lined with parked cars, buildings, and trees on a sunny day.", + "sub_caption": "black suv: A black SUV parked ahead on the right.. Scene role: parked at the curb on the right side of the street", + "measured_bbox": [ + 0.4391, + 0.47, + 0.6899, + 0.8264 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_black_suv.png", + "raw_ref_image": "references/raw_ref_black_suv_attempt_01.png", + "reference_verify": "references/reference_verify_black_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_black_suv_attempt_01.png", + "output": "references/ref_black_suv.png", + "mask": "references/sam_mask_black_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 190.0, + 1007.0, + 843.0 + ], + "mask_score": 3.120914, + "mask_area_ratio": 0.384048, + "elapsed_seconds": 9.8714 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + }, + { + "sample_id": "sample_000011", + "target_total": 10, + "target_people": 1, + "target_objects": 9, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 10, + "n_detected": 10, + "n_subjects": 10, + "subjects": [ + { + "name": "blonde_woman", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_8/273275,44ab30007bea86d2.jpg:person:5", + "source_name": "woman in foreground", + "source_description": "A woman with blonde hair, seen in profile in the bottom center of the foreground. Source dataset: CrowdHuman. Scene context: A large crowd of people is gathered on a city street, many standing behind metal barricades, while some are taking photos.", + "sub_caption": "woman in foreground: A woman with blonde hair, seen in profile.. Scene role: walking along the right sidewalk under the street lamps", + "measured_bbox": [ + 0.7873, + 0.3886, + 0.8283, + 0.5843 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_blonde_woman.png", + "raw_ref_image": "references/raw_ref_blonde_woman_attempt_02.png", + "reference_verify": "references/reference_verify_blonde_woman.json", + "reference_verify_passed": true, + "reference_attempts": 2, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_blonde_woman_attempt_02.png", + "output": "references/ref_blonde_woman.png", + "mask": "references/sam_mask_blonde_woman.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 347.0, + 60.0, + 686.0, + 982.0 + ], + "mask_score": 3.476833, + "mask_area_ratio": 0.13921, + "elapsed_seconds": 10.126 + } + }, + { + "name": "metal_structure", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_54/273278,11abb000d12e48e5.jpg:object:5", + "source_name": "metal structure", + "source_description": "A tall, rectangular grey metal box or pillar on the right side, which one man is holding onto. Source dataset: CrowdHuman. Scene context: A crowded subway station with a metal barrier, where two people are climbing over the barrier while others stand by.", + "sub_caption": "metal structure: A tall, rectangular grey metal box or pillar.. Scene role: situated on the edge of the sidewalk as a utility box", + "measured_bbox": [ + 0.8171, + 0.3117, + 0.944, + 0.5699 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_metal_structure.png", + "raw_ref_image": "references/raw_ref_metal_structure_attempt_01.png", + "reference_verify": "references/reference_verify_metal_structure.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_metal_structure_attempt_01.png", + "output": "references/ref_metal_structure.png", + "mask": "references/sam_mask_metal_structure.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 91.0, + 32.0, + 932.0, + 1001.0 + ], + "mask_score": 3.477494, + "mask_area_ratio": 0.534141, + "elapsed_seconds": 8.3774 + } + }, + { + "name": "overhead_streetlights", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bd1b8b79-829e787f:object:5", + "source_name": "streetlights", + "source_description": "Tall poles with bright lights illuminating the street from above, casting long reflections on the wet pavement. Source dataset: BDD100K. Scene context: Nighttime driving scene on a wet city street with streetlights reflecting on the road.", + "sub_caption": "streetlights: Tall poles with bright lights illuminating the street from above, casting long reflections on the wet pavement.. Scene role: providing overhead illumination for the wet road", + "measured_bbox": [ + 0.5014, + 0.0563, + 0.5225, + 0.1017 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_overhead_streetlights.png", + "raw_ref_image": "references/raw_ref_overhead_streetlights_attempt_03.png", + "reference_verify": "references/reference_verify_overhead_streetlights.json", + "reference_verify_passed": true, + "reference_attempts": 3, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_overhead_streetlights_attempt_03.png", + "output": "references/ref_overhead_streetlights.png", + "mask": "references/sam_mask_overhead_streetlights.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 18.0, + 185.0, + 995.0, + 821.0 + ], + "mask_score": 3.398942, + "mask_area_ratio": 0.279411, + "elapsed_seconds": 8.219 + } + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bba4ee51-3badc9f8:object:6", + "source_name": "silver car", + "source_description": "Silver car parked further down the street on the right. Source dataset: BDD100K. Scene context: View from inside a car driving down a residential street lined with parked cars and trees.", + "sub_caption": "silver car: A silver car.. Scene role: parked further down the street on the right curbside", + "measured_bbox": [ + 0.5294, + 0.4494, + 0.6488, + 0.6006 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 47.0, + 245.0, + 984.0, + 779.0 + ], + "mask_score": 3.398036, + "mask_area_ratio": 0.279834, + "elapsed_seconds": 8.4126 + } + }, + { + "name": "white_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c5e32cf6-7d2e04b4:object:0", + "source_name": "white suv", + "source_description": "A white Honda CR-V parked or stopped in the rightmost lane, showing its rear passenger side. Source dataset: BDD100K. Scene context: A view from a car driving down a multi-lane city street with parked cars on the right, oncoming traffic on the left, and a highway overpass in the distance on a sunny day.", + "sub_caption": "white suv: A white SUV showing its rear passenger side.. Scene role: stopped or parked in the rightmost lane ahead", + "measured_bbox": [ + 0.5082, + 0.4246, + 0.5826, + 0.5552 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_suv.png", + "raw_ref_image": "references/raw_ref_white_suv_attempt_01.png", + "reference_verify": "references/reference_verify_white_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_white_suv_attempt_01.png", + "output": "references/ref_white_suv.png", + "mask": "references/sam_mask_white_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 26.0, + 237.0, + 1002.0, + 800.0 + ], + "mask_score": 3.457781, + "mask_area_ratio": 0.315623, + "elapsed_seconds": 10.4584 + } + }, + { + "name": "background_street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_15/273278,8d2ae00027075d75.jpg:object:5", + "source_name": "street light", + "source_description": "bright street lights illuminating the area in the background Source dataset: CrowdHuman. Scene context: A group of people wearing athletic clothing are posed for a group photo outdoors at night.", + "sub_caption": "street light: Bright street lights.. Scene role: illuminating the distant background area of the street", + "measured_bbox": [ + 0.5004, + 0.0592, + 0.5237, + 0.0998 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_background_street_light.png", + "raw_ref_image": "references/raw_ref_background_street_light_attempt_01.png", + "reference_verify": "references/reference_verify_background_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_background_street_light_attempt_01.png", + "output": "references/ref_background_street_light.png", + "mask": "references/sam_mask_background_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 152.0, + 95.0, + 875.0, + 938.0 + ], + "mask_score": 3.437329, + "mask_area_ratio": 0.065212, + "elapsed_seconds": 8.1426 + } + }, + { + "name": "oncoming_white_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be3159f3-13250ffe:object:2", + "source_name": "white car", + "source_description": "A white car driving toward the camera in the oncoming lane, further down the road. Source dataset: BDD100K. Scene context: A daytime street view from a vehicle approaching an intersection with traffic lights and several other cars.", + "sub_caption": "white car: A white car driving toward the camera with its headlights visible.. Scene role: active oncoming traffic in the left lane", + "measured_bbox": [ + 0.1529, + 0.4523, + 0.2891, + 0.5849 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_oncoming_white_car.png", + "raw_ref_image": "references/raw_ref_oncoming_white_car_attempt_01.png", + "reference_verify": "references/reference_verify_oncoming_white_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_oncoming_white_car_attempt_01.png", + "output": "references/ref_oncoming_white_car.png", + "mask": "references/sam_mask_oncoming_white_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 13.0, + 304.0, + 1011.0, + 819.0 + ], + "mask_score": 3.453796, + "mask_area_ratio": 0.299096, + "elapsed_seconds": 8.2507 + } + }, + { + "name": "windshield_mount", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c5694077-e345e2a8:object:12", + "source_name": "windshield mount", + "source_description": "A black mount attached to the inside of the windshield, partially obscuring the view. Source dataset: BDD100K. Scene context: A view from inside a car driving down a busy city street flanked by tall buildings, with various vehicles including cars, a delivery truck, and a cyclist in the foreground.", + "sub_caption": "windshield mount: A black mount attached to the inside of the windshield.. Scene role: partially obscuring the top view, framing the dashcam perspective", + "measured_bbox": [ + 0.0, + 0.0, + 0.4767, + 0.1353 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_windshield_mount.png", + "raw_ref_image": "references/raw_ref_windshield_mount_attempt_01.png", + "reference_verify": "references/reference_verify_windshield_mount.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_windshield_mount_attempt_01.png", + "output": "references/ref_windshield_mount.png", + "mask": "references/sam_mask_windshield_mount.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 88.0, + 131.0, + 967.0, + 895.0 + ], + "mask_score": 3.43205, + "mask_area_ratio": 0.236237, + "elapsed_seconds": 8.17 + } + }, + { + "name": "ego_car_dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c187431f-7b6ad6d6:object:0", + "source_name": "ego car dashboard", + "source_description": "The dark, lower foreground showing part of the dashboard and hood of the vehicle recording the video, with red reflections from taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a multi-lane highway with moderate traffic, showing vehicles ahead and green overhead highway signs.", + "sub_caption": "ego car dashboard: The dark, lower foreground showing part of the dashboard and hood of the vehicle, with red ambient reflections.. Scene role: anchors the bottom of the frame, establishing the driver's perspective", + "measured_bbox": [ + 0.0, + 0.7306, + 1.0, + 1.0 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_ego_car_dashboard.png", + "raw_ref_image": "references/raw_ref_ego_car_dashboard_attempt_01.png", + "reference_verify": "references/reference_verify_ego_car_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_ego_car_dashboard_attempt_01.png", + "output": "references/ref_ego_car_dashboard.png", + "mask": "references/sam_mask_ego_car_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 327.0, + 1023.0, + 788.0 + ], + "mask_score": 3.233951, + "mask_area_ratio": 0.206886, + "elapsed_seconds": 10.2609 + } + }, + { + "name": "vintage_street_lamp", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_29/273275,2032200056dda99e.jpg:object:0", + "source_name": "street lamp", + "source_description": "Ornate, black, vintage-style street lamp post. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking along a sidewalk lined with shops and tall, leafy trees.", + "sub_caption": "street lamp: An ornate, black, vintage-style street lamp post.. Scene role: providing decorative lighting on the right sidewalk next to the pedestrian", + "measured_bbox": [ + 0.7202, + 0.0, + 0.7633, + 0.6024 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_vintage_street_lamp.png", + "raw_ref_image": "references/raw_ref_vintage_street_lamp_attempt_01.png", + "reference_verify": "references/reference_verify_vintage_street_lamp.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_vintage_street_lamp_attempt_01.png", + "output": "references/ref_vintage_street_lamp.png", + "mask": "references/sam_mask_vintage_street_lamp.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 343.0, + 0.0, + 599.0, + 1023.0 + ], + "mask_score": 3.457917, + "mask_area_ratio": 0.047438, + "elapsed_seconds": 8.3114 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } + } +] diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/dataset.jsonl b/samples_v8/driving/BDD100K_CrowdHuman_samples/dataset.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..6f390c595d8a642a802434ba82a1a21ae1a2d924 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/dataset.jsonl @@ -0,0 +1,10 @@ +{"sample_id": "sample_000001", "target_total": 3, "target_people": 1, "target_objects": 2, "canvas_size": [1248, 832], "canvas_aspect_ratio": "3:2", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 3, "n_detected": 3, "n_subjects": 3, "subjects": [{"name": "pedestrian", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", "source_name": "pedestrian", "source_description": "A person wearing a dark coat and trousers. Source dataset: CrowdHuman. Scene context: A bustling city street lined with trees showcasing vibrant yellow autumn foliage, with many pedestrians walking in both directions.", "sub_caption": "pedestrian: A person wearing a dark coat and trousers.. Scene role: Walking on the sidewalk alongside the street.", "measured_bbox": [0.177, 0.0, 0.3091, 0.4552], "detection_confidence": 0.95, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian.png", "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_pedestrian_attempt_01.png", "output": "references/ref_pedestrian.png", "mask": "references/sam_mask_pedestrian.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [332.0, 40.0, 693.0, 999.0], "mask_score": 3.438137, "mask_area_ratio": 0.157722, "elapsed_seconds": 8.8351}}, {"name": "parked_dark_car", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9", "source_name": "parked dark car", "source_description": "A dark-colored sedan parked on the right side of the street, behind the silver car. Source dataset: BDD100K. Scene context: A large white New York City bus is driving down a city street on an overcast day with other cars parked and driving around it.", "sub_caption": "parked dark car: A dark-colored sedan.. Scene role: Parked on the right side of the street next to the curb.", "measured_bbox": [0.5856, 0.0522, 0.9973, 0.6586], "detection_confidence": 0.98, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_parked_dark_car.png", "raw_ref_image": "references/raw_ref_parked_dark_car_attempt_02.png", "reference_verify": "references/reference_verify_parked_dark_car.json", "reference_verify_passed": true, "reference_attempts": 2, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_parked_dark_car_attempt_02.png", "output": "references/ref_parked_dark_car.png", "mask": "references/sam_mask_parked_dark_car.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [17.0, 427.0, 1006.0, 796.0], "mask_score": 3.312519, "mask_area_ratio": 0.186911, "elapsed_seconds": 8.4991}}, {"name": "metal_barrier", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", "source_name": "metal barrier", "source_description": "silver metal barricade placed near the entrance Source dataset: CrowdHuman. Scene context: People are gathered outside the entrance of a stone building with arched doorways and large windows.", "sub_caption": "metal barrier: A silver metal barricade.. Scene role: Placed along the edge of the sidewalk near the parked car, separating the walkway from the street.", "measured_bbox": [0.0, 0.1355, 0.6068, 0.558], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_metal_barrier.png", "raw_ref_image": "references/raw_ref_metal_barrier_attempt_01.png", "reference_verify": "references/reference_verify_metal_barrier.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_metal_barrier_attempt_01.png", "output": "references/ref_metal_barrier.png", "mask": "references/sam_mask_metal_barrier.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [37.0, 201.0, 1011.0, 889.0], "mask_score": 2.936982, "mask_area_ratio": 0.305722, "elapsed_seconds": 9.8709}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000002", "target_total": 15, "target_people": 2, "target_objects": 13, "canvas_size": [1280, 720], "canvas_aspect_ratio": "16:9", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 15, "n_detected": 15, "n_subjects": 15, "subjects": [{"name": "pedestrian_right", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2", "source_name": "pedestrian", "source_description": "Person walking away from the camera on the right sidewalk, wearing dark clothing. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a wet city street lined with tall buildings, with pedestrians on the sidewalk and several cars and taxis ahead in the rain.", "sub_caption": "pedestrian: Person walking away from the camera on the right sidewalk, wearing dark clothing.. Scene role: Walking along the right-hand sidewalk under the trees.", "measured_bbox": [0.8872, 0.491, 0.9451, 0.6701], "detection_confidence": 0.95, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_right.png", "raw_ref_image": "references/raw_ref_pedestrian_right_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_right.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_right_attempt_01.png", "output": "references/ref_pedestrian_right.png", "mask": "references/sam_mask_pedestrian_right.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [345.0, 55.0, 678.0, 982.0], "mask_score": 3.462354, "mask_area_ratio": 0.14014, "elapsed_seconds": 8.2387}}, {"name": "pedestrian_left", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "BDD100K:b714a088-861a043b:person:2", "source_name": "pedestrian", "source_description": "another person near the shop entrance on the left, partially obscured Source dataset: BDD100K. Scene context: a city street scene from the perspective of a vehicle, showing multiple cars in motion, buildings lining the road, and pedestrians on the sidewalks during dusk or early evening", "sub_caption": "pedestrian: A person near a shop entrance on the left, partially obscured by shadows.. Scene role: Standing near a building entrance on the left side of the street.", "measured_bbox": [0.1301, 0.5154, 0.1517, 0.611], "detection_confidence": 0.95, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_left.png", "raw_ref_image": "references/raw_ref_pedestrian_left_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_left.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_left_attempt_01.png", "output": "references/ref_pedestrian_left.png", "mask": "references/sam_mask_pedestrian_left.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [342.0, 67.0, 681.0, 996.0], "mask_score": 3.481605, "mask_area_ratio": 0.150858, "elapsed_seconds": 8.1403}}, {"name": "city_buildings", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", "source_name": "building", "source_description": "Various city buildings of different heights forming the skyline in the background. Source dataset: CrowdHuman. Scene context: People are walking and resting on the wooden walkway of a large suspension bridge with a city skyline in the background.", "sub_caption": "building: Various city buildings of different heights forming the urban landscape along the street.. Scene role: Lining the street and forming the architectural background on both sides.", "measured_bbox": [0.3358, 0.3425, 0.4929, 0.5277], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_city_buildings.png", "raw_ref_image": "references/raw_ref_city_buildings_attempt_01.png", "reference_verify": "references/reference_verify_city_buildings.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_city_buildings_attempt_01.png", "output": "references/ref_city_buildings.png", "mask": "references/sam_mask_city_buildings.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [27.0, 178.0, 996.0, 865.0], "mask_score": 3.420089, "mask_area_ratio": 0.463421, "elapsed_seconds": 8.2735}}, {"name": "pink_scooter", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_66/283991,a60c0001c08f63b.jpg:object:3", "source_name": "pink scooter", "source_description": "A prominent pink step-through style motor scooter. Source dataset: CrowdHuman. Scene context: A busy city intersection with many people riding scooters and some cars in the background.", "sub_caption": "pink scooter: A prominent pink step-through style motor scooter.. Scene role: Parked on the right sidewalk near the street signs.", "measured_bbox": [0.677, 0.56, 0.7935, 0.7095], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_pink_scooter.png", "raw_ref_image": "references/raw_ref_pink_scooter_attempt_01.png", "reference_verify": "references/reference_verify_pink_scooter.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pink_scooter_attempt_01.png", "output": "references/ref_pink_scooter.png", "mask": "references/sam_mask_pink_scooter.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [41.0, 120.0, 982.0, 920.0], "mask_score": 3.414017, "mask_area_ratio": 0.259921, "elapsed_seconds": 8.1841}}, {"name": "street_signs", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8", "source_name": "street signs", "source_description": "Various street signs attached to a pole on the right side of the street, including a speed limit sign. Source dataset: BDD100K. Scene context: A narrow city street with several parked and moving box trucks and vans, bordered by multi-story buildings and construction barriers.", "sub_caption": "street signs: Various street signs attached to a metal pole on the right side of the street.. Scene role: Mounted on a pole alongside the road on the right.", "measured_bbox": [0.8162, 0.2869, 0.8575, 0.4063], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_signs.png", "raw_ref_image": "references/raw_ref_street_signs_attempt_01.png", "reference_verify": "references/reference_verify_street_signs.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_signs_attempt_01.png", "output": "references/ref_street_signs.png", "mask": "references/sam_mask_street_signs.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [226.0, 71.0, 707.0, 967.0], "mask_score": 3.475593, "mask_area_ratio": 0.25818, "elapsed_seconds": 9.8621}}, {"name": "storefront_sign", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", "source_name": "storefront sign", "source_description": "A dark sign with white text 'DELI' and a smaller red 'ATM' sign beneath it on the left. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking towards the camera, while emergency vehicles with flashing lights are visible in the background traffic.", "sub_caption": "storefront sign: A dark, illuminated sign structure above a shop entrance.. Scene role: Hanging above a shop entrance on the left side of the street, illuminating the adjacent pedestrian.", "measured_bbox": [0.1052, 0.4218, 0.186, 0.4781], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_storefront_sign.png", "raw_ref_image": "references/raw_ref_storefront_sign_attempt_01.png", "reference_verify": "references/reference_verify_storefront_sign.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_storefront_sign_attempt_01.png", "output": "references/ref_storefront_sign.png", "mask": "references/sam_mask_storefront_sign.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [107.0, 274.0, 900.0, 749.0], "mask_score": 3.354337, "mask_area_ratio": 0.167885, "elapsed_seconds": 8.1782}}, {"name": "parked_suv_right", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c54441e6-400c221e:object:4", "source_name": "parked SUV", "source_description": "Dark-colored SUV parked ahead of the sedan on the right side of the road. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with green traffic lights and parked cars on the right.", "sub_caption": "parked SUV: Dark-colored SUV parked on the right side of the road.. Scene role: Parked parallel to the curb on the right side of the street.", "measured_bbox": [0.6057, 0.5099, 0.7451, 0.6703], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_parked_suv_right.png", "raw_ref_image": "references/raw_ref_parked_suv_right_attempt_01.png", "reference_verify": "references/reference_verify_parked_suv_right.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_suv_right_attempt_01.png", "output": "references/ref_parked_suv_right.png", "mask": "references/sam_mask_parked_suv_right.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [52.0, 216.0, 993.0, 835.0], "mask_score": 3.459027, "mask_area_ratio": 0.361156, "elapsed_seconds": 10.1865}}, {"name": "dark_car_left", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", "source_name": "car", "source_description": "A dark car parked along the left curb further ahead. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a multi-lane road on a partly cloudy day, with a few other cars and pedestrians on the sidewalk.", "sub_caption": "car: A dark car parked along the left curb further ahead.. Scene role: Parked alongside the left curb.", "measured_bbox": [0.2139, 0.5323, 0.3044, 0.6201], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_dark_car_left.png", "raw_ref_image": "references/raw_ref_dark_car_left_attempt_01.png", "reference_verify": "references/reference_verify_dark_car_left.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_car_left_attempt_01.png", "output": "references/ref_dark_car_left.png", "mask": "references/sam_mask_dark_car_left.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [42.0, 237.0, 982.0, 794.0], "mask_score": 3.479099, "mask_area_ratio": 0.30617, "elapsed_seconds": 8.2274}}, {"name": "dark_suv_driving", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c889c950-865ca5b6:object:0", "source_name": "dark SUV", "source_description": "A dark SUV traveling in the left lane, with visible red taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a highway with several cars visible ahead, illuminated mainly by taillights and headlights.", "sub_caption": "dark SUV: A dark SUV traveling in the left lane, with visible glowing red taillights.. Scene role: Driving ahead of the camera vehicle in the opposing or adjacent left lane.", "measured_bbox": [0.3005, 0.5101, 0.4179, 0.6508], "detection_confidence": 0.98, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_dark_suv_driving.png", "raw_ref_image": "references/raw_ref_dark_suv_driving_attempt_01.png", "reference_verify": "references/reference_verify_dark_suv_driving.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_suv_driving_attempt_01.png", "output": "references/ref_dark_suv_driving.png", "mask": "references/sam_mask_dark_suv_driving.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [96.0, 294.0, 928.0, 812.0], "mask_score": 3.455576, "mask_area_ratio": 0.251452, "elapsed_seconds": 9.8494}}, {"name": "street_light", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2", "source_name": "street light", "source_description": "Tall pole with a bright light on top, illuminating the road from the right side. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane highway with streetlights illuminating the dark road ahead.", "sub_caption": "street light: Tall pole with a bright light on top, illuminating the road.. Scene role: Casting warm light onto the street from the right-hand sidewalk.", "measured_bbox": [0.8171, 0.1755, 0.8719, 0.2202], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_light.png", "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", "reference_verify": "references/reference_verify_street_light.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_light_attempt_01.png", "output": "references/ref_street_light.png", "mask": "references/sam_mask_street_light.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [71.0, 277.0, 913.0, 727.0], "mask_score": 3.350243, "mask_area_ratio": 0.068855, "elapsed_seconds": 8.2963}}, {"name": "vehicle_dashboard", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9", "source_name": "dashboard", "source_description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential city street lined with trees, parked cars, and multi-story brick buildings.", "sub_caption": "dashboard: The dark, reflective dashboard and lower windshield area of the camera vehicle.. Scene role: Occupying the bottom foreground of the image, establishing the perspective from inside the car.", "measured_bbox": [0.0, 0.9261, 1.0, 1.0], "detection_confidence": "high", "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_vehicle_dashboard.png", "raw_ref_image": "references/raw_ref_vehicle_dashboard_attempt_01.png", "reference_verify": "references/reference_verify_vehicle_dashboard.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_vehicle_dashboard_attempt_01.png", "output": "references/ref_vehicle_dashboard.png", "mask": "references/sam_mask_vehicle_dashboard.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [24.0, 317.0, 1001.0, 706.0], "mask_score": 2.942001, "mask_area_ratio": 0.133658, "elapsed_seconds": 8.3645}}, {"name": "white_car_ahead", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", "source_name": "white car", "source_description": "A white car visible further down the road in the right lane. Source dataset: BDD100K. Scene context: Nighttime driving scene in a city with cars stopped in traffic and a construction site on the right.", "sub_caption": "white car: A white car visible further down the road.. Scene role: Driving away in the right lane, further in the distance.", "measured_bbox": [0.4811, 0.5382, 0.5174, 0.5915], "detection_confidence": 0.98, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_white_car_ahead.png", "raw_ref_image": "references/raw_ref_white_car_ahead_attempt_01.png", "reference_verify": "references/reference_verify_white_car_ahead.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_white_car_ahead_attempt_01.png", "output": "references/ref_white_car_ahead.png", "mask": "references/sam_mask_white_car_ahead.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [112.0, 168.0, 912.0, 855.0], "mask_score": 3.412999, "mask_area_ratio": 0.338258, "elapsed_seconds": 8.3339}}, {"name": "double_yellow_lines", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c417a291-7802692d:object:8", "source_name": "yellow lines", "source_description": "Double yellow painted lines separating opposite directions of traffic. Source dataset: BDD100K. Scene context: A pedestrian crossing a street at a crosswalk with several cars stopped around them, with storefronts visible in the background.", "sub_caption": "yellow lines: Double yellow painted lines separating opposite directions of traffic.. Scene role: Running down the center of the road, receding into the distance.", "measured_bbox": [0.3008, 0.5732, 0.4776, 0.8029], "detection_confidence": 0.98, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_double_yellow_lines.png", "raw_ref_image": "references/raw_ref_double_yellow_lines_attempt_01.png", "reference_verify": "references/reference_verify_double_yellow_lines.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_double_yellow_lines_attempt_01.png", "output": "references/ref_double_yellow_lines.png", "mask": "references/sam_mask_double_yellow_lines.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 0.0, 1023.0, 1023.0], "mask_score": 2.141169, "mask_area_ratio": 0.667065, "elapsed_seconds": 8.2719}}, {"name": "street_trees", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c4891df0-24371ae1:object:3", "source_name": "trees", "source_description": "Numerous trees with dense green and yellowish foliage lining both sides of the road. Source dataset: BDD100K. Scene context: A dark SUV drives ahead on a wet road lined with trees and a fence under a cloudy sky.", "sub_caption": "trees: Numerous trees with dense foliage lining both sides of the road.. Scene role: Planted along the sidewalks, softening the urban environment and framing the street.", "measured_bbox": [0.001, 0.002, 0.375, 0.63], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_trees.png", "raw_ref_image": "references/raw_ref_street_trees_attempt_01.png", "reference_verify": "references/reference_verify_street_trees.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_trees_attempt_01.png", "output": "references/ref_street_trees.png", "mask": "references/sam_mask_street_trees.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [65.0, 64.0, 958.0, 969.0], "mask_score": 3.478968, "mask_area_ratio": 0.365667, "elapsed_seconds": 8.231}}, {"name": "twilight_sky", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9", "source_name": "sky", "source_description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top. Source dataset: BDD100K. Scene context: View from a moving vehicle looking down a multi-lane city street lined with parked cars and large trees under a clear twilight sky.", "sub_caption": "sky: Clear twilight sky transitioning from bright near the horizon to dark blue at the top.. Scene role: Providing the atmospheric backdrop above the buildings and street.", "measured_bbox": [0.116, 0.0, 0.714, 0.4742], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_twilight_sky.png", "raw_ref_image": "references/raw_ref_twilight_sky_attempt_01.png", "reference_verify": "references/reference_verify_twilight_sky.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_twilight_sky_attempt_01.png", "output": "references/ref_twilight_sky.png", "mask": "references/sam_mask_twilight_sky.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 208.0, 1023.0, 814.0], "mask_score": 2.437955, "mask_area_ratio": 0.529621, "elapsed_seconds": 9.8292}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000003", "target_total": 3, "target_people": 1, "target_objects": 2, "canvas_size": [1248, 832], "canvas_aspect_ratio": "3:2", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 3, "n_detected": 3, "n_subjects": 3, "subjects": [{"name": "shopper", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", "source_name": "shopper", "source_description": "A person standing on the top level, wearing a dark top and dark pants. Source dataset: CrowdHuman. Scene context: A multi-level outdoor shopping mall with various people walking and shopping.", "sub_caption": "shopper: A person standing and waiting, wearing a dark top and dark pants. Scene role: waiting at the crosswalk curb", "measured_bbox": [0.7364, 0.2825, 0.8267, 0.7222], "detection_confidence": 100, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_shopper.png", "raw_ref_image": "references/raw_ref_shopper_attempt_01.png", "reference_verify": "references/reference_verify_shopper.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_shopper_attempt_01.png", "output": "references/ref_shopper.png", "mask": "references/sam_mask_shopper.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [348.0, 80.0, 678.0, 995.0], "mask_score": 3.467753, "mask_area_ratio": 0.132874, "elapsed_seconds": 49.4008}}, {"name": "black_sedan", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7", "source_name": "black sedan", "source_description": "A black sedan driving in the right lane ahead of the red hatchback. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a multi-lane city street flanked by tall buildings, with various cars moving in the same direction and parked along the side.", "sub_caption": "black sedan: A dark, modern black sedan. Scene role: driving in the nearest lane on the street", "measured_bbox": [0.0883, 0.2514, 0.5002, 0.449], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_black_sedan.png", "raw_ref_image": "references/raw_ref_black_sedan_attempt_01.png", "reference_verify": "references/reference_verify_black_sedan.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_black_sedan_attempt_01.png", "output": "references/ref_black_sedan.png", "mask": "references/sam_mask_black_sedan.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 321.0, 1023.0, 700.0], "mask_score": 2.52477, "mask_area_ratio": 0.559944, "elapsed_seconds": 8.5091}}, {"name": "silver_car", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2", "source_name": "silver car", "source_description": "A silver car driving in the adjacent right lane, slightly ahead of the white car. Source dataset: BDD100K. Scene context: View from inside a vehicle driving in heavy traffic on a multi-lane road under a clear sky.", "sub_caption": "silver car: A sleek silver car. Scene role: driving in the adjacent lane slightly ahead of the black sedan", "measured_bbox": [0.3669, 0.2463, 0.7048, 0.409], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_silver_car.png", "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", "reference_verify": "references/reference_verify_silver_car.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_silver_car_attempt_01.png", "output": "references/ref_silver_car.png", "mask": "references/sam_mask_silver_car.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [16.0, 369.0, 1006.0, 693.0], "mask_score": 3.457475, "mask_area_ratio": 0.178123, "elapsed_seconds": 9.7472}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000004", "target_total": 5, "target_people": 1, "target_objects": 4, "canvas_size": [1248, 832], "canvas_aspect_ratio": "3:2", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 5, "n_detected": 5, "n_subjects": 5, "subjects": [{"name": "pedestrian_walker", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", "source_name": "walker", "source_description": "A man walking, wearing a blue hoodie with 'EMO' written on it, grey pants, and dark shoes. Source dataset: CrowdHuman. Scene context: A group of people, some wearing matching white t-shirts, are walking and jogging along a paved path next to a road, with a grey SUV parked on a grassy hill in the background.", "sub_caption": "walker: A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.. Scene role: Walking near the crosswalk on the side of the street.", "measured_bbox": [0.5948, 0.3939, 0.6378, 0.5698], "detection_confidence": 0.98, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_walker.png", "raw_ref_image": "references/raw_ref_pedestrian_walker_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_walker.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_pedestrian_walker_attempt_01.png", "output": "references/ref_pedestrian_walker.png", "mask": "references/sam_mask_pedestrian_walker.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [334.0, 56.0, 706.0, 996.0], "mask_score": 3.43302, "mask_area_ratio": 0.160827, "elapsed_seconds": 9.8914}}, {"name": "red_traffic_light", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b6df605f-51c158b8:object:6", "source_name": "traffic light", "source_description": "A set of traffic lights suspended over the intersection, showing a red light. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a city street on a sunny day with a U-Haul truck in the opposite lane, parked cars along the right curb, and pedestrians crossing a crosswalk.", "sub_caption": "traffic light: A traffic signal suspended over the intersection, illuminated with a bright red light.. Scene role: Hanging high above the center of the intersection in the driver's line of sight.", "measured_bbox": [0.4668, 0.0722, 0.5093, 0.1896], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_red_traffic_light.png", "raw_ref_image": "references/raw_ref_red_traffic_light_attempt_01.png", "reference_verify": "references/reference_verify_red_traffic_light.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_red_traffic_light_attempt_01.png", "output": "references/ref_red_traffic_light.png", "mask": "references/sam_mask_red_traffic_light.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [286.0, 103.0, 1023.0, 893.0], "mask_score": 3.25218, "mask_area_ratio": 0.200515, "elapsed_seconds": 8.1927}}, {"name": "plain_delivery_truck", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", "source_name": "delivery truck", "source_description": "A large white box truck parked behind the fence on the right, with visible green and purple logos. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a wide, paved road flanked by an industrial area with fences and delivery trucks under a cloudy sky.", "sub_caption": "delivery truck: A large, plain white box truck without any visible markings or graphics.. Scene role: Parked alongside the right edge of the street curb.", "measured_bbox": [0.6504, 0.2022, 0.966, 0.6212], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_plain_delivery_truck.png", "raw_ref_image": "references/raw_ref_plain_delivery_truck_attempt_01.png", "reference_verify": "references/reference_verify_plain_delivery_truck.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_plain_delivery_truck_attempt_01.png", "output": "references/ref_plain_delivery_truck.png", "mask": "references/sam_mask_plain_delivery_truck.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [9.0, 166.0, 1017.0, 852.0], "mask_score": 3.45107, "mask_area_ratio": 0.437578, "elapsed_seconds": 10.0386}}, {"name": "dark_parked_car", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5", "source_name": "dark parked car", "source_description": "Another dark-colored car parked on the right side of the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving down a multi-lane city street with parked cars on the right and oncoming traffic on the left.", "sub_caption": "dark parked car: A dark-colored passenger vehicle.. Scene role: Parked parallel to the curb directly behind the delivery truck.", "measured_bbox": [0.8339, 0.4566, 0.9965, 0.7781], "detection_confidence": 0.98, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_dark_parked_car.png", "raw_ref_image": "references/raw_ref_dark_parked_car_attempt_01.png", "reference_verify": "references/reference_verify_dark_parked_car.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_dark_parked_car_attempt_01.png", "output": "references/ref_dark_parked_car.png", "mask": "references/sam_mask_dark_parked_car.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 301.0, 1023.0, 694.0], "mask_score": 3.113868, "mask_area_ratio": 0.207836, "elapsed_seconds": 8.5697}}, {"name": "street_lines", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", "source_name": "street lines", "source_description": "Double yellow lines separating traffic directions and white painted lines indicating lanes and crosswalks. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching an intersection underneath an elevated railway structure.", "sub_caption": "street lines: Double yellow center lines separating traffic directions, and solid white painted lines forming a distinct crosswalk.. Scene role: Painted on the asphalt, guiding traffic and defining the pedestrian crossing area in the foreground.", "measured_bbox": [0.003, 0.432, 0.971, 0.794], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_lines.png", "raw_ref_image": "references/raw_ref_street_lines_attempt_01.png", "reference_verify": "references/reference_verify_street_lines.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_street_lines_attempt_01.png", "output": "references/ref_street_lines.png", "mask": "references/sam_mask_street_lines.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [384.0, 98.0, 639.0, 925.0], "mask_score": 3.44596, "mask_area_ratio": 0.067441, "elapsed_seconds": 8.1646}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000005", "target_total": 6, "target_people": 3, "target_objects": 3, "canvas_size": [1248, 832], "canvas_aspect_ratio": "3:2", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 6, "n_detected": 6, "n_subjects": 6, "subjects": [{"name": "person_yellow_top", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_24/282555,49a4e000d44469c6.jpg:person:49", "source_name": "person", "source_description": "Standing, wearing a bright yellow top Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered in front of the Louvre museum and its iconic glass pyramid on a sunny day.", "sub_caption": "person: Standing, wearing a bright yellow top. Scene role: Crossing the street on the crosswalk in front of the stopped silver car", "measured_bbox": [0.5309, 0.4516, 0.5607, 0.6301], "detection_confidence": 0.95, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_person_yellow_top.png", "raw_ref_image": "references/raw_ref_person_yellow_top_attempt_01.png", "reference_verify": "references/reference_verify_person_yellow_top.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_person_yellow_top_attempt_01.png", "output": "references/ref_person_yellow_top.png", "mask": "references/sam_mask_person_yellow_top.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [329.0, 42.0, 701.0, 1012.0], "mask_score": 3.348943, "mask_area_ratio": 0.150169, "elapsed_seconds": 8.25}}, {"name": "man_in_suit", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", "source_name": "crowd member", "source_description": "A person far right in the background wearing a suit. Source dataset: CrowdHuman. Scene context: A crowd of people gathers in front of an old stone building with a prominent arched doorway and two large animal statues.", "sub_caption": "crowd member: A person wearing a professional suit.. Scene role: Walking alongside the other pedestrians across the crosswalk", "measured_bbox": [0.5767, 0.4388, 0.6397, 0.6278], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_man_in_suit.png", "raw_ref_image": "references/raw_ref_man_in_suit_attempt_01.png", "reference_verify": "references/reference_verify_man_in_suit.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_man_in_suit_attempt_01.png", "output": "references/ref_man_in_suit.png", "mask": "references/sam_mask_man_in_suit.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [351.0, 27.0, 671.0, 1004.0], "mask_score": 3.48496, "mask_area_ratio": 0.144686, "elapsed_seconds": 9.7885}}, {"name": "young_girl", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_51/282555,57b9f000182048e6.jpg:person:13", "source_name": "pedestrian", "source_description": "Young girl with brown hair, wearing a blue patterned top. Source dataset: CrowdHuman. Scene context: A sunny outdoor scene featuring the red entrance arch to Navy Pier Beer Garden and a tall brick tower, with a diverse crowd of people walking along the waterfront promenade.", "sub_caption": "pedestrian: Young girl with brown hair, wearing a blue patterned top.. Scene role: Walking across the intersection near the person in the yellow top", "measured_bbox": [0.6354, 0.4889, 0.6677, 0.6337], "detection_confidence": 0.98, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_young_girl.png", "raw_ref_image": "references/raw_ref_young_girl_attempt_01.png", "reference_verify": "references/reference_verify_young_girl.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_young_girl_attempt_01.png", "output": "references/ref_young_girl.png", "mask": "references/sam_mask_young_girl.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [369.0, 52.0, 661.0, 1003.0], "mask_score": 3.482282, "mask_area_ratio": 0.133298, "elapsed_seconds": 8.3216}}, {"name": "traffic_light", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", "source_name": "traffic light", "source_description": "A black multi-lens traffic light fixture mounted on a pole above the street. Source dataset: CrowdHuman. Scene context: A male tour guide is speaking to a group of people standing on a city sidewalk next to a road crossing.", "sub_caption": "traffic light: A black multi-lens traffic light fixture mounted on a pole above the street.. Scene role: Suspended over the intersection, showing a red light to halt the vehicles", "measured_bbox": [0.5513, 0.0408, 0.6462, 0.1518], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_traffic_light.png", "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", "reference_verify": "references/reference_verify_traffic_light.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_traffic_light_attempt_01.png", "output": "references/ref_traffic_light.png", "mask": "references/sam_mask_traffic_light.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [113.0, 201.0, 923.0, 826.0], "mask_score": 3.467034, "mask_area_ratio": 0.289252, "elapsed_seconds": 9.874}}, {"name": "street_trees", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_37/282555,a4aef000d9c10933.jpg:object:3", "source_name": "trees", "source_description": "Various green trees and shrubs lining the pathway and visible in the background gardens. Source dataset: CrowdHuman. Scene context: A large crowd of tourists walks along the pathway towards the Taj Mahal on a clear day.", "sub_caption": "trees: Various green trees and shrubs lining the pathway and visible in the background gardens.. Scene role: Planted along the sidewalks on both sides of the street, providing urban greenery", "measured_bbox": [0.542, 0.2363, 0.636, 0.493], "detection_confidence": 0.9, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_trees.png", "raw_ref_image": "references/raw_ref_street_trees_attempt_03.png", "reference_verify": "references/reference_verify_street_trees.json", "reference_verify_passed": true, "reference_attempts": 3, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_street_trees_attempt_03.png", "output": "references/ref_street_trees.png", "mask": "references/sam_mask_street_trees.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [22.0, 60.0, 1003.0, 968.0], "mask_score": 3.301958, "mask_area_ratio": 0.393952, "elapsed_seconds": 8.2223}}, {"name": "silver_car", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:be3d3a81-326a032d:object:0", "source_name": "silver car", "source_description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on. Source dataset: BDD100K. Scene context: A rainy street scene showing cars waiting at an intersection surrounded by tall buildings and urban infrastructure.", "sub_caption": "silver car: A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on.. Scene role: Stopped in the traffic lane in the foreground, waiting for the pedestrians to cross", "measured_bbox": [0.3062, 0.4281, 0.5436, 0.7674], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_silver_car.png", "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", "reference_verify": "references/reference_verify_silver_car.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_silver_car_attempt_01.png", "output": "references/ref_silver_car.png", "mask": "references/sam_mask_silver_car.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [80.0, 167.0, 957.0, 937.0], "mask_score": 3.434142, "mask_area_ratio": 0.414005, "elapsed_seconds": 8.3073}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000006", "target_total": 8, "target_people": 1, "target_objects": 7, "canvas_size": [1280, 720], "canvas_aspect_ratio": "16:9", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 8, "n_detected": 8, "n_subjects": 8, "subjects": [{"name": "pedestrian", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", "source_name": "pedestrian", "source_description": "Another person in the distant background near the green structure. Source dataset: CrowdHuman. Scene context: A large crowd of people walking across a street with trees, streetlamps, and classic architecture in the background.", "sub_caption": "pedestrian: A person walking across the street in the distance.. Scene role: Crossing the street near the left background.", "measured_bbox": [0.2151, 0.4819, 0.2507, 0.5947], "detection_confidence": "high", "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian.png", "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_pedestrian_attempt_01.png", "output": "references/ref_pedestrian.png", "mask": "references/sam_mask_pedestrian.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [353.0, 44.0, 672.0, 1013.0], "mask_score": 3.456561, "mask_area_ratio": 0.147466, "elapsed_seconds": 8.2841}}, {"name": "emergency_vehicle", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b99f250d-886111c5:object:5", "source_name": "vehicle", "source_description": "A dark-colored vehicle partially visible in the left background with blue emergency lights flashing. Source dataset: BDD100K. Scene context: A nighttime city street intersection showing a crosswalk, construction barriers, and illuminated traffic signals.", "sub_caption": "vehicle: A dark-colored vehicle with blue emergency lights flashing.. Scene role: Stopped or parked in the distant left background.", "measured_bbox": [0.325, 0.4787, 0.3786, 0.5486], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_emergency_vehicle.png", "raw_ref_image": "references/raw_ref_emergency_vehicle_attempt_01.png", "reference_verify": "references/reference_verify_emergency_vehicle.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_emergency_vehicle_attempt_01.png", "output": "references/ref_emergency_vehicle.png", "mask": "references/sam_mask_emergency_vehicle.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 261.0, 1023.0, 782.0], "mask_score": 3.339466, "mask_area_ratio": 0.300308, "elapsed_seconds": 8.2719}}, {"name": "curbside_trash_can", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_10/273275,5da3d000f5358c3c.jpg:object:9", "source_name": "trash can", "source_description": "dark blue cylindrical bin partially visible in the foreground Source dataset: CrowdHuman. Scene context: A group of children and an adult pose for a photo in front of a roller coaster at an amusement park.", "sub_caption": "trash can: A dark blue cylindrical bin.. Scene role: Placed on the sidewalk curb in the lower right foreground.", "measured_bbox": [0.8371, 0.5448, 0.9204, 0.7599], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_curbside_trash_can.png", "raw_ref_image": "references/raw_ref_curbside_trash_can_attempt_01.png", "reference_verify": "references/reference_verify_curbside_trash_can.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_curbside_trash_can_attempt_01.png", "output": "references/ref_curbside_trash_can.png", "mask": "references/sam_mask_curbside_trash_can.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [175.0, 42.0, 837.0, 982.0], "mask_score": 3.480803, "mask_area_ratio": 0.406976, "elapsed_seconds": 8.7724}}, {"name": "museum_banner", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_24/282555,14d900042c7b9a4.jpg:object:3", "source_name": "exhibition banner", "source_description": "A large, rectangular banner hanging on the building's facade, featuring a maroon background and text. Source dataset: CrowdHuman. Scene context: A large, classical building, likely a museum, with many people sitting and standing on its wide front steps, while a yellow taxi speeds past in the foreground.", "sub_caption": "exhibition banner: A large, rectangular maroon banner hanging on a building's facade.. Scene role: Hanging from the classical architecture on the right side of the street.", "measured_bbox": [0.8013, 0.0164, 0.8543, 0.2771], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_museum_banner.png", "raw_ref_image": "references/raw_ref_museum_banner_attempt_01.png", "reference_verify": "references/reference_verify_museum_banner.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_museum_banner_attempt_01.png", "output": "references/ref_museum_banner.png", "mask": "references/sam_mask_museum_banner.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [317.0, 20.0, 703.0, 981.0], "mask_score": 3.453619, "mask_area_ratio": 0.268547, "elapsed_seconds": 8.1747}}, {"name": "white_panel_van", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b5047c50-e1facff6:object:2", "source_name": "white van", "source_description": "A large white panel van with red taillights illuminated, driving in the right lane ahead. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a multi-lane city street on a sunny day with moderate traffic.", "sub_caption": "white van: A large white panel van with illuminated red taillights.. Scene role: Driving ahead in the right lane of the road.", "measured_bbox": [0.5122, 0.3306, 0.6867, 0.6936], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_white_panel_van.png", "raw_ref_image": "references/raw_ref_white_panel_van_attempt_01.png", "reference_verify": "references/reference_verify_white_panel_van.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_white_panel_van_attempt_01.png", "output": "references/ref_white_panel_van.png", "mask": "references/sam_mask_white_panel_van.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [102.0, 91.0, 877.0, 937.0], "mask_score": 3.457159, "mask_area_ratio": 0.429852, "elapsed_seconds": 10.1474}}, {"name": "double_solid_white_line", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bc886d37-5b22c313:object:7", "source_name": "double solid white line", "source_description": "Two continuous white painted lines on the dark asphalt road surface, separating the two lanes of traffic. Source dataset: BDD100K. Scene context: A view from inside a car driving through a brightly lit tunnel with tiled walls, following a silver SUV and a dark compact car.", "sub_caption": "double solid white line: Two continuous white painted lines on dark asphalt.. Scene role: Dividing the traffic lanes down the center of the street.", "measured_bbox": [0.2383, 0.6095, 0.4221, 0.8925], "detection_confidence": "high", "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_double_solid_white_line.png", "raw_ref_image": "references/raw_ref_double_solid_white_line_attempt_01.png", "reference_verify": "references/reference_verify_double_solid_white_line.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_double_solid_white_line_attempt_01.png", "output": "references/ref_double_solid_white_line.png", "mask": "references/sam_mask_double_solid_white_line.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [229.0, 72.0, 795.0, 951.0], "mask_score": 3.470715, "mask_area_ratio": 0.39155, "elapsed_seconds": 9.6388}}, {"name": "blooming_trees", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_26/283991,91fa000cda68156.jpg:object:5", "source_name": "blooming trees", "source_description": "Trees with pink blossoms in the background park area. Source dataset: CrowdHuman. Scene context: People are walking along a sidewalk next to a street lined with trees, some in bloom, with a police officer standing near a parked car.", "sub_caption": "blooming trees: Trees featuring vibrant pink blossoms.. Scene role: Lining the sidewalk and park area on the right side of the street.", "measured_bbox": [0.6279, 0.2153, 0.8163, 0.5163], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_blooming_trees.png", "raw_ref_image": "references/raw_ref_blooming_trees_attempt_01.png", "reference_verify": "references/reference_verify_blooming_trees.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_blooming_trees_attempt_01.png", "output": "references/ref_blooming_trees.png", "mask": "references/sam_mask_blooming_trees.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 93.0, 1023.0, 967.0], "mask_score": 3.479366, "mask_area_ratio": 0.423422, "elapsed_seconds": 8.1597}}, {"name": "green_street_sign", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", "source_name": "street sign", "source_description": "A green street sign with white text visible on the left side. Source dataset: CrowdHuman. Scene context: Several pedestrians are crossing a street at a crosswalk on a sunny day.", "sub_caption": "street sign: A green rectangular street sign.. Scene role: Mounted on a pole on the left side of the street near the crosswalk.", "measured_bbox": [0.1915, 0.2239, 0.2775, 0.2533], "detection_confidence": 0.9, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_green_street_sign.png", "raw_ref_image": "references/raw_ref_green_street_sign_attempt_01.png", "reference_verify": "references/reference_verify_green_street_sign.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_green_street_sign_attempt_01.png", "output": "references/ref_green_street_sign.png", "mask": "references/sam_mask_green_street_sign.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [26.0, 378.0, 998.0, 645.0], "mask_score": 3.473778, "mask_area_ratio": 0.191363, "elapsed_seconds": 9.6707}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000008", "target_total": 10, "target_people": 8, "target_objects": 2, "canvas_size": [1280, 720], "canvas_aspect_ratio": "16:9", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 10, "n_detected": 10, "n_subjects": 10, "subjects": [{"name": "passenger", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", "source_name": "passenger", "source_description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people waiting at a train platform next to a stationary train.", "sub_caption": "passenger: A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera.. Scene role: Walking away on the sidewalk ahead.", "measured_bbox": [0.8938, 0.1807, 0.9951, 0.7157], "detection_confidence": "high", "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_passenger.png", "raw_ref_image": "references/raw_ref_passenger_attempt_01.png", "reference_verify": "references/reference_verify_passenger.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_passenger_attempt_01.png", "output": "references/ref_passenger.png", "mask": "references/sam_mask_passenger.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [342.0, 31.0, 682.0, 985.0], "mask_score": 3.454991, "mask_area_ratio": 0.146239, "elapsed_seconds": 8.0907}}, {"name": "shopper_waiting", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", "source_name": "shopper", "source_description": "A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag. Source dataset: CrowdHuman. Scene context: A bustling cobblestone street in a town lined with brick buildings, decorated with colorful bunting flags, and populated with pedestrians walking, shopping, and dining outdoors.", "sub_caption": "shopper: A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag.. Scene role: Standing on the corner curb, waiting to cross the street.", "measured_bbox": [0.632, 0.1691, 0.7153, 0.6522], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_shopper_waiting.png", "raw_ref_image": "references/raw_ref_shopper_waiting_attempt_01.png", "reference_verify": "references/reference_verify_shopper_waiting.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_shopper_waiting_attempt_01.png", "output": "references/ref_shopper_waiting.png", "mask": "references/sam_mask_shopper_waiting.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [365.0, 69.0, 720.0, 1006.0], "mask_score": 3.169183, "mask_area_ratio": 0.111197, "elapsed_seconds": 8.0622}}, {"name": "shopper_standing", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", "source_name": "shopper", "source_description": "A person standing, wearing a blue jacket and blue jeans. Source dataset: CrowdHuman. Scene context: A bustling street market with many pedestrians browsing stalls and goods displayed on the ground.", "sub_caption": "shopper: A person standing, wearing a blue jacket and blue jeans.. Scene role: Standing near the crosswalk edge amidst the crowd.", "measured_bbox": [0.5209, 0.1793, 0.5735, 0.5325], "detection_confidence": 0.98, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_shopper_standing.png", "raw_ref_image": "references/raw_ref_shopper_standing_attempt_01.png", "reference_verify": "references/reference_verify_shopper_standing.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_shopper_standing_attempt_01.png", "output": "references/ref_shopper_standing.png", "mask": "references/sam_mask_shopper_standing.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [326.0, 43.0, 698.0, 998.0], "mask_score": 3.440171, "mask_area_ratio": 0.161293, "elapsed_seconds": 8.0811}}, {"name": "protester", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", "source_name": "protester holding sign in back", "source_description": "A person walking in the background on the right, holding up a large white sign. Source dataset: CrowdHuman. Scene context: A large crowd of people, possibly students, are walking down a city street, some holding signs, suggesting a protest or march.", "sub_caption": "protester holding sign in back: A person walking in the background on the right, holding up a large white sign.. Scene role: Walking further down the right sidewalk carrying a sign.", "measured_bbox": [0.8193, 0.1216, 0.8875, 0.4511], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_protester.png", "raw_ref_image": "references/raw_ref_protester_attempt_01.png", "reference_verify": "references/reference_verify_protester.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_protester_attempt_01.png", "output": "references/ref_protester.png", "mask": "references/sam_mask_protester.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [337.0, 19.0, 694.0, 1013.0], "mask_score": 3.465365, "mask_area_ratio": 0.161731, "elapsed_seconds": 8.3408}}, {"name": "pedestrian_crossing", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", "source_name": "pedestrian", "source_description": "A person in a white top and dark pants, walking towards the right. Source dataset: CrowdHuman. Scene context: A bustling city street scene with a mix of old brick buildings and modern glass structures, connected by an elevated glass walkway, with a large crowd of pedestrians moving along the wide sidewalk.", "sub_caption": "pedestrian: A person in a white top and dark pants, walking towards the right.. Scene role: Actively walking across the crosswalk in front of the vehicle.", "measured_bbox": [0.2322, 0.1993, 0.3165, 0.4965], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_crossing.png", "raw_ref_image": "references/raw_ref_pedestrian_crossing_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_crossing.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_crossing_attempt_01.png", "output": "references/ref_pedestrian_crossing.png", "mask": "references/sam_mask_pedestrian_crossing.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [355.0, 20.0, 674.0, 1012.0], "mask_score": 3.482863, "mask_area_ratio": 0.15384, "elapsed_seconds": 8.0791}}, {"name": "pedestrian_walking_away", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", "source_name": "pedestrian", "source_description": "A person wearing a white top and dark pants, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people walking on a city street in front of a grand, multi-story commercial building with signs for Virgin Megastore and Monoprix.", "sub_caption": "pedestrian: A person wearing a white top and dark pants, walking away from the camera.. Scene role: Walking away on the left side of the street.", "measured_bbox": [0.013, 0.2139, 0.0908, 0.494], "detection_confidence": 100, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_walking_away.png", "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_walking_away.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_walking_away_attempt_01.png", "output": "references/ref_pedestrian_walking_away.png", "mask": "references/sam_mask_pedestrian_walking_away.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [350.0, 50.0, 671.0, 987.0], "mask_score": 3.476273, "mask_area_ratio": 0.142721, "elapsed_seconds": 8.2428}}, {"name": "young_man", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", "source_name": "young man", "source_description": "Standing back row, wearing a dark blue hoodie. Source dataset: CrowdHuman. Scene context: A group of young people, mostly wearing college apparel, are posing for a photograph on bleachers in front of a red building.", "sub_caption": "young man: Standing back row, wearing a dark blue hoodie.. Scene role: Waiting in the crowd at the corner intersection.", "measured_bbox": [0.5568, 0.1246, 0.6032, 0.5033], "detection_confidence": 0.95, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_young_man.png", "raw_ref_image": "references/raw_ref_young_man_attempt_01.png", "reference_verify": "references/reference_verify_young_man.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_young_man_attempt_01.png", "output": "references/ref_young_man.png", "mask": "references/sam_mask_young_man.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [352.0, 65.0, 671.0, 928.0], "mask_score": 3.483394, "mask_area_ratio": 0.132506, "elapsed_seconds": 8.2271}}, {"name": "businessman", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", "source_name": "adult in dark suit", "source_description": "Standing in back rows, wearing dark suit and tie. Source dataset: CrowdHuman. Scene context: A large group of people gathered for a celebration, many wearing bright traditional Chinese costumes, holding props like fans and a lion dance head, posing for a group photo behind a banner.", "sub_caption": "adult in dark suit: Standing in back rows, wearing dark suit and tie.. Scene role: Standing on the sidewalk behind other pedestrians.", "measured_bbox": [0.5976, 0.1322, 0.6413, 0.4385], "detection_confidence": 0.95, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_businessman.png", "raw_ref_image": "references/raw_ref_businessman_attempt_01.png", "reference_verify": "references/reference_verify_businessman.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_businessman_attempt_01.png", "output": "references/ref_businessman.png", "mask": "references/sam_mask_businessman.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [336.0, 19.0, 690.0, 1013.0], "mask_score": 2.970801, "mask_area_ratio": 0.135565, "elapsed_seconds": 8.2448}}, {"name": "street_lamp", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", "source_name": "street lamp", "source_description": "A tall street lamp pole partially visible on the right side of the street. Source dataset: CrowdHuman. Scene context: A crowded city street at night, illuminated by numerous bright, colorful neon signs and shop displays.", "sub_caption": "street lamp: A tall street lamp pole partially visible on the right side of the street.. Scene role: Towering above the right sidewalk corner, serving as city infrastructure.", "measured_bbox": [0.014, 0.03, 0.109, 0.254], "detection_confidence": 0.8, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_lamp.png", "raw_ref_image": "references/raw_ref_street_lamp_attempt_01.png", "reference_verify": "references/reference_verify_street_lamp.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_street_lamp_attempt_01.png", "output": "references/ref_street_lamp.png", "mask": "references/sam_mask_street_lamp.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [257.0, 5.0, 840.0, 1019.0], "mask_score": 3.134794, "mask_area_ratio": 0.049316, "elapsed_seconds": 8.2643}}, {"name": "dashboard_reflection", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c24e7c72-f53291ea:object:8", "source_name": "vehicle dashboard reflection", "source_description": "A reflection on the windshield showing the interior dashboard and a document or object with large blue text. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a city street on an overcast day, passing a large brick building on the right.", "sub_caption": "vehicle dashboard reflection: A reflection on the windshield showing the interior dashboard and a document or object with large blue text.. Scene role: Visible along the bottom edge of the frame, establishing the camera's perspective from inside a car.", "measured_bbox": [0.143, 0.6854, 0.461, 0.8934], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_dashboard_reflection.png", "raw_ref_image": "references/raw_ref_dashboard_reflection_attempt_01.png", "reference_verify": "references/reference_verify_dashboard_reflection.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_dashboard_reflection_attempt_01.png", "output": "references/ref_dashboard_reflection.png", "mask": "references/sam_mask_dashboard_reflection.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 336.0, 1023.0, 1023.0], "mask_score": 1.211741, "mask_area_ratio": 0.687541, "elapsed_seconds": 9.9949}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000009", "target_total": 5, "target_people": 1, "target_objects": 4, "canvas_size": [1248, 832], "canvas_aspect_ratio": "3:2", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 5, "n_detected": 5, "n_subjects": 5, "subjects": [{"name": "pedestrian_in_suit", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", "source_name": "pedestrian in suit", "source_description": "A person walking, wearing a dark suit. Source dataset: CrowdHuman. Scene context: A crowd of tourists gathers outside a grand palace, with some sitting on a stone balustrade and others walking along a wide paved path.", "sub_caption": "pedestrian in suit: A person walking, wearing a dark suit.. Scene role: walking along the right side of the street on the sidewalk", "measured_bbox": [0.7498, 0.407, 0.8062, 0.6382], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_in_suit.png", "raw_ref_image": "references/raw_ref_pedestrian_in_suit_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_in_suit.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_pedestrian_in_suit_attempt_01.png", "output": "references/ref_pedestrian_in_suit.png", "mask": "references/sam_mask_pedestrian_in_suit.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [347.0, 40.0, 695.0, 1018.0], "mask_score": 3.473173, "mask_area_ratio": 0.152202, "elapsed_seconds": 9.5571}}, {"name": "yellow_building", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_32/273275,7595c000fa55d9d7.jpg:object:5", "source_name": "yellow building", "source_description": "A multi-story building with a yellow ochre facade and numerous shuttered windows visible in the background on the far left. Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered around a monumental, ornate stone fountain with large statues and cascading water, situated in a city square surrounded by buildings.", "sub_caption": "yellow building: A multi-story building with a yellow ochre facade and numerous shuttered windows.. Scene role: providing a backdrop on the right side of the street scene", "measured_bbox": [0.6651, 0.0, 0.9968, 0.6296], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_yellow_building.png", "raw_ref_image": "references/raw_ref_yellow_building_attempt_01.png", "reference_verify": "references/reference_verify_yellow_building.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_building_attempt_01.png", "output": "references/ref_yellow_building.png", "mask": "references/sam_mask_yellow_building.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [4.0, 0.0, 1023.0, 995.0], "mask_score": 2.131685, "mask_area_ratio": 0.742735, "elapsed_seconds": 8.5184}}, {"name": "yellow_lane_line", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c9195e43-b2fdd978:object:2", "source_name": "yellow lane line", "source_description": "A solid yellow line painted on the road surface indicating the edge of the lane. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a highway, with a white car ahead in the next lane and lush green bushes along the right side barrier.", "sub_caption": "yellow lane line: A solid yellow line painted on the road surface indicating the edge of the lane.. Scene role: extending along the road surface towards the distance", "measured_bbox": [0.4487, 0.5308, 0.5367, 1.0], "detection_confidence": 0.98, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_yellow_lane_line.png", "raw_ref_image": "references/raw_ref_yellow_lane_line_attempt_01.png", "reference_verify": "references/reference_verify_yellow_lane_line.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_lane_line_attempt_01.png", "output": "references/ref_yellow_lane_line.png", "mask": "references/sam_mask_yellow_lane_line.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [12.0, 55.0, 1018.0, 969.0], "mask_score": 0.925602, "mask_area_ratio": 0.952688, "elapsed_seconds": 8.3819}}, {"name": "overpass", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bcd32109-8b0b1eef:object:6", "source_name": "overpass", "source_description": "A concrete bridge structure spanning across the highway ahead, casting a shadow over the road. Source dataset: BDD100K. Scene context: View from a moving vehicle on a multi-lane highway, approaching an underpass with several cars ahead.", "sub_caption": "overpass: A concrete bridge structure spanning across the street ahead.. Scene role: arching over the road in the midground", "measured_bbox": [0.0031, 0.1925, 0.6919, 0.5364], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_overpass.png", "raw_ref_image": "references/raw_ref_overpass_attempt_01.png", "reference_verify": "references/reference_verify_overpass.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_overpass_attempt_01.png", "output": "references/ref_overpass.png", "mask": "references/sam_mask_overpass.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [28.0, 284.0, 965.0, 771.0], "mask_score": 3.406555, "mask_area_ratio": 0.166775, "elapsed_seconds": 8.3597}}, {"name": "street_light", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", "source_name": "street light", "source_description": "Bright, glowing street lights illuminating the road. Source dataset: BDD100K. Scene context: Nighttime driving scene on a city street with multiple vehicles, overhead structures, and traffic signals.", "sub_caption": "street light: Bright, glowing street lights illuminating the road.. Scene role: overhead fixtures providing illumination over the roadway and sidewalk", "measured_bbox": [0.5545, 0.0, 0.5804, 0.0625], "detection_confidence": 0.95, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_street_light.png", "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", "reference_verify": "references/reference_verify_street_light.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_street_light_attempt_01.png", "output": "references/ref_street_light.png", "mask": "references/sam_mask_street_light.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [205.0, 0.0, 824.0, 857.0], "mask_score": 3.391373, "mask_area_ratio": 0.189186, "elapsed_seconds": 8.2179}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000010", "target_total": 9, "target_people": 8, "target_objects": 1, "canvas_size": [1280, 720], "canvas_aspect_ratio": "16:9", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 9, "n_detected": 9, "n_subjects": 9, "subjects": [{"name": "pedestrian_black_jacket", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", "source_name": "pedestrian", "source_description": "Person walking away, wearing a black jacket and dark pants. Source dataset: CrowdHuman. Scene context: People are walking on a street in front of a large, ornate cathedral.", "sub_caption": "pedestrian: Person walking away, wearing a black jacket and dark pants.. Scene role: walking away on the sidewalk to the right", "measured_bbox": [0.8947, 0.408, 0.9768, 0.8], "detection_confidence": 0.98, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_black_jacket.png", "raw_ref_image": "references/raw_ref_pedestrian_black_jacket_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_black_jacket.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_black_jacket_attempt_01.png", "output": "references/ref_pedestrian_black_jacket.png", "mask": "references/sam_mask_pedestrian_black_jacket.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [332.0, 25.0, 691.0, 1014.0], "mask_score": 3.419812, "mask_area_ratio": 0.160983, "elapsed_seconds": 8.2172}}, {"name": "pedestrian_backpack", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", "source_name": "pedestrian", "source_description": "A man partially visible behind the man in the black polo shirt, wearing a blue t-shirt and a backpack. Source dataset: CrowdHuman. Scene context: A group of people standing on a paved street in front of stone buildings, seemingly listening to a tour guide.", "sub_caption": "pedestrian: A man wearing a blue t-shirt and a backpack.. Scene role: standing near the curb looking toward the road", "measured_bbox": [0.6953, 0.4394, 0.7156, 0.5151], "detection_confidence": 0.95, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_backpack.png", "raw_ref_image": "references/raw_ref_pedestrian_backpack_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_backpack.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_backpack_attempt_01.png", "output": "references/ref_pedestrian_backpack.png", "mask": "references/sam_mask_pedestrian_backpack.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [344.0, 30.0, 689.0, 1018.0], "mask_score": 3.422455, "mask_area_ratio": 0.157988, "elapsed_seconds": 8.1451}}, {"name": "pedestrian_red_jacket", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", "source_name": "pedestrian standing", "source_description": "A person standing near a kiosk, wearing a bright red jacket. Source dataset: CrowdHuman. Scene context: A large outdoor public square with a fountain, surrounded by buildings and trees, with many pedestrians walking around and climbing a wide set of marble stairs in the foreground.", "sub_caption": "pedestrian standing: A person standing, wearing a bright red jacket.. Scene role: waiting at a crosswalk edge", "measured_bbox": [0.4504, 0.4033, 0.474, 0.5253], "detection_confidence": 0.98, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_red_jacket.png", "raw_ref_image": "references/raw_ref_pedestrian_red_jacket_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_red_jacket.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_red_jacket_attempt_01.png", "output": "references/ref_pedestrian_red_jacket.png", "mask": "references/sam_mask_pedestrian_red_jacket.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [346.0, 92.0, 677.0, 984.0], "mask_score": 3.472322, "mask_area_ratio": 0.129704, "elapsed_seconds": 9.5973}}, {"name": "pedestrian_striped_shirt", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", "source_name": "pedestrian", "source_description": "Person in the background wearing a striped shirt and dark pants. Source dataset: CrowdHuman. Scene context: A busy city square with pedestrians, bicycles, and a yellow tram on a sunny day.", "sub_caption": "pedestrian: Person wearing a striped shirt and dark pants.. Scene role: strolling along the sidewalk", "measured_bbox": [0.7269, 0.3947, 0.7711, 0.5853], "detection_confidence": 100, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_striped_shirt.png", "raw_ref_image": "references/raw_ref_pedestrian_striped_shirt_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_striped_shirt.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_striped_shirt_attempt_01.png", "output": "references/ref_pedestrian_striped_shirt.png", "mask": "references/sam_mask_pedestrian_striped_shirt.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [338.0, 11.0, 687.0, 1018.0], "mask_score": 3.206288, "mask_area_ratio": 0.147885, "elapsed_seconds": 8.1875}}, {"name": "man_pink_shirt", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", "source_name": "man talking to young man", "source_description": "Standing next to the young man with the backpack, wearing a pink shirt and dark shorts. Source dataset: CrowdHuman. Scene context: A bustling city street corner with people walking and standing around, surrounded by buildings with large digital billboards and store signs, with a few vehicles including a prominent red SUV passing through.", "sub_caption": "man talking to young man: Man wearing a pink shirt and dark shorts.. Scene role: conversing near a storefront on the sidewalk", "measured_bbox": [0.8332, 0.3734, 0.8735, 0.5918], "detection_confidence": 0.99, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_man_pink_shirt.png", "raw_ref_image": "references/raw_ref_man_pink_shirt_attempt_01.png", "reference_verify": "references/reference_verify_man_pink_shirt.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_pink_shirt_attempt_01.png", "output": "references/ref_man_pink_shirt.png", "mask": "references/sam_mask_man_pink_shirt.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [340.0, 42.0, 689.0, 995.0], "mask_score": 3.442738, "mask_area_ratio": 0.146916, "elapsed_seconds": 8.1734}}, {"name": "pedestrian_light_jacket", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", "source_name": "pedestrian", "source_description": "Individual in the crowd near the intersection, wearing a light-colored jacket. Source dataset: CrowdHuman. Scene context: A bustling city street at night, brightly illuminated by numerous large electronic billboards on tall buildings, with heavy vehicle and pedestrian traffic below.", "sub_caption": "pedestrian: Individual wearing a light-colored jacket.. Scene role: approaching the street intersection", "measured_bbox": [0.6065, 0.3907, 0.6375, 0.4907], "detection_confidence": "high", "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_light_jacket.png", "raw_ref_image": "references/raw_ref_pedestrian_light_jacket_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_light_jacket.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_light_jacket_attempt_01.png", "output": "references/ref_pedestrian_light_jacket.png", "mask": "references/sam_mask_pedestrian_light_jacket.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [342.0, 28.0, 681.0, 1013.0], "mask_score": 3.460161, "mask_area_ratio": 0.163844, "elapsed_seconds": 9.6744}}, {"name": "pedestrian_light_blue_shirt", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", "source_name": "pedestrian", "source_description": "A person in a light blue shirt walking away from the camera. Source dataset: CrowdHuman. Scene context: A wide, tree-lined avenue crowded with many pedestrians walking in both directions, adorned with red banners on the trees under a clear blue sky.", "sub_caption": "pedestrian: A person in a light blue shirt walking away from the camera.. Scene role: walking further down the sidewalk in the mid-ground", "measured_bbox": [0.9459, 0.3895, 0.9964, 0.6538], "detection_confidence": 100, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_pedestrian_light_blue_shirt.png", "raw_ref_image": "references/raw_ref_pedestrian_light_blue_shirt_attempt_01.png", "reference_verify": "references/reference_verify_pedestrian_light_blue_shirt.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_light_blue_shirt_attempt_01.png", "output": "references/ref_pedestrian_light_blue_shirt.png", "mask": "references/sam_mask_pedestrian_light_blue_shirt.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [357.0, 33.0, 693.0, 1012.0], "mask_score": 3.452806, "mask_area_ratio": 0.153078, "elapsed_seconds": 9.6513}}, {"name": "distant_pedestrian", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_21/283991,2145f000467f0a28.jpg:person:14", "source_name": "distant pedestrian", "source_description": "A person walking in the background. Source dataset: CrowdHuman. Scene context: People walk through an outdoor plaza area with modern architecture, an outdoor seating section with red chairs on the left, and planters with yellow and blue flowers on the right.", "sub_caption": "distant pedestrian: A person walking in the background.. Scene role: walking in the far background down the street", "measured_bbox": [0.6066, 0.3904, 0.6375, 0.489], "detection_confidence": 1.0, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_distant_pedestrian.png", "raw_ref_image": "references/raw_ref_distant_pedestrian_attempt_01.png", "reference_verify": "references/reference_verify_distant_pedestrian.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_distant_pedestrian_attempt_01.png", "output": "references/ref_distant_pedestrian.png", "mask": "references/sam_mask_distant_pedestrian.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [360.0, 88.0, 677.0, 997.0], "mask_score": 3.489431, "mask_area_ratio": 0.138401, "elapsed_seconds": 8.1869}}, {"name": "black_suv", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:b8fe1054-42625c45:object:1", "source_name": "black suv", "source_description": "A black SUV parked ahead of the silver SUV on the right. Source dataset: BDD100K. Scene context: View from a vehicle driving down a cracked city street lined with parked cars, buildings, and trees on a sunny day.", "sub_caption": "black suv: A black SUV parked ahead on the right.. Scene role: parked at the curb on the right side of the street", "measured_bbox": [0.4391, 0.47, 0.6899, 0.8264], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_black_suv.png", "raw_ref_image": "references/raw_ref_black_suv_attempt_01.png", "reference_verify": "references/reference_verify_black_suv.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_black_suv_attempt_01.png", "output": "references/ref_black_suv.png", "mask": "references/sam_mask_black_suv.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [16.0, 190.0, 1007.0, 843.0], "mask_score": 3.120914, "mask_area_ratio": 0.384048, "elapsed_seconds": 9.8714}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} +{"sample_id": "sample_000011", "target_total": 10, "target_people": 1, "target_objects": 9, "canvas_size": [1280, 720], "canvas_aspect_ratio": "16:9", "main_image": "main_image.png", "bbox_overlay": "bbox_overlay.png", "plan": "plan.json", "detections": "detections.json", "vocab_task": "vocab_task.json", "n_planned": 10, "n_detected": 10, "n_subjects": 10, "subjects": [{"name": "blonde_woman", "is_person": true, "subject_type": "person", "source_set": "people_set", "source_image_id": "CrowdHuman:data/data_8/273275,44ab30007bea86d2.jpg:person:5", "source_name": "woman in foreground", "source_description": "A woman with blonde hair, seen in profile in the bottom center of the foreground. Source dataset: CrowdHuman. Scene context: A large crowd of people is gathered on a city street, many standing behind metal barricades, while some are taking photos.", "sub_caption": "woman in foreground: A woman with blonde hair, seen in profile.. Scene role: walking along the right sidewalk under the street lamps", "measured_bbox": [0.7873, 0.3886, 0.8283, 0.5843], "detection_confidence": 0.95, "ref_style": "white_bg_full_body_front", "ref_image": "references/ref_blonde_woman.png", "raw_ref_image": "references/raw_ref_blonde_woman_attempt_02.png", "reference_verify": "references/reference_verify_blonde_woman.json", "reference_verify_passed": true, "reference_attempts": 2, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_blonde_woman_attempt_02.png", "output": "references/ref_blonde_woman.png", "mask": "references/sam_mask_blonde_woman.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [347.0, 60.0, 686.0, 982.0], "mask_score": 3.476833, "mask_area_ratio": 0.13921, "elapsed_seconds": 10.126}}, {"name": "metal_structure", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_54/273278,11abb000d12e48e5.jpg:object:5", "source_name": "metal structure", "source_description": "A tall, rectangular grey metal box or pillar on the right side, which one man is holding onto. Source dataset: CrowdHuman. Scene context: A crowded subway station with a metal barrier, where two people are climbing over the barrier while others stand by.", "sub_caption": "metal structure: A tall, rectangular grey metal box or pillar.. Scene role: situated on the edge of the sidewalk as a utility box", "measured_bbox": [0.8171, 0.3117, 0.944, 0.5699], "detection_confidence": 0.98, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_metal_structure.png", "raw_ref_image": "references/raw_ref_metal_structure_attempt_01.png", "reference_verify": "references/reference_verify_metal_structure.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_metal_structure_attempt_01.png", "output": "references/ref_metal_structure.png", "mask": "references/sam_mask_metal_structure.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [91.0, 32.0, 932.0, 1001.0], "mask_score": 3.477494, "mask_area_ratio": 0.534141, "elapsed_seconds": 8.3774}}, {"name": "overhead_streetlights", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bd1b8b79-829e787f:object:5", "source_name": "streetlights", "source_description": "Tall poles with bright lights illuminating the street from above, casting long reflections on the wet pavement. Source dataset: BDD100K. Scene context: Nighttime driving scene on a wet city street with streetlights reflecting on the road.", "sub_caption": "streetlights: Tall poles with bright lights illuminating the street from above, casting long reflections on the wet pavement.. Scene role: providing overhead illumination for the wet road", "measured_bbox": [0.5014, 0.0563, 0.5225, 0.1017], "detection_confidence": 0.9, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_overhead_streetlights.png", "raw_ref_image": "references/raw_ref_overhead_streetlights_attempt_03.png", "reference_verify": "references/reference_verify_overhead_streetlights.json", "reference_verify_passed": true, "reference_attempts": 3, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_overhead_streetlights_attempt_03.png", "output": "references/ref_overhead_streetlights.png", "mask": "references/sam_mask_overhead_streetlights.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [18.0, 185.0, 995.0, 821.0], "mask_score": 3.398942, "mask_area_ratio": 0.279411, "elapsed_seconds": 8.219}}, {"name": "silver_car", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:bba4ee51-3badc9f8:object:6", "source_name": "silver car", "source_description": "Silver car parked further down the street on the right. Source dataset: BDD100K. Scene context: View from inside a car driving down a residential street lined with parked cars and trees.", "sub_caption": "silver car: A silver car.. Scene role: parked further down the street on the right curbside", "measured_bbox": [0.5294, 0.4494, 0.6488, 0.6006], "detection_confidence": 1.0, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_silver_car.png", "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", "reference_verify": "references/reference_verify_silver_car.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_silver_car_attempt_01.png", "output": "references/ref_silver_car.png", "mask": "references/sam_mask_silver_car.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [47.0, 245.0, 984.0, 779.0], "mask_score": 3.398036, "mask_area_ratio": 0.279834, "elapsed_seconds": 8.4126}}, {"name": "white_suv", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c5e32cf6-7d2e04b4:object:0", "source_name": "white suv", "source_description": "A white Honda CR-V parked or stopped in the rightmost lane, showing its rear passenger side. Source dataset: BDD100K. Scene context: A view from a car driving down a multi-lane city street with parked cars on the right, oncoming traffic on the left, and a highway overpass in the distance on a sunny day.", "sub_caption": "white suv: A white SUV showing its rear passenger side.. Scene role: stopped or parked in the rightmost lane ahead", "measured_bbox": [0.5082, 0.4246, 0.5826, 0.5552], "detection_confidence": "high", "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_white_suv.png", "raw_ref_image": "references/raw_ref_white_suv_attempt_01.png", "reference_verify": "references/reference_verify_white_suv.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_white_suv_attempt_01.png", "output": "references/ref_white_suv.png", "mask": "references/sam_mask_white_suv.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [26.0, 237.0, 1002.0, 800.0], "mask_score": 3.457781, "mask_area_ratio": 0.315623, "elapsed_seconds": 10.4584}}, {"name": "background_street_light", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_15/273278,8d2ae00027075d75.jpg:object:5", "source_name": "street light", "source_description": "bright street lights illuminating the area in the background Source dataset: CrowdHuman. Scene context: A group of people wearing athletic clothing are posed for a group photo outdoors at night.", "sub_caption": "street light: Bright street lights.. Scene role: illuminating the distant background area of the street", "measured_bbox": [0.5004, 0.0592, 0.5237, 0.0998], "detection_confidence": 0.9, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_background_street_light.png", "raw_ref_image": "references/raw_ref_background_street_light_attempt_01.png", "reference_verify": "references/reference_verify_background_street_light.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_background_street_light_attempt_01.png", "output": "references/ref_background_street_light.png", "mask": "references/sam_mask_background_street_light.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [152.0, 95.0, 875.0, 938.0], "mask_score": 3.437329, "mask_area_ratio": 0.065212, "elapsed_seconds": 8.1426}}, {"name": "oncoming_white_car", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:be3159f3-13250ffe:object:2", "source_name": "white car", "source_description": "A white car driving toward the camera in the oncoming lane, further down the road. Source dataset: BDD100K. Scene context: A daytime street view from a vehicle approaching an intersection with traffic lights and several other cars.", "sub_caption": "white car: A white car driving toward the camera with its headlights visible.. Scene role: active oncoming traffic in the left lane", "measured_bbox": [0.1529, 0.4523, 0.2891, 0.5849], "detection_confidence": 0.98, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_oncoming_white_car.png", "raw_ref_image": "references/raw_ref_oncoming_white_car_attempt_01.png", "reference_verify": "references/reference_verify_oncoming_white_car.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_oncoming_white_car_attempt_01.png", "output": "references/ref_oncoming_white_car.png", "mask": "references/sam_mask_oncoming_white_car.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [13.0, 304.0, 1011.0, 819.0], "mask_score": 3.453796, "mask_area_ratio": 0.299096, "elapsed_seconds": 8.2507}}, {"name": "windshield_mount", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c5694077-e345e2a8:object:12", "source_name": "windshield mount", "source_description": "A black mount attached to the inside of the windshield, partially obscuring the view. Source dataset: BDD100K. Scene context: A view from inside a car driving down a busy city street flanked by tall buildings, with various vehicles including cars, a delivery truck, and a cyclist in the foreground.", "sub_caption": "windshield mount: A black mount attached to the inside of the windshield.. Scene role: partially obscuring the top view, framing the dashcam perspective", "measured_bbox": [0.0, 0.0, 0.4767, 0.1353], "detection_confidence": "high", "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_windshield_mount.png", "raw_ref_image": "references/raw_ref_windshield_mount_attempt_01.png", "reference_verify": "references/reference_verify_windshield_mount.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_windshield_mount_attempt_01.png", "output": "references/ref_windshield_mount.png", "mask": "references/sam_mask_windshield_mount.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [88.0, 131.0, 967.0, 895.0], "mask_score": 3.43205, "mask_area_ratio": 0.236237, "elapsed_seconds": 8.17}}, {"name": "ego_car_dashboard", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "BDD100K:c187431f-7b6ad6d6:object:0", "source_name": "ego car dashboard", "source_description": "The dark, lower foreground showing part of the dashboard and hood of the vehicle recording the video, with red reflections from taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a multi-lane highway with moderate traffic, showing vehicles ahead and green overhead highway signs.", "sub_caption": "ego car dashboard: The dark, lower foreground showing part of the dashboard and hood of the vehicle, with red ambient reflections.. Scene role: anchors the bottom of the frame, establishing the driver's perspective", "measured_bbox": [0.0, 0.7306, 1.0, 1.0], "detection_confidence": 1.0, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_ego_car_dashboard.png", "raw_ref_image": "references/raw_ref_ego_car_dashboard_attempt_01.png", "reference_verify": "references/reference_verify_ego_car_dashboard.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_ego_car_dashboard_attempt_01.png", "output": "references/ref_ego_car_dashboard.png", "mask": "references/sam_mask_ego_car_dashboard.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [0.0, 327.0, 1023.0, 788.0], "mask_score": 3.233951, "mask_area_ratio": 0.206886, "elapsed_seconds": 10.2609}}, {"name": "vintage_street_lamp", "is_person": false, "subject_type": "object", "source_set": "obj_set", "source_image_id": "CrowdHuman:data/data_29/273275,2032200056dda99e.jpg:object:0", "source_name": "street lamp", "source_description": "Ornate, black, vintage-style street lamp post. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking along a sidewalk lined with shops and tall, leafy trees.", "sub_caption": "street lamp: An ornate, black, vintage-style street lamp post.. Scene role: providing decorative lighting on the right sidewalk next to the pedestrian", "measured_bbox": [0.7202, 0.0, 0.7633, 0.6024], "detection_confidence": 0.99, "ref_style": "white_bg_encyclopedia_photo", "ref_image": "references/ref_vintage_street_lamp.png", "raw_ref_image": "references/raw_ref_vintage_street_lamp_attempt_01.png", "reference_verify": "references/reference_verify_vintage_street_lamp.json", "reference_verify_passed": true, "reference_attempts": 1, "sam_white_bg": {"input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_vintage_street_lamp_attempt_01.png", "output": "references/ref_vintage_street_lamp.png", "mask": "references/sam_mask_vintage_street_lamp.png", "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", "sam_model_type": "vit_b", "sam_device": "auto", "sam_working_size": [640, 640], "sam_max_side": 640, "sam_downscale": 0.625, "prompt_box_xyxy": [343.0, 0.0, 599.0, 1023.0], "mask_score": 3.457917, "mask_area_ratio": 0.047438, "elapsed_seconds": 8.3114}}], "not_emitted": [], "model_ids": {"chat_model": "gcp/google/gemini-3.1-pro-preview", "image_model": "gcp/google/gemini-3-pro-image-preview"}} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000001.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000001.json new file mode 100644 index 0000000000000000000000000000000000000000..6a016e7cbfb2a7ba266347923a204bdc2fb285eb --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000001.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000001", + "plan_path": "sample_000001/plan.json", + "task_path": "sample_000001/vocab_task.json", + "main_image": "sample_000001/main_image.png", + "detections": "sample_000001/detections.json", + "n_detected": 3, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000001", + "pool": "detection_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000002.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000002.json new file mode 100644 index 0000000000000000000000000000000000000000..f7a9d948df98a917de2f5c1b1bb87a33d5a344d3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000002.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000002", + "plan_path": "sample_000002/plan.json", + "task_path": "sample_000002/vocab_task.json", + "main_image": "sample_000002/main_image.png", + "detections": "sample_000002/detections.json", + "n_detected": 15, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000002", + "pool": "detection_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000003.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000003.json new file mode 100644 index 0000000000000000000000000000000000000000..b25aedb689727ef7284d2ca905e87ba2906b74cf --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000003.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000003", + "plan_path": "sample_000003/plan.json", + "task_path": "sample_000003/vocab_task.json", + "main_image": "sample_000003/main_image.png", + "detections": "sample_000003/detections.json", + "n_detected": 3, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000003", + "pool": "detection_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000004.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000004.json new file mode 100644 index 0000000000000000000000000000000000000000..f40f336f128af5e7e15f22921c27ec7aa9c61d24 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000004.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000004", + "plan_path": "sample_000004/plan.json", + "task_path": "sample_000004/vocab_task.json", + "main_image": "sample_000004/main_image.png", + "detections": "sample_000004/detections.json", + "n_detected": 5, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000004", + "pool": "detection_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000005.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000005.json new file mode 100644 index 0000000000000000000000000000000000000000..03ed0f70c943bc5440cb95b8a1b1f89de8d110f6 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000005.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000005", + "plan_path": "sample_000005/plan.json", + "task_path": "sample_000005/vocab_task.json", + "main_image": "sample_000005/main_image.png", + "detections": "sample_000005/detections.json", + "n_detected": 6, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000005", + "pool": "detection_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000006.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000006.json new file mode 100644 index 0000000000000000000000000000000000000000..3ed9eef0b60cc3b94c5aa658234c2f3b5d3f343d --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000006.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000006", + "plan_path": "sample_000006/plan.json", + "task_path": "sample_000006/vocab_task.json", + "main_image": "sample_000006/main_image.png", + "detections": "sample_000006/detections.json", + "n_detected": 8, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000006", + "pool": "detection_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000008.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000008.json new file mode 100644 index 0000000000000000000000000000000000000000..9b16fa3a8a181c2c4b2de45e0e083a4d038bb357 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000008.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000008", + "plan_path": "sample_000008/plan.json", + "task_path": "sample_000008/vocab_task.json", + "main_image": "sample_000008/main_image.png", + "detections": "sample_000008/detections.json", + "n_detected": 10, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000008", + "pool": "detection_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000009.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000009.json new file mode 100644 index 0000000000000000000000000000000000000000..56298ed97dc892b6d0e7828308a3b4b27da78110 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000009.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000009", + "plan_path": "sample_000009/plan.json", + "task_path": "sample_000009/vocab_task.json", + "main_image": "sample_000009/main_image.png", + "detections": "sample_000009/detections.json", + "n_detected": 5, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000009", + "pool": "detection_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000010.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000010.json new file mode 100644 index 0000000000000000000000000000000000000000..ed77a8c9807dd70bc03fecc518643c278d9b7b8c --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000010.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000010", + "plan_path": "sample_000010/plan.json", + "task_path": "sample_000010/vocab_task.json", + "main_image": "sample_000010/main_image.png", + "detections": "sample_000010/detections.json", + "n_detected": 9, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000010", + "pool": "detection_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000011.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000011.json new file mode 100644 index 0000000000000000000000000000000000000000..491bc1184efcc808d55f6c2bd1dc16f966999749 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000011.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000011", + "plan_path": "sample_000011/plan.json", + "task_path": "sample_000011/vocab_task.json", + "main_image": "sample_000011/main_image.png", + "detections": "sample_000011/detections.json", + "n_detected": 10, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000011", + "pool": "detection_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000001.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000001.json new file mode 100644 index 0000000000000000000000000000000000000000..64fd57efb906647af4a3e085e9a20b02e99016fa --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000001.json @@ -0,0 +1,4 @@ +{ + "sample_id": "sample_000001", + "row": "sample_000001/row.json" +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000002.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000002.json new file mode 100644 index 0000000000000000000000000000000000000000..df3b957d0c46db4687785605d95f8dd1cf831207 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000002.json @@ -0,0 +1,4 @@ +{ + "sample_id": "sample_000002", + "row": "sample_000002/row.json" +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000003.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000003.json new file mode 100644 index 0000000000000000000000000000000000000000..ff0968713613b2c92d71ff03fa795c586634d671 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000003.json @@ -0,0 +1,4 @@ +{ + "sample_id": "sample_000003", + "row": "sample_000003/row.json" +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000004.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000004.json new file mode 100644 index 0000000000000000000000000000000000000000..faae18874e83856e048ef243f5e9824b4e70a448 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000004.json @@ -0,0 +1,4 @@ +{ + "sample_id": "sample_000004", + "row": "sample_000004/row.json" +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000005.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000005.json new file mode 100644 index 0000000000000000000000000000000000000000..f361f5514fe3ae9586e376112f859ad1c648e2f2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000005.json @@ -0,0 +1,4 @@ +{ + "sample_id": "sample_000005", + "row": "sample_000005/row.json" +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000006.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000006.json new file mode 100644 index 0000000000000000000000000000000000000000..bba746622988c822c4538ecf7960fc469dc76dda --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000006.json @@ -0,0 +1,4 @@ +{ + "sample_id": "sample_000006", + "row": "sample_000006/row.json" +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000008.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000008.json new file mode 100644 index 0000000000000000000000000000000000000000..a5ab986f3299e71f9a73b25d79b2c1d8d18a9076 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000008.json @@ -0,0 +1,4 @@ +{ + "sample_id": "sample_000008", + "row": "sample_000008/row.json" +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000009.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000009.json new file mode 100644 index 0000000000000000000000000000000000000000..fcb5d99e3592f42c93fa0307eec42780967578dc --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000009.json @@ -0,0 +1,4 @@ +{ + "sample_id": "sample_000009", + "row": "sample_000009/row.json" +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000010.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000010.json new file mode 100644 index 0000000000000000000000000000000000000000..cd4e0ba4748f5fc45cf2007f4067b22c15ad6ddf --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000010.json @@ -0,0 +1,4 @@ +{ + "sample_id": "sample_000010", + "row": "sample_000010/row.json" +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000011.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000011.json new file mode 100644 index 0000000000000000000000000000000000000000..e9a99bbc10226a952ae2914eb9d3c3f13a4b93cb --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000011.json @@ -0,0 +1,4 @@ +{ + "sample_id": "sample_000011", + "row": "sample_000011/row.json" +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000001.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000001.json new file mode 100644 index 0000000000000000000000000000000000000000..784f9ac3f389045708ddb15fa2a1b283aeb26819 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000001.json @@ -0,0 +1,14 @@ +{ + "sample_id": "sample_000001", + "plan_path": "sample_000001/plan.json", + "task_path": "sample_000001/vocab_task.json", + "prompt_hash": "ee63c678fc09a67bb20b6d08e8ef2b19732be3312cf3b823e6d07e1c5dd44f63", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000001", + "pool": "plan_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000002.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000002.json new file mode 100644 index 0000000000000000000000000000000000000000..9a9ffe77eb96687afebcc01244488e2ebc99feb9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000002.json @@ -0,0 +1,14 @@ +{ + "sample_id": "sample_000002", + "plan_path": "sample_000002/plan.json", + "task_path": "sample_000002/vocab_task.json", + "prompt_hash": "c3e70d0e58500cbcc95ef2a96d5d6793951917c615216803cc4c45dcc2a3a379", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000002", + "pool": "plan_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000003.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000003.json new file mode 100644 index 0000000000000000000000000000000000000000..30e7333a56f61c36c0e94d54bf8c5367b7289b61 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000003.json @@ -0,0 +1,14 @@ +{ + "sample_id": "sample_000003", + "plan_path": "sample_000003/plan.json", + "task_path": "sample_000003/vocab_task.json", + "prompt_hash": "117c1932f1edb3ce9fdfb3e81dfacb7b09d1402b748b361e9e812444f5375b35", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000003", + "pool": "plan_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000004.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000004.json new file mode 100644 index 0000000000000000000000000000000000000000..e67ee17130bdeb1279ec8764cf887b1e43340818 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000004.json @@ -0,0 +1,14 @@ +{ + "sample_id": "sample_000004", + "plan_path": "sample_000004/plan.json", + "task_path": "sample_000004/vocab_task.json", + "prompt_hash": "6960fecb1d8acad95182b833cb50f6e2533d54c0b80d08771e7bc3b42d40e3d8", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000004", + "pool": "plan_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000005.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000005.json new file mode 100644 index 0000000000000000000000000000000000000000..8939fdd9e97fea949447a23042c786f8356609cf --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000005.json @@ -0,0 +1,14 @@ +{ + "sample_id": "sample_000005", + "plan_path": "sample_000005/plan.json", + "task_path": "sample_000005/vocab_task.json", + "prompt_hash": "a48c0aa92c5c1cd47384926ff0c246c81330b81484f9df4241df125f4b568141", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000005", + "pool": "plan_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000006.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000006.json new file mode 100644 index 0000000000000000000000000000000000000000..902ba406b4fe76bfda90ca1b122abca3f885ccc1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000006.json @@ -0,0 +1,14 @@ +{ + "sample_id": "sample_000006", + "plan_path": "sample_000006/plan.json", + "task_path": "sample_000006/vocab_task.json", + "prompt_hash": "d570570e4c1353a40b5e3c9c048efe37edf22c7e2a5e5977e66dc49033cbd19d", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000006", + "pool": "plan_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000007.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000007.json new file mode 100644 index 0000000000000000000000000000000000000000..f2c078282a373b60c9d19b02c409780290f3b940 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000007.json @@ -0,0 +1,14 @@ +{ + "sample_id": "sample_000007", + "plan_path": "sample_000007/plan.json", + "task_path": "sample_000007/vocab_task.json", + "prompt_hash": "9bdbac530e2f6fdaa0bef9408c2676077dc19d2a8c0de7167594ca7912f19985", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000007", + "pool": "plan_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000008.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000008.json new file mode 100644 index 0000000000000000000000000000000000000000..056e2a7b9e567ae7df756b5362b11986a9580c2a --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000008.json @@ -0,0 +1,14 @@ +{ + "sample_id": "sample_000008", + "plan_path": "sample_000008/plan.json", + "task_path": "sample_000008/vocab_task.json", + "prompt_hash": "587eff1134028954e2fb620b54fe1638b5a6fcbd3fe47f6dbdbd07d717459d81", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000008", + "pool": "plan_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000009.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000009.json new file mode 100644 index 0000000000000000000000000000000000000000..57527fee9e7f8f97a58450e071baf9ad6cda567a --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000009.json @@ -0,0 +1,14 @@ +{ + "sample_id": "sample_000009", + "plan_path": "sample_000009/plan.json", + "task_path": "sample_000009/vocab_task.json", + "prompt_hash": "a012f98a386adac92d188c68bde72a52c6cbb0dcfd864a08e70d6f08881a15de", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000009", + "pool": "plan_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000010.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000010.json new file mode 100644 index 0000000000000000000000000000000000000000..1766dbb9a6896eb790e758732f228a57068afec4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000010.json @@ -0,0 +1,14 @@ +{ + "sample_id": "sample_000010", + "plan_path": "sample_000010/plan.json", + "task_path": "sample_000010/vocab_task.json", + "prompt_hash": "0ac9cdba25e93fc329b5dd2d566e77d7add5c4dffbf9c53d4f9b8998e0b21917", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000010", + "pool": "plan_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000011.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000011.json new file mode 100644 index 0000000000000000000000000000000000000000..9fe22930eab83df888f270cf949d8f281dade870 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000011.json @@ -0,0 +1,14 @@ +{ + "sample_id": "sample_000011", + "plan_path": "sample_000011/plan.json", + "task_path": "sample_000011/vocab_task.json", + "prompt_hash": "9b1f48926b835cfec59e58b4564016b13af9e8f32197b9bfe4a85cddce267178", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000011", + "pool": "plan_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000001.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000001.json new file mode 100644 index 0000000000000000000000000000000000000000..86ebed9b49e21a1f17a8f10ac99469542619da58 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000001.json @@ -0,0 +1,18 @@ +{ + "sample_id": "sample_000001", + "plan_path": "sample_000001/plan.json", + "task_path": "sample_000001/vocab_task.json", + "main_image": "sample_000001/main_image.png", + "detections": "sample_000001/detections.json", + "references": "sample_000001/references.json", + "n_references": 3, + "reference_errors": {}, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000001", + "pool": "reference_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000002.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000002.json new file mode 100644 index 0000000000000000000000000000000000000000..b28a5ac8cee232b2d077c6d5edf0c0cc20ee9219 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000002.json @@ -0,0 +1,18 @@ +{ + "sample_id": "sample_000002", + "plan_path": "sample_000002/plan.json", + "task_path": "sample_000002/vocab_task.json", + "main_image": "sample_000002/main_image.png", + "detections": "sample_000002/detections.json", + "references": "sample_000002/references.json", + "n_references": 15, + "reference_errors": {}, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000002", + "pool": "reference_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000003.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000003.json new file mode 100644 index 0000000000000000000000000000000000000000..08509d33a48cc140f2a135490d689a7bd640f1e4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000003.json @@ -0,0 +1,18 @@ +{ + "sample_id": "sample_000003", + "plan_path": "sample_000003/plan.json", + "task_path": "sample_000003/vocab_task.json", + "main_image": "sample_000003/main_image.png", + "detections": "sample_000003/detections.json", + "references": "sample_000003/references.json", + "n_references": 3, + "reference_errors": {}, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000003", + "pool": "reference_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000004.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000004.json new file mode 100644 index 0000000000000000000000000000000000000000..a7c5e40f7f1878828b8ce15b462ce1ac3469f595 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000004.json @@ -0,0 +1,18 @@ +{ + "sample_id": "sample_000004", + "plan_path": "sample_000004/plan.json", + "task_path": "sample_000004/vocab_task.json", + "main_image": "sample_000004/main_image.png", + "detections": "sample_000004/detections.json", + "references": "sample_000004/references.json", + "n_references": 5, + "reference_errors": {}, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000004", + "pool": "reference_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000005.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000005.json new file mode 100644 index 0000000000000000000000000000000000000000..f87ac5a26cbbf7728590f11a74c891cb3df38c7f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000005.json @@ -0,0 +1,18 @@ +{ + "sample_id": "sample_000005", + "plan_path": "sample_000005/plan.json", + "task_path": "sample_000005/vocab_task.json", + "main_image": "sample_000005/main_image.png", + "detections": "sample_000005/detections.json", + "references": "sample_000005/references.json", + "n_references": 6, + "reference_errors": {}, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000005", + "pool": "reference_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000006.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000006.json new file mode 100644 index 0000000000000000000000000000000000000000..1754b27839a1e822553c88896224ad3752c047f2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000006.json @@ -0,0 +1,18 @@ +{ + "sample_id": "sample_000006", + "plan_path": "sample_000006/plan.json", + "task_path": "sample_000006/vocab_task.json", + "main_image": "sample_000006/main_image.png", + "detections": "sample_000006/detections.json", + "references": "sample_000006/references.json", + "n_references": 8, + "reference_errors": {}, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000006", + "pool": "reference_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000008.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000008.json new file mode 100644 index 0000000000000000000000000000000000000000..ef0e428ae07b9c48a4d817602609e5105c1ad485 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000008.json @@ -0,0 +1,18 @@ +{ + "sample_id": "sample_000008", + "plan_path": "sample_000008/plan.json", + "task_path": "sample_000008/vocab_task.json", + "main_image": "sample_000008/main_image.png", + "detections": "sample_000008/detections.json", + "references": "sample_000008/references.json", + "n_references": 10, + "reference_errors": {}, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000008", + "pool": "reference_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000009.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000009.json new file mode 100644 index 0000000000000000000000000000000000000000..0b06f2f83d20135b5ffdb0bfc12da679cd5c02c2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000009.json @@ -0,0 +1,18 @@ +{ + "sample_id": "sample_000009", + "plan_path": "sample_000009/plan.json", + "task_path": "sample_000009/vocab_task.json", + "main_image": "sample_000009/main_image.png", + "detections": "sample_000009/detections.json", + "references": "sample_000009/references.json", + "n_references": 5, + "reference_errors": {}, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000009", + "pool": "reference_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000010.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000010.json new file mode 100644 index 0000000000000000000000000000000000000000..d9f3f8f8c770a8f4791feacad3852bbf8417fb12 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000010.json @@ -0,0 +1,18 @@ +{ + "sample_id": "sample_000010", + "plan_path": "sample_000010/plan.json", + "task_path": "sample_000010/vocab_task.json", + "main_image": "sample_000010/main_image.png", + "detections": "sample_000010/detections.json", + "references": "sample_000010/references.json", + "n_references": 9, + "reference_errors": {}, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000010", + "pool": "reference_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000011.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000011.json new file mode 100644 index 0000000000000000000000000000000000000000..853fd1e05047a734e5439eecf213315bfd2cf570 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000011.json @@ -0,0 +1,18 @@ +{ + "sample_id": "sample_000011", + "plan_path": "sample_000011/plan.json", + "task_path": "sample_000011/vocab_task.json", + "main_image": "sample_000011/main_image.png", + "detections": "sample_000011/detections.json", + "references": "sample_000011/references.json", + "n_references": 10, + "reference_errors": {}, + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000011", + "pool": "reference_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000001.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000001.json new file mode 100644 index 0000000000000000000000000000000000000000..b58924ea676d48cf68e5a36b20fecd5e98653637 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000001.json @@ -0,0 +1,164 @@ +{ + "sample_id": "sample_000001", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 3, + "n_detected": 3, + "n_subjects": 3, + "subjects": [ + { + "name": "pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", + "source_name": "pedestrian", + "source_description": "A person wearing a dark coat and trousers. Source dataset: CrowdHuman. Scene context: A bustling city street lined with trees showcasing vibrant yellow autumn foliage, with many pedestrians walking in both directions.", + "sub_caption": "pedestrian: A person wearing a dark coat and trousers.. Scene role: Walking on the sidewalk alongside the street.", + "measured_bbox": [ + 0.177, + 0.0, + 0.3091, + 0.4552 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian.png", + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_pedestrian_attempt_01.png", + "output": "references/ref_pedestrian.png", + "mask": "references/sam_mask_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 40.0, + 693.0, + 999.0 + ], + "mask_score": 3.438137, + "mask_area_ratio": 0.157722, + "elapsed_seconds": 8.8351 + } + }, + { + "name": "parked_dark_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9", + "source_name": "parked dark car", + "source_description": "A dark-colored sedan parked on the right side of the street, behind the silver car. Source dataset: BDD100K. Scene context: A large white New York City bus is driving down a city street on an overcast day with other cars parked and driving around it.", + "sub_caption": "parked dark car: A dark-colored sedan.. Scene role: Parked on the right side of the street next to the curb.", + "measured_bbox": [ + 0.5856, + 0.0522, + 0.9973, + 0.6586 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_parked_dark_car.png", + "raw_ref_image": "references/raw_ref_parked_dark_car_attempt_02.png", + "reference_verify": "references/reference_verify_parked_dark_car.json", + "reference_verify_passed": true, + "reference_attempts": 2, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_parked_dark_car_attempt_02.png", + "output": "references/ref_parked_dark_car.png", + "mask": "references/sam_mask_parked_dark_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 17.0, + 427.0, + 1006.0, + 796.0 + ], + "mask_score": 3.312519, + "mask_area_ratio": 0.186911, + "elapsed_seconds": 8.4991 + } + }, + { + "name": "metal_barrier", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", + "source_name": "metal barrier", + "source_description": "silver metal barricade placed near the entrance Source dataset: CrowdHuman. Scene context: People are gathered outside the entrance of a stone building with arched doorways and large windows.", + "sub_caption": "metal barrier: A silver metal barricade.. Scene role: Placed along the edge of the sidewalk near the parked car, separating the walkway from the street.", + "measured_bbox": [ + 0.0, + 0.1355, + 0.6068, + 0.558 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_metal_barrier.png", + "raw_ref_image": "references/raw_ref_metal_barrier_attempt_01.png", + "reference_verify": "references/reference_verify_metal_barrier.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_metal_barrier_attempt_01.png", + "output": "references/ref_metal_barrier.png", + "mask": "references/sam_mask_metal_barrier.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 37.0, + 201.0, + 1011.0, + 889.0 + ], + "mask_score": 2.936982, + "mask_area_ratio": 0.305722, + "elapsed_seconds": 9.8709 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000002.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000002.json new file mode 100644 index 0000000000000000000000000000000000000000..6a67e01ff3fd6dc56330bcfa2185344802267159 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000002.json @@ -0,0 +1,716 @@ +{ + "sample_id": "sample_000002", + "target_total": 15, + "target_people": 2, + "target_objects": 13, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 15, + "n_detected": 15, + "n_subjects": 15, + "subjects": [ + { + "name": "pedestrian_right", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2", + "source_name": "pedestrian", + "source_description": "Person walking away from the camera on the right sidewalk, wearing dark clothing. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a wet city street lined with tall buildings, with pedestrians on the sidewalk and several cars and taxis ahead in the rain.", + "sub_caption": "pedestrian: Person walking away from the camera on the right sidewalk, wearing dark clothing.. Scene role: Walking along the right-hand sidewalk under the trees.", + "measured_bbox": [ + 0.8872, + 0.491, + 0.9451, + 0.6701 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_right.png", + "raw_ref_image": "references/raw_ref_pedestrian_right_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_right.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_right_attempt_01.png", + "output": "references/ref_pedestrian_right.png", + "mask": "references/sam_mask_pedestrian_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 345.0, + 55.0, + 678.0, + 982.0 + ], + "mask_score": 3.462354, + "mask_area_ratio": 0.14014, + "elapsed_seconds": 8.2387 + } + }, + { + "name": "pedestrian_left", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:b714a088-861a043b:person:2", + "source_name": "pedestrian", + "source_description": "another person near the shop entrance on the left, partially obscured Source dataset: BDD100K. Scene context: a city street scene from the perspective of a vehicle, showing multiple cars in motion, buildings lining the road, and pedestrians on the sidewalks during dusk or early evening", + "sub_caption": "pedestrian: A person near a shop entrance on the left, partially obscured by shadows.. Scene role: Standing near a building entrance on the left side of the street.", + "measured_bbox": [ + 0.1301, + 0.5154, + 0.1517, + 0.611 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_left.png", + "raw_ref_image": "references/raw_ref_pedestrian_left_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_left.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_left_attempt_01.png", + "output": "references/ref_pedestrian_left.png", + "mask": "references/sam_mask_pedestrian_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 67.0, + 681.0, + 996.0 + ], + "mask_score": 3.481605, + "mask_area_ratio": 0.150858, + "elapsed_seconds": 8.1403 + } + }, + { + "name": "city_buildings", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", + "source_name": "building", + "source_description": "Various city buildings of different heights forming the skyline in the background. Source dataset: CrowdHuman. Scene context: People are walking and resting on the wooden walkway of a large suspension bridge with a city skyline in the background.", + "sub_caption": "building: Various city buildings of different heights forming the urban landscape along the street.. Scene role: Lining the street and forming the architectural background on both sides.", + "measured_bbox": [ + 0.3358, + 0.3425, + 0.4929, + 0.5277 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_city_buildings.png", + "raw_ref_image": "references/raw_ref_city_buildings_attempt_01.png", + "reference_verify": "references/reference_verify_city_buildings.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_city_buildings_attempt_01.png", + "output": "references/ref_city_buildings.png", + "mask": "references/sam_mask_city_buildings.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 27.0, + 178.0, + 996.0, + 865.0 + ], + "mask_score": 3.420089, + "mask_area_ratio": 0.463421, + "elapsed_seconds": 8.2735 + } + }, + { + "name": "pink_scooter", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_66/283991,a60c0001c08f63b.jpg:object:3", + "source_name": "pink scooter", + "source_description": "A prominent pink step-through style motor scooter. Source dataset: CrowdHuman. Scene context: A busy city intersection with many people riding scooters and some cars in the background.", + "sub_caption": "pink scooter: A prominent pink step-through style motor scooter.. Scene role: Parked on the right sidewalk near the street signs.", + "measured_bbox": [ + 0.677, + 0.56, + 0.7935, + 0.7095 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_pink_scooter.png", + "raw_ref_image": "references/raw_ref_pink_scooter_attempt_01.png", + "reference_verify": "references/reference_verify_pink_scooter.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pink_scooter_attempt_01.png", + "output": "references/ref_pink_scooter.png", + "mask": "references/sam_mask_pink_scooter.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 41.0, + 120.0, + 982.0, + 920.0 + ], + "mask_score": 3.414017, + "mask_area_ratio": 0.259921, + "elapsed_seconds": 8.1841 + } + }, + { + "name": "street_signs", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8", + "source_name": "street signs", + "source_description": "Various street signs attached to a pole on the right side of the street, including a speed limit sign. Source dataset: BDD100K. Scene context: A narrow city street with several parked and moving box trucks and vans, bordered by multi-story buildings and construction barriers.", + "sub_caption": "street signs: Various street signs attached to a metal pole on the right side of the street.. Scene role: Mounted on a pole alongside the road on the right.", + "measured_bbox": [ + 0.8162, + 0.2869, + 0.8575, + 0.4063 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_signs.png", + "raw_ref_image": "references/raw_ref_street_signs_attempt_01.png", + "reference_verify": "references/reference_verify_street_signs.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_signs_attempt_01.png", + "output": "references/ref_street_signs.png", + "mask": "references/sam_mask_street_signs.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 226.0, + 71.0, + 707.0, + 967.0 + ], + "mask_score": 3.475593, + "mask_area_ratio": 0.25818, + "elapsed_seconds": 9.8621 + } + }, + { + "name": "storefront_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", + "source_name": "storefront sign", + "source_description": "A dark sign with white text 'DELI' and a smaller red 'ATM' sign beneath it on the left. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking towards the camera, while emergency vehicles with flashing lights are visible in the background traffic.", + "sub_caption": "storefront sign: A dark, illuminated sign structure above a shop entrance.. Scene role: Hanging above a shop entrance on the left side of the street, illuminating the adjacent pedestrian.", + "measured_bbox": [ + 0.1052, + 0.4218, + 0.186, + 0.4781 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_storefront_sign.png", + "raw_ref_image": "references/raw_ref_storefront_sign_attempt_01.png", + "reference_verify": "references/reference_verify_storefront_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_storefront_sign_attempt_01.png", + "output": "references/ref_storefront_sign.png", + "mask": "references/sam_mask_storefront_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 107.0, + 274.0, + 900.0, + 749.0 + ], + "mask_score": 3.354337, + "mask_area_ratio": 0.167885, + "elapsed_seconds": 8.1782 + } + }, + { + "name": "parked_suv_right", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c54441e6-400c221e:object:4", + "source_name": "parked SUV", + "source_description": "Dark-colored SUV parked ahead of the sedan on the right side of the road. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with green traffic lights and parked cars on the right.", + "sub_caption": "parked SUV: Dark-colored SUV parked on the right side of the road.. Scene role: Parked parallel to the curb on the right side of the street.", + "measured_bbox": [ + 0.6057, + 0.5099, + 0.7451, + 0.6703 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_parked_suv_right.png", + "raw_ref_image": "references/raw_ref_parked_suv_right_attempt_01.png", + "reference_verify": "references/reference_verify_parked_suv_right.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_suv_right_attempt_01.png", + "output": "references/ref_parked_suv_right.png", + "mask": "references/sam_mask_parked_suv_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 52.0, + 216.0, + 993.0, + 835.0 + ], + "mask_score": 3.459027, + "mask_area_ratio": 0.361156, + "elapsed_seconds": 10.1865 + } + }, + { + "name": "dark_car_left", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", + "source_name": "car", + "source_description": "A dark car parked along the left curb further ahead. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a multi-lane road on a partly cloudy day, with a few other cars and pedestrians on the sidewalk.", + "sub_caption": "car: A dark car parked along the left curb further ahead.. Scene role: Parked alongside the left curb.", + "measured_bbox": [ + 0.2139, + 0.5323, + 0.3044, + 0.6201 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_car_left.png", + "raw_ref_image": "references/raw_ref_dark_car_left_attempt_01.png", + "reference_verify": "references/reference_verify_dark_car_left.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_car_left_attempt_01.png", + "output": "references/ref_dark_car_left.png", + "mask": "references/sam_mask_dark_car_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 42.0, + 237.0, + 982.0, + 794.0 + ], + "mask_score": 3.479099, + "mask_area_ratio": 0.30617, + "elapsed_seconds": 8.2274 + } + }, + { + "name": "dark_suv_driving", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c889c950-865ca5b6:object:0", + "source_name": "dark SUV", + "source_description": "A dark SUV traveling in the left lane, with visible red taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a highway with several cars visible ahead, illuminated mainly by taillights and headlights.", + "sub_caption": "dark SUV: A dark SUV traveling in the left lane, with visible glowing red taillights.. Scene role: Driving ahead of the camera vehicle in the opposing or adjacent left lane.", + "measured_bbox": [ + 0.3005, + 0.5101, + 0.4179, + 0.6508 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_suv_driving.png", + "raw_ref_image": "references/raw_ref_dark_suv_driving_attempt_01.png", + "reference_verify": "references/reference_verify_dark_suv_driving.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_suv_driving_attempt_01.png", + "output": "references/ref_dark_suv_driving.png", + "mask": "references/sam_mask_dark_suv_driving.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 96.0, + 294.0, + 928.0, + 812.0 + ], + "mask_score": 3.455576, + "mask_area_ratio": 0.251452, + "elapsed_seconds": 9.8494 + } + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2", + "source_name": "street light", + "source_description": "Tall pole with a bright light on top, illuminating the road from the right side. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane highway with streetlights illuminating the dark road ahead.", + "sub_caption": "street light: Tall pole with a bright light on top, illuminating the road.. Scene role: Casting warm light onto the street from the right-hand sidewalk.", + "measured_bbox": [ + 0.8171, + 0.1755, + 0.8719, + 0.2202 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 71.0, + 277.0, + 913.0, + 727.0 + ], + "mask_score": 3.350243, + "mask_area_ratio": 0.068855, + "elapsed_seconds": 8.2963 + } + }, + { + "name": "vehicle_dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9", + "source_name": "dashboard", + "source_description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential city street lined with trees, parked cars, and multi-story brick buildings.", + "sub_caption": "dashboard: The dark, reflective dashboard and lower windshield area of the camera vehicle.. Scene role: Occupying the bottom foreground of the image, establishing the perspective from inside the car.", + "measured_bbox": [ + 0.0, + 0.9261, + 1.0, + 1.0 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_vehicle_dashboard.png", + "raw_ref_image": "references/raw_ref_vehicle_dashboard_attempt_01.png", + "reference_verify": "references/reference_verify_vehicle_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_vehicle_dashboard_attempt_01.png", + "output": "references/ref_vehicle_dashboard.png", + "mask": "references/sam_mask_vehicle_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 24.0, + 317.0, + 1001.0, + 706.0 + ], + "mask_score": 2.942001, + "mask_area_ratio": 0.133658, + "elapsed_seconds": 8.3645 + } + }, + { + "name": "white_car_ahead", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", + "source_name": "white car", + "source_description": "A white car visible further down the road in the right lane. Source dataset: BDD100K. Scene context: Nighttime driving scene in a city with cars stopped in traffic and a construction site on the right.", + "sub_caption": "white car: A white car visible further down the road.. Scene role: Driving away in the right lane, further in the distance.", + "measured_bbox": [ + 0.4811, + 0.5382, + 0.5174, + 0.5915 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_car_ahead.png", + "raw_ref_image": "references/raw_ref_white_car_ahead_attempt_01.png", + "reference_verify": "references/reference_verify_white_car_ahead.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_white_car_ahead_attempt_01.png", + "output": "references/ref_white_car_ahead.png", + "mask": "references/sam_mask_white_car_ahead.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 112.0, + 168.0, + 912.0, + 855.0 + ], + "mask_score": 3.412999, + "mask_area_ratio": 0.338258, + "elapsed_seconds": 8.3339 + } + }, + { + "name": "double_yellow_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c417a291-7802692d:object:8", + "source_name": "yellow lines", + "source_description": "Double yellow painted lines separating opposite directions of traffic. Source dataset: BDD100K. Scene context: A pedestrian crossing a street at a crosswalk with several cars stopped around them, with storefronts visible in the background.", + "sub_caption": "yellow lines: Double yellow painted lines separating opposite directions of traffic.. Scene role: Running down the center of the road, receding into the distance.", + "measured_bbox": [ + 0.3008, + 0.5732, + 0.4776, + 0.8029 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_double_yellow_lines.png", + "raw_ref_image": "references/raw_ref_double_yellow_lines_attempt_01.png", + "reference_verify": "references/reference_verify_double_yellow_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_double_yellow_lines_attempt_01.png", + "output": "references/ref_double_yellow_lines.png", + "mask": "references/sam_mask_double_yellow_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 0.0, + 1023.0, + 1023.0 + ], + "mask_score": 2.141169, + "mask_area_ratio": 0.667065, + "elapsed_seconds": 8.2719 + } + }, + { + "name": "street_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c4891df0-24371ae1:object:3", + "source_name": "trees", + "source_description": "Numerous trees with dense green and yellowish foliage lining both sides of the road. Source dataset: BDD100K. Scene context: A dark SUV drives ahead on a wet road lined with trees and a fence under a cloudy sky.", + "sub_caption": "trees: Numerous trees with dense foliage lining both sides of the road.. Scene role: Planted along the sidewalks, softening the urban environment and framing the street.", + "measured_bbox": [ + 0.001, + 0.002, + 0.375, + 0.63 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_trees.png", + "raw_ref_image": "references/raw_ref_street_trees_attempt_01.png", + "reference_verify": "references/reference_verify_street_trees.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_trees_attempt_01.png", + "output": "references/ref_street_trees.png", + "mask": "references/sam_mask_street_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 65.0, + 64.0, + 958.0, + 969.0 + ], + "mask_score": 3.478968, + "mask_area_ratio": 0.365667, + "elapsed_seconds": 8.231 + } + }, + { + "name": "twilight_sky", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9", + "source_name": "sky", + "source_description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top. Source dataset: BDD100K. Scene context: View from a moving vehicle looking down a multi-lane city street lined with parked cars and large trees under a clear twilight sky.", + "sub_caption": "sky: Clear twilight sky transitioning from bright near the horizon to dark blue at the top.. Scene role: Providing the atmospheric backdrop above the buildings and street.", + "measured_bbox": [ + 0.116, + 0.0, + 0.714, + 0.4742 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_twilight_sky.png", + "raw_ref_image": "references/raw_ref_twilight_sky_attempt_01.png", + "reference_verify": "references/reference_verify_twilight_sky.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_twilight_sky_attempt_01.png", + "output": "references/ref_twilight_sky.png", + "mask": "references/sam_mask_twilight_sky.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 208.0, + 1023.0, + 814.0 + ], + "mask_score": 2.437955, + "mask_area_ratio": 0.529621, + "elapsed_seconds": 9.8292 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000003.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000003.json new file mode 100644 index 0000000000000000000000000000000000000000..3324bd0ee4b849f963ea0d2908ef069c23619477 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000003.json @@ -0,0 +1,164 @@ +{ + "sample_id": "sample_000003", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 3, + "n_detected": 3, + "n_subjects": 3, + "subjects": [ + { + "name": "shopper", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing on the top level, wearing a dark top and dark pants. Source dataset: CrowdHuman. Scene context: A multi-level outdoor shopping mall with various people walking and shopping.", + "sub_caption": "shopper: A person standing and waiting, wearing a dark top and dark pants. Scene role: waiting at the crosswalk curb", + "measured_bbox": [ + 0.7364, + 0.2825, + 0.8267, + 0.7222 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_shopper.png", + "raw_ref_image": "references/raw_ref_shopper_attempt_01.png", + "reference_verify": "references/reference_verify_shopper.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_shopper_attempt_01.png", + "output": "references/ref_shopper.png", + "mask": "references/sam_mask_shopper.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 348.0, + 80.0, + 678.0, + 995.0 + ], + "mask_score": 3.467753, + "mask_area_ratio": 0.132874, + "elapsed_seconds": 49.4008 + } + }, + { + "name": "black_sedan", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7", + "source_name": "black sedan", + "source_description": "A black sedan driving in the right lane ahead of the red hatchback. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a multi-lane city street flanked by tall buildings, with various cars moving in the same direction and parked along the side.", + "sub_caption": "black sedan: A dark, modern black sedan. Scene role: driving in the nearest lane on the street", + "measured_bbox": [ + 0.0883, + 0.2514, + 0.5002, + 0.449 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_black_sedan.png", + "raw_ref_image": "references/raw_ref_black_sedan_attempt_01.png", + "reference_verify": "references/reference_verify_black_sedan.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_black_sedan_attempt_01.png", + "output": "references/ref_black_sedan.png", + "mask": "references/sam_mask_black_sedan.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 321.0, + 1023.0, + 700.0 + ], + "mask_score": 2.52477, + "mask_area_ratio": 0.559944, + "elapsed_seconds": 8.5091 + } + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2", + "source_name": "silver car", + "source_description": "A silver car driving in the adjacent right lane, slightly ahead of the white car. Source dataset: BDD100K. Scene context: View from inside a vehicle driving in heavy traffic on a multi-lane road under a clear sky.", + "sub_caption": "silver car: A sleek silver car. Scene role: driving in the adjacent lane slightly ahead of the black sedan", + "measured_bbox": [ + 0.3669, + 0.2463, + 0.7048, + 0.409 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 369.0, + 1006.0, + 693.0 + ], + "mask_score": 3.457475, + "mask_area_ratio": 0.178123, + "elapsed_seconds": 9.7472 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000004.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000004.json new file mode 100644 index 0000000000000000000000000000000000000000..e1a3d24315d3196b0908cdbb4bc5a2418ad63ab1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000004.json @@ -0,0 +1,256 @@ +{ + "sample_id": "sample_000004", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 5, + "n_detected": 5, + "n_subjects": 5, + "subjects": [ + { + "name": "pedestrian_walker", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", + "source_name": "walker", + "source_description": "A man walking, wearing a blue hoodie with 'EMO' written on it, grey pants, and dark shoes. Source dataset: CrowdHuman. Scene context: A group of people, some wearing matching white t-shirts, are walking and jogging along a paved path next to a road, with a grey SUV parked on a grassy hill in the background.", + "sub_caption": "walker: A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.. Scene role: Walking near the crosswalk on the side of the street.", + "measured_bbox": [ + 0.5948, + 0.3939, + 0.6378, + 0.5698 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walker.png", + "raw_ref_image": "references/raw_ref_pedestrian_walker_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walker.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_pedestrian_walker_attempt_01.png", + "output": "references/ref_pedestrian_walker.png", + "mask": "references/sam_mask_pedestrian_walker.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 334.0, + 56.0, + 706.0, + 996.0 + ], + "mask_score": 3.43302, + "mask_area_ratio": 0.160827, + "elapsed_seconds": 9.8914 + } + }, + { + "name": "red_traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b6df605f-51c158b8:object:6", + "source_name": "traffic light", + "source_description": "A set of traffic lights suspended over the intersection, showing a red light. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a city street on a sunny day with a U-Haul truck in the opposite lane, parked cars along the right curb, and pedestrians crossing a crosswalk.", + "sub_caption": "traffic light: A traffic signal suspended over the intersection, illuminated with a bright red light.. Scene role: Hanging high above the center of the intersection in the driver's line of sight.", + "measured_bbox": [ + 0.4668, + 0.0722, + 0.5093, + 0.1896 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_red_traffic_light.png", + "raw_ref_image": "references/raw_ref_red_traffic_light_attempt_01.png", + "reference_verify": "references/reference_verify_red_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_red_traffic_light_attempt_01.png", + "output": "references/ref_red_traffic_light.png", + "mask": "references/sam_mask_red_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 286.0, + 103.0, + 1023.0, + 893.0 + ], + "mask_score": 3.25218, + "mask_area_ratio": 0.200515, + "elapsed_seconds": 8.1927 + } + }, + { + "name": "plain_delivery_truck", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", + "source_name": "delivery truck", + "source_description": "A large white box truck parked behind the fence on the right, with visible green and purple logos. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a wide, paved road flanked by an industrial area with fences and delivery trucks under a cloudy sky.", + "sub_caption": "delivery truck: A large, plain white box truck without any visible markings or graphics.. Scene role: Parked alongside the right edge of the street curb.", + "measured_bbox": [ + 0.6504, + 0.2022, + 0.966, + 0.6212 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_plain_delivery_truck.png", + "raw_ref_image": "references/raw_ref_plain_delivery_truck_attempt_01.png", + "reference_verify": "references/reference_verify_plain_delivery_truck.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_plain_delivery_truck_attempt_01.png", + "output": "references/ref_plain_delivery_truck.png", + "mask": "references/sam_mask_plain_delivery_truck.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 9.0, + 166.0, + 1017.0, + 852.0 + ], + "mask_score": 3.45107, + "mask_area_ratio": 0.437578, + "elapsed_seconds": 10.0386 + } + }, + { + "name": "dark_parked_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5", + "source_name": "dark parked car", + "source_description": "Another dark-colored car parked on the right side of the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving down a multi-lane city street with parked cars on the right and oncoming traffic on the left.", + "sub_caption": "dark parked car: A dark-colored passenger vehicle.. Scene role: Parked parallel to the curb directly behind the delivery truck.", + "measured_bbox": [ + 0.8339, + 0.4566, + 0.9965, + 0.7781 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_parked_car.png", + "raw_ref_image": "references/raw_ref_dark_parked_car_attempt_01.png", + "reference_verify": "references/reference_verify_dark_parked_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_dark_parked_car_attempt_01.png", + "output": "references/ref_dark_parked_car.png", + "mask": "references/sam_mask_dark_parked_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 301.0, + 1023.0, + 694.0 + ], + "mask_score": 3.113868, + "mask_area_ratio": 0.207836, + "elapsed_seconds": 8.5697 + } + }, + { + "name": "street_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", + "source_name": "street lines", + "source_description": "Double yellow lines separating traffic directions and white painted lines indicating lanes and crosswalks. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching an intersection underneath an elevated railway structure.", + "sub_caption": "street lines: Double yellow center lines separating traffic directions, and solid white painted lines forming a distinct crosswalk.. Scene role: Painted on the asphalt, guiding traffic and defining the pedestrian crossing area in the foreground.", + "measured_bbox": [ + 0.003, + 0.432, + 0.971, + 0.794 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_lines.png", + "raw_ref_image": "references/raw_ref_street_lines_attempt_01.png", + "reference_verify": "references/reference_verify_street_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_street_lines_attempt_01.png", + "output": "references/ref_street_lines.png", + "mask": "references/sam_mask_street_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 384.0, + 98.0, + 639.0, + 925.0 + ], + "mask_score": 3.44596, + "mask_area_ratio": 0.067441, + "elapsed_seconds": 8.1646 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000005.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000005.json new file mode 100644 index 0000000000000000000000000000000000000000..030ad1a12b026f0f651944838179229027727a84 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000005.json @@ -0,0 +1,302 @@ +{ + "sample_id": "sample_000005", + "target_total": 6, + "target_people": 3, + "target_objects": 3, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 6, + "n_detected": 6, + "n_subjects": 6, + "subjects": [ + { + "name": "person_yellow_top", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_24/282555,49a4e000d44469c6.jpg:person:49", + "source_name": "person", + "source_description": "Standing, wearing a bright yellow top Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered in front of the Louvre museum and its iconic glass pyramid on a sunny day.", + "sub_caption": "person: Standing, wearing a bright yellow top. Scene role: Crossing the street on the crosswalk in front of the stopped silver car", + "measured_bbox": [ + 0.5309, + 0.4516, + 0.5607, + 0.6301 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_person_yellow_top.png", + "raw_ref_image": "references/raw_ref_person_yellow_top_attempt_01.png", + "reference_verify": "references/reference_verify_person_yellow_top.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_person_yellow_top_attempt_01.png", + "output": "references/ref_person_yellow_top.png", + "mask": "references/sam_mask_person_yellow_top.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 329.0, + 42.0, + 701.0, + 1012.0 + ], + "mask_score": 3.348943, + "mask_area_ratio": 0.150169, + "elapsed_seconds": 8.25 + } + }, + { + "name": "man_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", + "source_name": "crowd member", + "source_description": "A person far right in the background wearing a suit. Source dataset: CrowdHuman. Scene context: A crowd of people gathers in front of an old stone building with a prominent arched doorway and two large animal statues.", + "sub_caption": "crowd member: A person wearing a professional suit.. Scene role: Walking alongside the other pedestrians across the crosswalk", + "measured_bbox": [ + 0.5767, + 0.4388, + 0.6397, + 0.6278 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_man_in_suit.png", + "raw_ref_image": "references/raw_ref_man_in_suit_attempt_01.png", + "reference_verify": "references/reference_verify_man_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_man_in_suit_attempt_01.png", + "output": "references/ref_man_in_suit.png", + "mask": "references/sam_mask_man_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 351.0, + 27.0, + 671.0, + 1004.0 + ], + "mask_score": 3.48496, + "mask_area_ratio": 0.144686, + "elapsed_seconds": 9.7885 + } + }, + { + "name": "young_girl", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_51/282555,57b9f000182048e6.jpg:person:13", + "source_name": "pedestrian", + "source_description": "Young girl with brown hair, wearing a blue patterned top. Source dataset: CrowdHuman. Scene context: A sunny outdoor scene featuring the red entrance arch to Navy Pier Beer Garden and a tall brick tower, with a diverse crowd of people walking along the waterfront promenade.", + "sub_caption": "pedestrian: Young girl with brown hair, wearing a blue patterned top.. Scene role: Walking across the intersection near the person in the yellow top", + "measured_bbox": [ + 0.6354, + 0.4889, + 0.6677, + 0.6337 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_young_girl.png", + "raw_ref_image": "references/raw_ref_young_girl_attempt_01.png", + "reference_verify": "references/reference_verify_young_girl.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_young_girl_attempt_01.png", + "output": "references/ref_young_girl.png", + "mask": "references/sam_mask_young_girl.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 369.0, + 52.0, + 661.0, + 1003.0 + ], + "mask_score": 3.482282, + "mask_area_ratio": 0.133298, + "elapsed_seconds": 8.3216 + } + }, + { + "name": "traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", + "source_name": "traffic light", + "source_description": "A black multi-lens traffic light fixture mounted on a pole above the street. Source dataset: CrowdHuman. Scene context: A male tour guide is speaking to a group of people standing on a city sidewalk next to a road crossing.", + "sub_caption": "traffic light: A black multi-lens traffic light fixture mounted on a pole above the street.. Scene role: Suspended over the intersection, showing a red light to halt the vehicles", + "measured_bbox": [ + 0.5513, + 0.0408, + 0.6462, + 0.1518 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_traffic_light.png", + "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", + "reference_verify": "references/reference_verify_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_traffic_light_attempt_01.png", + "output": "references/ref_traffic_light.png", + "mask": "references/sam_mask_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 113.0, + 201.0, + 923.0, + 826.0 + ], + "mask_score": 3.467034, + "mask_area_ratio": 0.289252, + "elapsed_seconds": 9.874 + } + }, + { + "name": "street_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_37/282555,a4aef000d9c10933.jpg:object:3", + "source_name": "trees", + "source_description": "Various green trees and shrubs lining the pathway and visible in the background gardens. Source dataset: CrowdHuman. Scene context: A large crowd of tourists walks along the pathway towards the Taj Mahal on a clear day.", + "sub_caption": "trees: Various green trees and shrubs lining the pathway and visible in the background gardens.. Scene role: Planted along the sidewalks on both sides of the street, providing urban greenery", + "measured_bbox": [ + 0.542, + 0.2363, + 0.636, + 0.493 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_trees.png", + "raw_ref_image": "references/raw_ref_street_trees_attempt_03.png", + "reference_verify": "references/reference_verify_street_trees.json", + "reference_verify_passed": true, + "reference_attempts": 3, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_street_trees_attempt_03.png", + "output": "references/ref_street_trees.png", + "mask": "references/sam_mask_street_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 22.0, + 60.0, + 1003.0, + 968.0 + ], + "mask_score": 3.301958, + "mask_area_ratio": 0.393952, + "elapsed_seconds": 8.2223 + } + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be3d3a81-326a032d:object:0", + "source_name": "silver car", + "source_description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on. Source dataset: BDD100K. Scene context: A rainy street scene showing cars waiting at an intersection surrounded by tall buildings and urban infrastructure.", + "sub_caption": "silver car: A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on.. Scene role: Stopped in the traffic lane in the foreground, waiting for the pedestrians to cross", + "measured_bbox": [ + 0.3062, + 0.4281, + 0.5436, + 0.7674 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 80.0, + 167.0, + 957.0, + 937.0 + ], + "mask_score": 3.434142, + "mask_area_ratio": 0.414005, + "elapsed_seconds": 8.3073 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000006.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000006.json new file mode 100644 index 0000000000000000000000000000000000000000..72dab8fc148337e1f8ef4b7864be488bc394f817 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000006.json @@ -0,0 +1,394 @@ +{ + "sample_id": "sample_000006", + "target_total": 8, + "target_people": 1, + "target_objects": 7, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 8, + "n_detected": 8, + "n_subjects": 8, + "subjects": [ + { + "name": "pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", + "source_name": "pedestrian", + "source_description": "Another person in the distant background near the green structure. Source dataset: CrowdHuman. Scene context: A large crowd of people walking across a street with trees, streetlamps, and classic architecture in the background.", + "sub_caption": "pedestrian: A person walking across the street in the distance.. Scene role: Crossing the street near the left background.", + "measured_bbox": [ + 0.2151, + 0.4819, + 0.2507, + 0.5947 + ], + "detection_confidence": "high", + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian.png", + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_pedestrian_attempt_01.png", + "output": "references/ref_pedestrian.png", + "mask": "references/sam_mask_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 353.0, + 44.0, + 672.0, + 1013.0 + ], + "mask_score": 3.456561, + "mask_area_ratio": 0.147466, + "elapsed_seconds": 8.2841 + } + }, + { + "name": "emergency_vehicle", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b99f250d-886111c5:object:5", + "source_name": "vehicle", + "source_description": "A dark-colored vehicle partially visible in the left background with blue emergency lights flashing. Source dataset: BDD100K. Scene context: A nighttime city street intersection showing a crosswalk, construction barriers, and illuminated traffic signals.", + "sub_caption": "vehicle: A dark-colored vehicle with blue emergency lights flashing.. Scene role: Stopped or parked in the distant left background.", + "measured_bbox": [ + 0.325, + 0.4787, + 0.3786, + 0.5486 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_emergency_vehicle.png", + "raw_ref_image": "references/raw_ref_emergency_vehicle_attempt_01.png", + "reference_verify": "references/reference_verify_emergency_vehicle.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_emergency_vehicle_attempt_01.png", + "output": "references/ref_emergency_vehicle.png", + "mask": "references/sam_mask_emergency_vehicle.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 261.0, + 1023.0, + 782.0 + ], + "mask_score": 3.339466, + "mask_area_ratio": 0.300308, + "elapsed_seconds": 8.2719 + } + }, + { + "name": "curbside_trash_can", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_10/273275,5da3d000f5358c3c.jpg:object:9", + "source_name": "trash can", + "source_description": "dark blue cylindrical bin partially visible in the foreground Source dataset: CrowdHuman. Scene context: A group of children and an adult pose for a photo in front of a roller coaster at an amusement park.", + "sub_caption": "trash can: A dark blue cylindrical bin.. Scene role: Placed on the sidewalk curb in the lower right foreground.", + "measured_bbox": [ + 0.8371, + 0.5448, + 0.9204, + 0.7599 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_curbside_trash_can.png", + "raw_ref_image": "references/raw_ref_curbside_trash_can_attempt_01.png", + "reference_verify": "references/reference_verify_curbside_trash_can.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_curbside_trash_can_attempt_01.png", + "output": "references/ref_curbside_trash_can.png", + "mask": "references/sam_mask_curbside_trash_can.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 175.0, + 42.0, + 837.0, + 982.0 + ], + "mask_score": 3.480803, + "mask_area_ratio": 0.406976, + "elapsed_seconds": 8.7724 + } + }, + { + "name": "museum_banner", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_24/282555,14d900042c7b9a4.jpg:object:3", + "source_name": "exhibition banner", + "source_description": "A large, rectangular banner hanging on the building's facade, featuring a maroon background and text. Source dataset: CrowdHuman. Scene context: A large, classical building, likely a museum, with many people sitting and standing on its wide front steps, while a yellow taxi speeds past in the foreground.", + "sub_caption": "exhibition banner: A large, rectangular maroon banner hanging on a building's facade.. Scene role: Hanging from the classical architecture on the right side of the street.", + "measured_bbox": [ + 0.8013, + 0.0164, + 0.8543, + 0.2771 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_museum_banner.png", + "raw_ref_image": "references/raw_ref_museum_banner_attempt_01.png", + "reference_verify": "references/reference_verify_museum_banner.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_museum_banner_attempt_01.png", + "output": "references/ref_museum_banner.png", + "mask": "references/sam_mask_museum_banner.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 317.0, + 20.0, + 703.0, + 981.0 + ], + "mask_score": 3.453619, + "mask_area_ratio": 0.268547, + "elapsed_seconds": 8.1747 + } + }, + { + "name": "white_panel_van", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5047c50-e1facff6:object:2", + "source_name": "white van", + "source_description": "A large white panel van with red taillights illuminated, driving in the right lane ahead. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a multi-lane city street on a sunny day with moderate traffic.", + "sub_caption": "white van: A large white panel van with illuminated red taillights.. Scene role: Driving ahead in the right lane of the road.", + "measured_bbox": [ + 0.5122, + 0.3306, + 0.6867, + 0.6936 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_panel_van.png", + "raw_ref_image": "references/raw_ref_white_panel_van_attempt_01.png", + "reference_verify": "references/reference_verify_white_panel_van.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_white_panel_van_attempt_01.png", + "output": "references/ref_white_panel_van.png", + "mask": "references/sam_mask_white_panel_van.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 102.0, + 91.0, + 877.0, + 937.0 + ], + "mask_score": 3.457159, + "mask_area_ratio": 0.429852, + "elapsed_seconds": 10.1474 + } + }, + { + "name": "double_solid_white_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bc886d37-5b22c313:object:7", + "source_name": "double solid white line", + "source_description": "Two continuous white painted lines on the dark asphalt road surface, separating the two lanes of traffic. Source dataset: BDD100K. Scene context: A view from inside a car driving through a brightly lit tunnel with tiled walls, following a silver SUV and a dark compact car.", + "sub_caption": "double solid white line: Two continuous white painted lines on dark asphalt.. Scene role: Dividing the traffic lanes down the center of the street.", + "measured_bbox": [ + 0.2383, + 0.6095, + 0.4221, + 0.8925 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_double_solid_white_line.png", + "raw_ref_image": "references/raw_ref_double_solid_white_line_attempt_01.png", + "reference_verify": "references/reference_verify_double_solid_white_line.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_double_solid_white_line_attempt_01.png", + "output": "references/ref_double_solid_white_line.png", + "mask": "references/sam_mask_double_solid_white_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 229.0, + 72.0, + 795.0, + 951.0 + ], + "mask_score": 3.470715, + "mask_area_ratio": 0.39155, + "elapsed_seconds": 9.6388 + } + }, + { + "name": "blooming_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_26/283991,91fa000cda68156.jpg:object:5", + "source_name": "blooming trees", + "source_description": "Trees with pink blossoms in the background park area. Source dataset: CrowdHuman. Scene context: People are walking along a sidewalk next to a street lined with trees, some in bloom, with a police officer standing near a parked car.", + "sub_caption": "blooming trees: Trees featuring vibrant pink blossoms.. Scene role: Lining the sidewalk and park area on the right side of the street.", + "measured_bbox": [ + 0.6279, + 0.2153, + 0.8163, + 0.5163 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_blooming_trees.png", + "raw_ref_image": "references/raw_ref_blooming_trees_attempt_01.png", + "reference_verify": "references/reference_verify_blooming_trees.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_blooming_trees_attempt_01.png", + "output": "references/ref_blooming_trees.png", + "mask": "references/sam_mask_blooming_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 93.0, + 1023.0, + 967.0 + ], + "mask_score": 3.479366, + "mask_area_ratio": 0.423422, + "elapsed_seconds": 8.1597 + } + }, + { + "name": "green_street_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", + "source_name": "street sign", + "source_description": "A green street sign with white text visible on the left side. Source dataset: CrowdHuman. Scene context: Several pedestrians are crossing a street at a crosswalk on a sunny day.", + "sub_caption": "street sign: A green rectangular street sign.. Scene role: Mounted on a pole on the left side of the street near the crosswalk.", + "measured_bbox": [ + 0.1915, + 0.2239, + 0.2775, + 0.2533 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_green_street_sign.png", + "raw_ref_image": "references/raw_ref_green_street_sign_attempt_01.png", + "reference_verify": "references/reference_verify_green_street_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_green_street_sign_attempt_01.png", + "output": "references/ref_green_street_sign.png", + "mask": "references/sam_mask_green_street_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 26.0, + 378.0, + 998.0, + 645.0 + ], + "mask_score": 3.473778, + "mask_area_ratio": 0.191363, + "elapsed_seconds": 9.6707 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000008.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000008.json new file mode 100644 index 0000000000000000000000000000000000000000..3a428eff762cc54b0584b1907cbfeeecb00f3a7d --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000008.json @@ -0,0 +1,486 @@ +{ + "sample_id": "sample_000008", + "target_total": 10, + "target_people": 8, + "target_objects": 2, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 10, + "n_detected": 10, + "n_subjects": 10, + "subjects": [ + { + "name": "passenger", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", + "source_name": "passenger", + "source_description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people waiting at a train platform next to a stationary train.", + "sub_caption": "passenger: A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera.. Scene role: Walking away on the sidewalk ahead.", + "measured_bbox": [ + 0.8938, + 0.1807, + 0.9951, + 0.7157 + ], + "detection_confidence": "high", + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_passenger.png", + "raw_ref_image": "references/raw_ref_passenger_attempt_01.png", + "reference_verify": "references/reference_verify_passenger.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_passenger_attempt_01.png", + "output": "references/ref_passenger.png", + "mask": "references/sam_mask_passenger.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 31.0, + 682.0, + 985.0 + ], + "mask_score": 3.454991, + "mask_area_ratio": 0.146239, + "elapsed_seconds": 8.0907 + } + }, + { + "name": "shopper_waiting", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", + "source_name": "shopper", + "source_description": "A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag. Source dataset: CrowdHuman. Scene context: A bustling cobblestone street in a town lined with brick buildings, decorated with colorful bunting flags, and populated with pedestrians walking, shopping, and dining outdoors.", + "sub_caption": "shopper: A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag.. Scene role: Standing on the corner curb, waiting to cross the street.", + "measured_bbox": [ + 0.632, + 0.1691, + 0.7153, + 0.6522 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_shopper_waiting.png", + "raw_ref_image": "references/raw_ref_shopper_waiting_attempt_01.png", + "reference_verify": "references/reference_verify_shopper_waiting.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_shopper_waiting_attempt_01.png", + "output": "references/ref_shopper_waiting.png", + "mask": "references/sam_mask_shopper_waiting.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 365.0, + 69.0, + 720.0, + 1006.0 + ], + "mask_score": 3.169183, + "mask_area_ratio": 0.111197, + "elapsed_seconds": 8.0622 + } + }, + { + "name": "shopper_standing", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing, wearing a blue jacket and blue jeans. Source dataset: CrowdHuman. Scene context: A bustling street market with many pedestrians browsing stalls and goods displayed on the ground.", + "sub_caption": "shopper: A person standing, wearing a blue jacket and blue jeans.. Scene role: Standing near the crosswalk edge amidst the crowd.", + "measured_bbox": [ + 0.5209, + 0.1793, + 0.5735, + 0.5325 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_shopper_standing.png", + "raw_ref_image": "references/raw_ref_shopper_standing_attempt_01.png", + "reference_verify": "references/reference_verify_shopper_standing.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_shopper_standing_attempt_01.png", + "output": "references/ref_shopper_standing.png", + "mask": "references/sam_mask_shopper_standing.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 326.0, + 43.0, + 698.0, + 998.0 + ], + "mask_score": 3.440171, + "mask_area_ratio": 0.161293, + "elapsed_seconds": 8.0811 + } + }, + { + "name": "protester", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", + "source_name": "protester holding sign in back", + "source_description": "A person walking in the background on the right, holding up a large white sign. Source dataset: CrowdHuman. Scene context: A large crowd of people, possibly students, are walking down a city street, some holding signs, suggesting a protest or march.", + "sub_caption": "protester holding sign in back: A person walking in the background on the right, holding up a large white sign.. Scene role: Walking further down the right sidewalk carrying a sign.", + "measured_bbox": [ + 0.8193, + 0.1216, + 0.8875, + 0.4511 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_protester.png", + "raw_ref_image": "references/raw_ref_protester_attempt_01.png", + "reference_verify": "references/reference_verify_protester.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_protester_attempt_01.png", + "output": "references/ref_protester.png", + "mask": "references/sam_mask_protester.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 337.0, + 19.0, + 694.0, + 1013.0 + ], + "mask_score": 3.465365, + "mask_area_ratio": 0.161731, + "elapsed_seconds": 8.3408 + } + }, + { + "name": "pedestrian_crossing", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", + "source_name": "pedestrian", + "source_description": "A person in a white top and dark pants, walking towards the right. Source dataset: CrowdHuman. Scene context: A bustling city street scene with a mix of old brick buildings and modern glass structures, connected by an elevated glass walkway, with a large crowd of pedestrians moving along the wide sidewalk.", + "sub_caption": "pedestrian: A person in a white top and dark pants, walking towards the right.. Scene role: Actively walking across the crosswalk in front of the vehicle.", + "measured_bbox": [ + 0.2322, + 0.1993, + 0.3165, + 0.4965 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_crossing.png", + "raw_ref_image": "references/raw_ref_pedestrian_crossing_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_crossing.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_crossing_attempt_01.png", + "output": "references/ref_pedestrian_crossing.png", + "mask": "references/sam_mask_pedestrian_crossing.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 355.0, + 20.0, + 674.0, + 1012.0 + ], + "mask_score": 3.482863, + "mask_area_ratio": 0.15384, + "elapsed_seconds": 8.0791 + } + }, + { + "name": "pedestrian_walking_away", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", + "source_name": "pedestrian", + "source_description": "A person wearing a white top and dark pants, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people walking on a city street in front of a grand, multi-story commercial building with signs for Virgin Megastore and Monoprix.", + "sub_caption": "pedestrian: A person wearing a white top and dark pants, walking away from the camera.. Scene role: Walking away on the left side of the street.", + "measured_bbox": [ + 0.013, + 0.2139, + 0.0908, + 0.494 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walking_away.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walking_away.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_walking_away_attempt_01.png", + "output": "references/ref_pedestrian_walking_away.png", + "mask": "references/sam_mask_pedestrian_walking_away.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 350.0, + 50.0, + 671.0, + 987.0 + ], + "mask_score": 3.476273, + "mask_area_ratio": 0.142721, + "elapsed_seconds": 8.2428 + } + }, + { + "name": "young_man", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", + "source_name": "young man", + "source_description": "Standing back row, wearing a dark blue hoodie. Source dataset: CrowdHuman. Scene context: A group of young people, mostly wearing college apparel, are posing for a photograph on bleachers in front of a red building.", + "sub_caption": "young man: Standing back row, wearing a dark blue hoodie.. Scene role: Waiting in the crowd at the corner intersection.", + "measured_bbox": [ + 0.5568, + 0.1246, + 0.6032, + 0.5033 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_young_man.png", + "raw_ref_image": "references/raw_ref_young_man_attempt_01.png", + "reference_verify": "references/reference_verify_young_man.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_young_man_attempt_01.png", + "output": "references/ref_young_man.png", + "mask": "references/sam_mask_young_man.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 352.0, + 65.0, + 671.0, + 928.0 + ], + "mask_score": 3.483394, + "mask_area_ratio": 0.132506, + "elapsed_seconds": 8.2271 + } + }, + { + "name": "businessman", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", + "source_name": "adult in dark suit", + "source_description": "Standing in back rows, wearing dark suit and tie. Source dataset: CrowdHuman. Scene context: A large group of people gathered for a celebration, many wearing bright traditional Chinese costumes, holding props like fans and a lion dance head, posing for a group photo behind a banner.", + "sub_caption": "adult in dark suit: Standing in back rows, wearing dark suit and tie.. Scene role: Standing on the sidewalk behind other pedestrians.", + "measured_bbox": [ + 0.5976, + 0.1322, + 0.6413, + 0.4385 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_businessman.png", + "raw_ref_image": "references/raw_ref_businessman_attempt_01.png", + "reference_verify": "references/reference_verify_businessman.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_businessman_attempt_01.png", + "output": "references/ref_businessman.png", + "mask": "references/sam_mask_businessman.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 336.0, + 19.0, + 690.0, + 1013.0 + ], + "mask_score": 2.970801, + "mask_area_ratio": 0.135565, + "elapsed_seconds": 8.2448 + } + }, + { + "name": "street_lamp", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", + "source_name": "street lamp", + "source_description": "A tall street lamp pole partially visible on the right side of the street. Source dataset: CrowdHuman. Scene context: A crowded city street at night, illuminated by numerous bright, colorful neon signs and shop displays.", + "sub_caption": "street lamp: A tall street lamp pole partially visible on the right side of the street.. Scene role: Towering above the right sidewalk corner, serving as city infrastructure.", + "measured_bbox": [ + 0.014, + 0.03, + 0.109, + 0.254 + ], + "detection_confidence": 0.8, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_lamp.png", + "raw_ref_image": "references/raw_ref_street_lamp_attempt_01.png", + "reference_verify": "references/reference_verify_street_lamp.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_street_lamp_attempt_01.png", + "output": "references/ref_street_lamp.png", + "mask": "references/sam_mask_street_lamp.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 257.0, + 5.0, + 840.0, + 1019.0 + ], + "mask_score": 3.134794, + "mask_area_ratio": 0.049316, + "elapsed_seconds": 8.2643 + } + }, + { + "name": "dashboard_reflection", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c24e7c72-f53291ea:object:8", + "source_name": "vehicle dashboard reflection", + "source_description": "A reflection on the windshield showing the interior dashboard and a document or object with large blue text. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a city street on an overcast day, passing a large brick building on the right.", + "sub_caption": "vehicle dashboard reflection: A reflection on the windshield showing the interior dashboard and a document or object with large blue text.. Scene role: Visible along the bottom edge of the frame, establishing the camera's perspective from inside a car.", + "measured_bbox": [ + 0.143, + 0.6854, + 0.461, + 0.8934 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dashboard_reflection.png", + "raw_ref_image": "references/raw_ref_dashboard_reflection_attempt_01.png", + "reference_verify": "references/reference_verify_dashboard_reflection.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_dashboard_reflection_attempt_01.png", + "output": "references/ref_dashboard_reflection.png", + "mask": "references/sam_mask_dashboard_reflection.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 336.0, + 1023.0, + 1023.0 + ], + "mask_score": 1.211741, + "mask_area_ratio": 0.687541, + "elapsed_seconds": 9.9949 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000009.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000009.json new file mode 100644 index 0000000000000000000000000000000000000000..594f74bd375aa36d1d5928b67b75d41f6748d18f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000009.json @@ -0,0 +1,256 @@ +{ + "sample_id": "sample_000009", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 5, + "n_detected": 5, + "n_subjects": 5, + "subjects": [ + { + "name": "pedestrian_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", + "source_name": "pedestrian in suit", + "source_description": "A person walking, wearing a dark suit. Source dataset: CrowdHuman. Scene context: A crowd of tourists gathers outside a grand palace, with some sitting on a stone balustrade and others walking along a wide paved path.", + "sub_caption": "pedestrian in suit: A person walking, wearing a dark suit.. Scene role: walking along the right side of the street on the sidewalk", + "measured_bbox": [ + 0.7498, + 0.407, + 0.8062, + 0.6382 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_in_suit.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_suit_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_pedestrian_in_suit_attempt_01.png", + "output": "references/ref_pedestrian_in_suit.png", + "mask": "references/sam_mask_pedestrian_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 347.0, + 40.0, + 695.0, + 1018.0 + ], + "mask_score": 3.473173, + "mask_area_ratio": 0.152202, + "elapsed_seconds": 9.5571 + } + }, + { + "name": "yellow_building", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_32/273275,7595c000fa55d9d7.jpg:object:5", + "source_name": "yellow building", + "source_description": "A multi-story building with a yellow ochre facade and numerous shuttered windows visible in the background on the far left. Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered around a monumental, ornate stone fountain with large statues and cascading water, situated in a city square surrounded by buildings.", + "sub_caption": "yellow building: A multi-story building with a yellow ochre facade and numerous shuttered windows.. Scene role: providing a backdrop on the right side of the street scene", + "measured_bbox": [ + 0.6651, + 0.0, + 0.9968, + 0.6296 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_yellow_building.png", + "raw_ref_image": "references/raw_ref_yellow_building_attempt_01.png", + "reference_verify": "references/reference_verify_yellow_building.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_building_attempt_01.png", + "output": "references/ref_yellow_building.png", + "mask": "references/sam_mask_yellow_building.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 4.0, + 0.0, + 1023.0, + 995.0 + ], + "mask_score": 2.131685, + "mask_area_ratio": 0.742735, + "elapsed_seconds": 8.5184 + } + }, + { + "name": "yellow_lane_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c9195e43-b2fdd978:object:2", + "source_name": "yellow lane line", + "source_description": "A solid yellow line painted on the road surface indicating the edge of the lane. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a highway, with a white car ahead in the next lane and lush green bushes along the right side barrier.", + "sub_caption": "yellow lane line: A solid yellow line painted on the road surface indicating the edge of the lane.. Scene role: extending along the road surface towards the distance", + "measured_bbox": [ + 0.4487, + 0.5308, + 0.5367, + 1.0 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_yellow_lane_line.png", + "raw_ref_image": "references/raw_ref_yellow_lane_line_attempt_01.png", + "reference_verify": "references/reference_verify_yellow_lane_line.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_lane_line_attempt_01.png", + "output": "references/ref_yellow_lane_line.png", + "mask": "references/sam_mask_yellow_lane_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 12.0, + 55.0, + 1018.0, + 969.0 + ], + "mask_score": 0.925602, + "mask_area_ratio": 0.952688, + "elapsed_seconds": 8.3819 + } + }, + { + "name": "overpass", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd32109-8b0b1eef:object:6", + "source_name": "overpass", + "source_description": "A concrete bridge structure spanning across the highway ahead, casting a shadow over the road. Source dataset: BDD100K. Scene context: View from a moving vehicle on a multi-lane highway, approaching an underpass with several cars ahead.", + "sub_caption": "overpass: A concrete bridge structure spanning across the street ahead.. Scene role: arching over the road in the midground", + "measured_bbox": [ + 0.0031, + 0.1925, + 0.6919, + 0.5364 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_overpass.png", + "raw_ref_image": "references/raw_ref_overpass_attempt_01.png", + "reference_verify": "references/reference_verify_overpass.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_overpass_attempt_01.png", + "output": "references/ref_overpass.png", + "mask": "references/sam_mask_overpass.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 28.0, + 284.0, + 965.0, + 771.0 + ], + "mask_score": 3.406555, + "mask_area_ratio": 0.166775, + "elapsed_seconds": 8.3597 + } + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", + "source_name": "street light", + "source_description": "Bright, glowing street lights illuminating the road. Source dataset: BDD100K. Scene context: Nighttime driving scene on a city street with multiple vehicles, overhead structures, and traffic signals.", + "sub_caption": "street light: Bright, glowing street lights illuminating the road.. Scene role: overhead fixtures providing illumination over the roadway and sidewalk", + "measured_bbox": [ + 0.5545, + 0.0, + 0.5804, + 0.0625 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 205.0, + 0.0, + 824.0, + 857.0 + ], + "mask_score": 3.391373, + "mask_area_ratio": 0.189186, + "elapsed_seconds": 8.2179 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000010.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000010.json new file mode 100644 index 0000000000000000000000000000000000000000..60e2119788d8f051819af9114f044d43aca406e0 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000010.json @@ -0,0 +1,440 @@ +{ + "sample_id": "sample_000010", + "target_total": 9, + "target_people": 8, + "target_objects": 1, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 9, + "n_detected": 9, + "n_subjects": 9, + "subjects": [ + { + "name": "pedestrian_black_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", + "source_name": "pedestrian", + "source_description": "Person walking away, wearing a black jacket and dark pants. Source dataset: CrowdHuman. Scene context: People are walking on a street in front of a large, ornate cathedral.", + "sub_caption": "pedestrian: Person walking away, wearing a black jacket and dark pants.. Scene role: walking away on the sidewalk to the right", + "measured_bbox": [ + 0.8947, + 0.408, + 0.9768, + 0.8 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_black_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_black_jacket_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_black_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_black_jacket_attempt_01.png", + "output": "references/ref_pedestrian_black_jacket.png", + "mask": "references/sam_mask_pedestrian_black_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 25.0, + 691.0, + 1014.0 + ], + "mask_score": 3.419812, + "mask_area_ratio": 0.160983, + "elapsed_seconds": 8.2172 + } + }, + { + "name": "pedestrian_backpack", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", + "source_name": "pedestrian", + "source_description": "A man partially visible behind the man in the black polo shirt, wearing a blue t-shirt and a backpack. Source dataset: CrowdHuman. Scene context: A group of people standing on a paved street in front of stone buildings, seemingly listening to a tour guide.", + "sub_caption": "pedestrian: A man wearing a blue t-shirt and a backpack.. Scene role: standing near the curb looking toward the road", + "measured_bbox": [ + 0.6953, + 0.4394, + 0.7156, + 0.5151 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_backpack.png", + "raw_ref_image": "references/raw_ref_pedestrian_backpack_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_backpack.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_backpack_attempt_01.png", + "output": "references/ref_pedestrian_backpack.png", + "mask": "references/sam_mask_pedestrian_backpack.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 344.0, + 30.0, + 689.0, + 1018.0 + ], + "mask_score": 3.422455, + "mask_area_ratio": 0.157988, + "elapsed_seconds": 8.1451 + } + }, + { + "name": "pedestrian_red_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", + "source_name": "pedestrian standing", + "source_description": "A person standing near a kiosk, wearing a bright red jacket. Source dataset: CrowdHuman. Scene context: A large outdoor public square with a fountain, surrounded by buildings and trees, with many pedestrians walking around and climbing a wide set of marble stairs in the foreground.", + "sub_caption": "pedestrian standing: A person standing, wearing a bright red jacket.. Scene role: waiting at a crosswalk edge", + "measured_bbox": [ + 0.4504, + 0.4033, + 0.474, + 0.5253 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_red_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_red_jacket_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_red_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_red_jacket_attempt_01.png", + "output": "references/ref_pedestrian_red_jacket.png", + "mask": "references/sam_mask_pedestrian_red_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 346.0, + 92.0, + 677.0, + 984.0 + ], + "mask_score": 3.472322, + "mask_area_ratio": 0.129704, + "elapsed_seconds": 9.5973 + } + }, + { + "name": "pedestrian_striped_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", + "source_name": "pedestrian", + "source_description": "Person in the background wearing a striped shirt and dark pants. Source dataset: CrowdHuman. Scene context: A busy city square with pedestrians, bicycles, and a yellow tram on a sunny day.", + "sub_caption": "pedestrian: Person wearing a striped shirt and dark pants.. Scene role: strolling along the sidewalk", + "measured_bbox": [ + 0.7269, + 0.3947, + 0.7711, + 0.5853 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_striped_shirt.png", + "raw_ref_image": "references/raw_ref_pedestrian_striped_shirt_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_striped_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_striped_shirt_attempt_01.png", + "output": "references/ref_pedestrian_striped_shirt.png", + "mask": "references/sam_mask_pedestrian_striped_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 338.0, + 11.0, + 687.0, + 1018.0 + ], + "mask_score": 3.206288, + "mask_area_ratio": 0.147885, + "elapsed_seconds": 8.1875 + } + }, + { + "name": "man_pink_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", + "source_name": "man talking to young man", + "source_description": "Standing next to the young man with the backpack, wearing a pink shirt and dark shorts. Source dataset: CrowdHuman. Scene context: A bustling city street corner with people walking and standing around, surrounded by buildings with large digital billboards and store signs, with a few vehicles including a prominent red SUV passing through.", + "sub_caption": "man talking to young man: Man wearing a pink shirt and dark shorts.. Scene role: conversing near a storefront on the sidewalk", + "measured_bbox": [ + 0.8332, + 0.3734, + 0.8735, + 0.5918 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_man_pink_shirt.png", + "raw_ref_image": "references/raw_ref_man_pink_shirt_attempt_01.png", + "reference_verify": "references/reference_verify_man_pink_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_pink_shirt_attempt_01.png", + "output": "references/ref_man_pink_shirt.png", + "mask": "references/sam_mask_man_pink_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 340.0, + 42.0, + 689.0, + 995.0 + ], + "mask_score": 3.442738, + "mask_area_ratio": 0.146916, + "elapsed_seconds": 8.1734 + } + }, + { + "name": "pedestrian_light_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", + "source_name": "pedestrian", + "source_description": "Individual in the crowd near the intersection, wearing a light-colored jacket. Source dataset: CrowdHuman. Scene context: A bustling city street at night, brightly illuminated by numerous large electronic billboards on tall buildings, with heavy vehicle and pedestrian traffic below.", + "sub_caption": "pedestrian: Individual wearing a light-colored jacket.. Scene role: approaching the street intersection", + "measured_bbox": [ + 0.6065, + 0.3907, + 0.6375, + 0.4907 + ], + "detection_confidence": "high", + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_light_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_light_jacket_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_light_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_light_jacket_attempt_01.png", + "output": "references/ref_pedestrian_light_jacket.png", + "mask": "references/sam_mask_pedestrian_light_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 28.0, + 681.0, + 1013.0 + ], + "mask_score": 3.460161, + "mask_area_ratio": 0.163844, + "elapsed_seconds": 9.6744 + } + }, + { + "name": "pedestrian_light_blue_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", + "source_name": "pedestrian", + "source_description": "A person in a light blue shirt walking away from the camera. Source dataset: CrowdHuman. Scene context: A wide, tree-lined avenue crowded with many pedestrians walking in both directions, adorned with red banners on the trees under a clear blue sky.", + "sub_caption": "pedestrian: A person in a light blue shirt walking away from the camera.. Scene role: walking further down the sidewalk in the mid-ground", + "measured_bbox": [ + 0.9459, + 0.3895, + 0.9964, + 0.6538 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_light_blue_shirt.png", + "raw_ref_image": "references/raw_ref_pedestrian_light_blue_shirt_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_light_blue_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_light_blue_shirt_attempt_01.png", + "output": "references/ref_pedestrian_light_blue_shirt.png", + "mask": "references/sam_mask_pedestrian_light_blue_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 357.0, + 33.0, + 693.0, + 1012.0 + ], + "mask_score": 3.452806, + "mask_area_ratio": 0.153078, + "elapsed_seconds": 9.6513 + } + }, + { + "name": "distant_pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_21/283991,2145f000467f0a28.jpg:person:14", + "source_name": "distant pedestrian", + "source_description": "A person walking in the background. Source dataset: CrowdHuman. Scene context: People walk through an outdoor plaza area with modern architecture, an outdoor seating section with red chairs on the left, and planters with yellow and blue flowers on the right.", + "sub_caption": "distant pedestrian: A person walking in the background.. Scene role: walking in the far background down the street", + "measured_bbox": [ + 0.6066, + 0.3904, + 0.6375, + 0.489 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_distant_pedestrian.png", + "raw_ref_image": "references/raw_ref_distant_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_distant_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_distant_pedestrian_attempt_01.png", + "output": "references/ref_distant_pedestrian.png", + "mask": "references/sam_mask_distant_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 360.0, + 88.0, + 677.0, + 997.0 + ], + "mask_score": 3.489431, + "mask_area_ratio": 0.138401, + "elapsed_seconds": 8.1869 + } + }, + { + "name": "black_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b8fe1054-42625c45:object:1", + "source_name": "black suv", + "source_description": "A black SUV parked ahead of the silver SUV on the right. Source dataset: BDD100K. Scene context: View from a vehicle driving down a cracked city street lined with parked cars, buildings, and trees on a sunny day.", + "sub_caption": "black suv: A black SUV parked ahead on the right.. Scene role: parked at the curb on the right side of the street", + "measured_bbox": [ + 0.4391, + 0.47, + 0.6899, + 0.8264 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_black_suv.png", + "raw_ref_image": "references/raw_ref_black_suv_attempt_01.png", + "reference_verify": "references/reference_verify_black_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_black_suv_attempt_01.png", + "output": "references/ref_black_suv.png", + "mask": "references/sam_mask_black_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 190.0, + 1007.0, + 843.0 + ], + "mask_score": 3.120914, + "mask_area_ratio": 0.384048, + "elapsed_seconds": 9.8714 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000011.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000011.json new file mode 100644 index 0000000000000000000000000000000000000000..4006fd2caa2095600937e000701803693f3e6331 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000011.json @@ -0,0 +1,486 @@ +{ + "sample_id": "sample_000011", + "target_total": 10, + "target_people": 1, + "target_objects": 9, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 10, + "n_detected": 10, + "n_subjects": 10, + "subjects": [ + { + "name": "blonde_woman", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_8/273275,44ab30007bea86d2.jpg:person:5", + "source_name": "woman in foreground", + "source_description": "A woman with blonde hair, seen in profile in the bottom center of the foreground. Source dataset: CrowdHuman. Scene context: A large crowd of people is gathered on a city street, many standing behind metal barricades, while some are taking photos.", + "sub_caption": "woman in foreground: A woman with blonde hair, seen in profile.. Scene role: walking along the right sidewalk under the street lamps", + "measured_bbox": [ + 0.7873, + 0.3886, + 0.8283, + 0.5843 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_blonde_woman.png", + "raw_ref_image": "references/raw_ref_blonde_woman_attempt_02.png", + "reference_verify": "references/reference_verify_blonde_woman.json", + "reference_verify_passed": true, + "reference_attempts": 2, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_blonde_woman_attempt_02.png", + "output": "references/ref_blonde_woman.png", + "mask": "references/sam_mask_blonde_woman.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 347.0, + 60.0, + 686.0, + 982.0 + ], + "mask_score": 3.476833, + "mask_area_ratio": 0.13921, + "elapsed_seconds": 10.126 + } + }, + { + "name": "metal_structure", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_54/273278,11abb000d12e48e5.jpg:object:5", + "source_name": "metal structure", + "source_description": "A tall, rectangular grey metal box or pillar on the right side, which one man is holding onto. Source dataset: CrowdHuman. Scene context: A crowded subway station with a metal barrier, where two people are climbing over the barrier while others stand by.", + "sub_caption": "metal structure: A tall, rectangular grey metal box or pillar.. Scene role: situated on the edge of the sidewalk as a utility box", + "measured_bbox": [ + 0.8171, + 0.3117, + 0.944, + 0.5699 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_metal_structure.png", + "raw_ref_image": "references/raw_ref_metal_structure_attempt_01.png", + "reference_verify": "references/reference_verify_metal_structure.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_metal_structure_attempt_01.png", + "output": "references/ref_metal_structure.png", + "mask": "references/sam_mask_metal_structure.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 91.0, + 32.0, + 932.0, + 1001.0 + ], + "mask_score": 3.477494, + "mask_area_ratio": 0.534141, + "elapsed_seconds": 8.3774 + } + }, + { + "name": "overhead_streetlights", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bd1b8b79-829e787f:object:5", + "source_name": "streetlights", + "source_description": "Tall poles with bright lights illuminating the street from above, casting long reflections on the wet pavement. Source dataset: BDD100K. Scene context: Nighttime driving scene on a wet city street with streetlights reflecting on the road.", + "sub_caption": "streetlights: Tall poles with bright lights illuminating the street from above, casting long reflections on the wet pavement.. Scene role: providing overhead illumination for the wet road", + "measured_bbox": [ + 0.5014, + 0.0563, + 0.5225, + 0.1017 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_overhead_streetlights.png", + "raw_ref_image": "references/raw_ref_overhead_streetlights_attempt_03.png", + "reference_verify": "references/reference_verify_overhead_streetlights.json", + "reference_verify_passed": true, + "reference_attempts": 3, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_overhead_streetlights_attempt_03.png", + "output": "references/ref_overhead_streetlights.png", + "mask": "references/sam_mask_overhead_streetlights.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 18.0, + 185.0, + 995.0, + 821.0 + ], + "mask_score": 3.398942, + "mask_area_ratio": 0.279411, + "elapsed_seconds": 8.219 + } + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bba4ee51-3badc9f8:object:6", + "source_name": "silver car", + "source_description": "Silver car parked further down the street on the right. Source dataset: BDD100K. Scene context: View from inside a car driving down a residential street lined with parked cars and trees.", + "sub_caption": "silver car: A silver car.. Scene role: parked further down the street on the right curbside", + "measured_bbox": [ + 0.5294, + 0.4494, + 0.6488, + 0.6006 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 47.0, + 245.0, + 984.0, + 779.0 + ], + "mask_score": 3.398036, + "mask_area_ratio": 0.279834, + "elapsed_seconds": 8.4126 + } + }, + { + "name": "white_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c5e32cf6-7d2e04b4:object:0", + "source_name": "white suv", + "source_description": "A white Honda CR-V parked or stopped in the rightmost lane, showing its rear passenger side. Source dataset: BDD100K. Scene context: A view from a car driving down a multi-lane city street with parked cars on the right, oncoming traffic on the left, and a highway overpass in the distance on a sunny day.", + "sub_caption": "white suv: A white SUV showing its rear passenger side.. Scene role: stopped or parked in the rightmost lane ahead", + "measured_bbox": [ + 0.5082, + 0.4246, + 0.5826, + 0.5552 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_suv.png", + "raw_ref_image": "references/raw_ref_white_suv_attempt_01.png", + "reference_verify": "references/reference_verify_white_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_white_suv_attempt_01.png", + "output": "references/ref_white_suv.png", + "mask": "references/sam_mask_white_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 26.0, + 237.0, + 1002.0, + 800.0 + ], + "mask_score": 3.457781, + "mask_area_ratio": 0.315623, + "elapsed_seconds": 10.4584 + } + }, + { + "name": "background_street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_15/273278,8d2ae00027075d75.jpg:object:5", + "source_name": "street light", + "source_description": "bright street lights illuminating the area in the background Source dataset: CrowdHuman. Scene context: A group of people wearing athletic clothing are posed for a group photo outdoors at night.", + "sub_caption": "street light: Bright street lights.. Scene role: illuminating the distant background area of the street", + "measured_bbox": [ + 0.5004, + 0.0592, + 0.5237, + 0.0998 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_background_street_light.png", + "raw_ref_image": "references/raw_ref_background_street_light_attempt_01.png", + "reference_verify": "references/reference_verify_background_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_background_street_light_attempt_01.png", + "output": "references/ref_background_street_light.png", + "mask": "references/sam_mask_background_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 152.0, + 95.0, + 875.0, + 938.0 + ], + "mask_score": 3.437329, + "mask_area_ratio": 0.065212, + "elapsed_seconds": 8.1426 + } + }, + { + "name": "oncoming_white_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be3159f3-13250ffe:object:2", + "source_name": "white car", + "source_description": "A white car driving toward the camera in the oncoming lane, further down the road. Source dataset: BDD100K. Scene context: A daytime street view from a vehicle approaching an intersection with traffic lights and several other cars.", + "sub_caption": "white car: A white car driving toward the camera with its headlights visible.. Scene role: active oncoming traffic in the left lane", + "measured_bbox": [ + 0.1529, + 0.4523, + 0.2891, + 0.5849 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_oncoming_white_car.png", + "raw_ref_image": "references/raw_ref_oncoming_white_car_attempt_01.png", + "reference_verify": "references/reference_verify_oncoming_white_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_oncoming_white_car_attempt_01.png", + "output": "references/ref_oncoming_white_car.png", + "mask": "references/sam_mask_oncoming_white_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 13.0, + 304.0, + 1011.0, + 819.0 + ], + "mask_score": 3.453796, + "mask_area_ratio": 0.299096, + "elapsed_seconds": 8.2507 + } + }, + { + "name": "windshield_mount", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c5694077-e345e2a8:object:12", + "source_name": "windshield mount", + "source_description": "A black mount attached to the inside of the windshield, partially obscuring the view. Source dataset: BDD100K. Scene context: A view from inside a car driving down a busy city street flanked by tall buildings, with various vehicles including cars, a delivery truck, and a cyclist in the foreground.", + "sub_caption": "windshield mount: A black mount attached to the inside of the windshield.. Scene role: partially obscuring the top view, framing the dashcam perspective", + "measured_bbox": [ + 0.0, + 0.0, + 0.4767, + 0.1353 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_windshield_mount.png", + "raw_ref_image": "references/raw_ref_windshield_mount_attempt_01.png", + "reference_verify": "references/reference_verify_windshield_mount.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_windshield_mount_attempt_01.png", + "output": "references/ref_windshield_mount.png", + "mask": "references/sam_mask_windshield_mount.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 88.0, + 131.0, + 967.0, + 895.0 + ], + "mask_score": 3.43205, + "mask_area_ratio": 0.236237, + "elapsed_seconds": 8.17 + } + }, + { + "name": "ego_car_dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c187431f-7b6ad6d6:object:0", + "source_name": "ego car dashboard", + "source_description": "The dark, lower foreground showing part of the dashboard and hood of the vehicle recording the video, with red reflections from taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a multi-lane highway with moderate traffic, showing vehicles ahead and green overhead highway signs.", + "sub_caption": "ego car dashboard: The dark, lower foreground showing part of the dashboard and hood of the vehicle, with red ambient reflections.. Scene role: anchors the bottom of the frame, establishing the driver's perspective", + "measured_bbox": [ + 0.0, + 0.7306, + 1.0, + 1.0 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_ego_car_dashboard.png", + "raw_ref_image": "references/raw_ref_ego_car_dashboard_attempt_01.png", + "reference_verify": "references/reference_verify_ego_car_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_ego_car_dashboard_attempt_01.png", + "output": "references/ref_ego_car_dashboard.png", + "mask": "references/sam_mask_ego_car_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 327.0, + 1023.0, + 788.0 + ], + "mask_score": 3.233951, + "mask_area_ratio": 0.206886, + "elapsed_seconds": 10.2609 + } + }, + { + "name": "vintage_street_lamp", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_29/273275,2032200056dda99e.jpg:object:0", + "source_name": "street lamp", + "source_description": "Ornate, black, vintage-style street lamp post. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking along a sidewalk lined with shops and tall, leafy trees.", + "sub_caption": "street lamp: An ornate, black, vintage-style street lamp post.. Scene role: providing decorative lighting on the right sidewalk next to the pedestrian", + "measured_bbox": [ + 0.7202, + 0.0, + 0.7633, + 0.6024 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_vintage_street_lamp.png", + "raw_ref_image": "references/raw_ref_vintage_street_lamp_attempt_01.png", + "reference_verify": "references/reference_verify_vintage_street_lamp.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_vintage_street_lamp_attempt_01.png", + "output": "references/ref_vintage_street_lamp.png", + "mask": "references/sam_mask_vintage_street_lamp.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 343.0, + 0.0, + 599.0, + 1023.0 + ], + "mask_score": 3.457917, + "mask_area_ratio": 0.047438, + "elapsed_seconds": 8.3114 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/bbox_overlay.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..ef2ccb8a0afdda175edbe6c52ab97882f4e35fde --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99979e9627c64edb0169e8632b114cfc83b78f7737f715e1ebad7ab41e424cde +size 1641957 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/compose_prompt.txt b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..32b3406f2bc0bdc06b86a9226c85131cbbcdc158 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/compose_prompt.txt @@ -0,0 +1,63 @@ +Render the following JSON scene specification as a photorealistic 1248x832 image using a true 3:2 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "An overcast urban street with parallel parking spaces on the side and a section of the sidewalk bordered by a temporary barricade.", + "activity": "A pedestrian is walking down the sidewalk near a parked dark car and a metal barricade.", + "composition": "Eye-level camera from the road perspective, looking toward the curb. The parked car is on the right side of the street, a metal barrier is placed along the curb edge nearby, and the pedestrian is in the mid-ground, visible walking on the sidewalk.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", + "source_name": "pedestrian", + "description": "A person wearing a dark coat and trousers.", + "role_in_scene": "Walking on the sidewalk alongside the street." + } + ], + "objects": [ + { + "name": "parked_dark_car", + "source_index": 1, + "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9", + "source_name": "parked dark car", + "description": "A dark-colored sedan.", + "role_in_scene": "Parked on the right side of the street next to the curb." + }, + { + "name": "metal_barrier", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", + "source_name": "metal barrier", + "description": "A silver metal barricade.", + "role_in_scene": "Placed along the edge of the sidewalk near the parked car, separating the walkway from the street." + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_metal_barrier.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_metal_barrier.png new file mode 100644 index 0000000000000000000000000000000000000000..5bc5330e56a0b0cd71fb405763381ae34e5a4fe2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_metal_barrier.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2586fa7ca1242298f1c781735748cc84131933e393dca50c7173e8fcc0ef5f21 +size 570005 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_parked_dark_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_parked_dark_car.png new file mode 100644 index 0000000000000000000000000000000000000000..0be7fcbfe82c2afab08b8008c18ad647a9d4e974 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_parked_dark_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34e283f785b7f62a0261c8379fbe947e224c0e128276e1f8880838fe1f0717c0 +size 542408 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_pedestrian.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..28ba5764ef6411816c8344ceda3d97b4450b3e38 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_pedestrian.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e82ae0c08ee16e53703f3c6a3707bdaa4c76dbe1dcb154e02ede8e0820829eb +size 121298 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_metal_barrier.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_metal_barrier.png new file mode 100644 index 0000000000000000000000000000000000000000..45cf359c7c0c04c9e96897eee7713bc3898d949e --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_metal_barrier.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63a6085f6773f7e04bef6e68dde370444b97fcd97da70bdba9d41e7b95a46b83 +size 709134 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_parked_dark_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_parked_dark_car.png new file mode 100644 index 0000000000000000000000000000000000000000..bf2022aac5028ac5777bb723a926f5b57918bd06 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_parked_dark_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e3ad7666a9d36d20ed4d900b79d8c3fc7b731e64ecb425bd4b8e20e4f2ab062 +size 642561 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_pedestrian.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..00d812593f4a8396ebec364823bece9e01163b5e --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_pedestrian.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2159cd06a1384ce46c5a4e8958f4d1d0cae42eee3240a4b616c5da2de4859d71 +size 232053 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/detections.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..fd060959941b0b27f3aa4c895cb382e79edb2030 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/detections.json @@ -0,0 +1,59 @@ +[ + { + "name": "pedestrian", + "present": true, + "bbox": [ + 0.177, + 0.0, + 0.3091, + 0.4552 + ], + "confidence": 0.95, + "notes": "A person wearing a dark coat and trousers.", + "coarse_bbox": [ + 0.178, + 0.0, + 0.309, + 0.451 + ], + "refine_crop": "crops/detect_refine_pedestrian.png" + }, + { + "name": "parked_dark_car", + "present": true, + "bbox": [ + 0.5856, + 0.0522, + 0.9973, + 0.6586 + ], + "confidence": 0.98, + "notes": "Tight bounding box around the visible portion of the dark sedan in the foreground.", + "coarse_bbox": [ + 0.585, + 0.076, + 0.996, + 0.657 + ], + "refine_crop": "crops/detect_refine_parked_dark_car.png" + }, + { + "name": "metal_barrier", + "present": true, + "bbox": [ + 0.0, + 0.1355, + 0.6068, + 0.558 + ], + "confidence": 0.95, + "notes": "The closest and most prominent metal barrier in the image.", + "coarse_bbox": [ + 0.004, + 0.139, + 0.608, + 0.556 + ], + "refine_crop": "crops/detect_refine_metal_barrier.png" + } +] diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/main_image.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..60cc68392496628dbe062b7485ab604eeace5384 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe6867739768fc6c8fef81c55f7104ab94e08962b4ff7d58b746679511f77339 +size 1684157 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/plan.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..dfd501d5af245f3019e83452813b7eeeb1138864 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/plan.json @@ -0,0 +1,107 @@ +{ + "sample_id": "sample_000001", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "An overcast urban street with parallel parking spaces on the side and a section of the sidewalk bordered by a temporary barricade.", + "activity": "A pedestrian is walking down the sidewalk near a parked dark car and a metal barricade.", + "composition": "Eye-level camera from the road perspective, looking toward the curb. The parked car is on the right side of the street, a metal barrier is placed along the curb edge nearby, and the pedestrian is in the mid-ground, visible walking on the sidewalk.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", + "source_name": "pedestrian", + "description": "A person wearing a dark coat and trousers.", + "role_in_scene": "Walking on the sidewalk alongside the street." + } + ], + "objects": [ + { + "name": "parked_dark_car", + "source_index": 1, + "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9", + "source_name": "parked dark car", + "description": "A dark-colored sedan.", + "role_in_scene": "Parked on the right side of the street next to the curb." + }, + { + "name": "metal_barrier", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", + "source_name": "metal barrier", + "description": "A silver metal barricade.", + "role_in_scene": "Placed along the edge of the sidewalk near the parked car, separating the walkway from the street." + } + ] + }, + "expected_subjects": [ + { + "name": "pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", + "source_name": "pedestrian", + "source_description": "A person wearing a dark coat and trousers. Source dataset: CrowdHuman. Scene context: A bustling city street lined with trees showcasing vibrant yellow autumn foliage, with many pedestrians walking in both directions.", + "sub_caption": "pedestrian: A person wearing a dark coat and trousers.. Scene role: Walking on the sidewalk alongside the street.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "parked_dark_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9", + "source_name": "parked dark car", + "source_description": "A dark-colored sedan parked on the right side of the street, behind the silver car. Source dataset: BDD100K. Scene context: A large white New York City bus is driving down a city street on an overcast day with other cars parked and driving around it.", + "sub_caption": "parked dark car: A dark-colored sedan.. Scene role: Parked on the right side of the street next to the curb.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "metal_barrier", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", + "source_name": "metal barrier", + "source_description": "silver metal barricade placed near the entrance Source dataset: CrowdHuman. Scene context: People are gathered outside the entrance of a stone building with arched doorways and large windows.", + "sub_caption": "metal barrier: A silver metal barricade.. Scene role: Placed along the edge of the sidewalk near the parked car, separating the walkway from the street.", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000001/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references.json new file mode 100644 index 0000000000000000000000000000000000000000..6da4bea09c3881563ee5526de21cc11063346ff0 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references.json @@ -0,0 +1,101 @@ +{ + "references": [ + { + "name": "pedestrian", + "ref_image": "references/ref_pedestrian.png", + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_pedestrian_attempt_01.png", + "output": "references/ref_pedestrian.png", + "mask": "references/sam_mask_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 40.0, + 693.0, + 999.0 + ], + "mask_score": 3.438137, + "mask_area_ratio": 0.157722, + "elapsed_seconds": 8.8351 + }, + "reference_verify": "references/reference_verify_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "parked_dark_car", + "ref_image": "references/ref_parked_dark_car.png", + "raw_ref_image": "references/raw_ref_parked_dark_car_attempt_02.png", + "diversify_input": "crops/diversify_input_parked_dark_car.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_parked_dark_car_attempt_02.png", + "output": "references/ref_parked_dark_car.png", + "mask": "references/sam_mask_parked_dark_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 17.0, + 427.0, + 1006.0, + 796.0 + ], + "mask_score": 3.312519, + "mask_area_ratio": 0.186911, + "elapsed_seconds": 8.4991 + }, + "reference_verify": "references/reference_verify_parked_dark_car.json", + "reference_verify_passed": true, + "reference_attempts": 2 + }, + { + "name": "metal_barrier", + "ref_image": "references/ref_metal_barrier.png", + "raw_ref_image": "references/raw_ref_metal_barrier_attempt_01.png", + "diversify_input": "crops/diversify_input_metal_barrier.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_metal_barrier_attempt_01.png", + "output": "references/ref_metal_barrier.png", + "mask": "references/sam_mask_metal_barrier.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 37.0, + 201.0, + 1011.0, + 889.0 + ], + "mask_score": 2.936982, + "mask_area_ratio": 0.305722, + "elapsed_seconds": 9.8709 + }, + "reference_verify": "references/reference_verify_metal_barrier.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_metal_barrier.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_metal_barrier.png new file mode 100644 index 0000000000000000000000000000000000000000..a59c9721395d1c3f286f8633e0d92f4a5c8d7fe0 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_metal_barrier.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d28e0dcdca85332d964acbcdbfd4773b6c8842e516ba3c70ced8233446ffe058 +size 395256 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_parked_dark_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_parked_dark_car.png new file mode 100644 index 0000000000000000000000000000000000000000..fbfa62274db6116a270b6bb8fe703fcb4b7ad6f9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_parked_dark_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a6bd35ba006fe2344469c2be7bb4222fb81cb2a7b66c9e8985e28caf051dda0 +size 367468 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_pedestrian.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..d937525847b1f4a981c8f4bf5be2302a913ee884 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_pedestrian.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d42c2b302e5ebbb4d3e382f93b483b4d4e59f4de78d8163d176e8167138098e +size 273649 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/reference_verify_metal_barrier.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/reference_verify_metal_barrier.json new file mode 100644 index 0000000000000000000000000000000000000000..cbfabc91787497256273726c69ef0a479b61a6c6 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/reference_verify_metal_barrier.json @@ -0,0 +1,46 @@ +{ + "name": "metal_barrier", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_metal_barrier_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_metal_barrier_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_metal_barrier_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_metal_barrier_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_ref_metal_barrier_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_sam_mask_metal_barrier_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 37.0, + 201.0, + 1011.0, + 889.0 + ], + "mask_score": 2.936982, + "mask_area_ratio": 0.305722, + "elapsed_seconds": 9.8709 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete metal barrier on a white background, satisfying all hard requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/reference_verify_parked_dark_car.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/reference_verify_parked_dark_car.json new file mode 100644 index 0000000000000000000000000000000000000000..93e17673f4c49629bf6288335b9862706f1ee20f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/reference_verify_parked_dark_car.json @@ -0,0 +1,88 @@ +{ + "name": "parked_dark_car", + "passed": true, + "accepted_attempt": 2, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_parked_dark_car_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_parked_dark_car_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_parked_dark_car_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_parked_dark_car_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_ref_parked_dark_car_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_sam_mask_parked_dark_car_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 128.0, + 54.0, + 1023.0, + 926.0 + ], + "mask_score": 2.75749, + "mask_area_ratio": 0.554314, + "elapsed_seconds": 8.6259 + }, + "verify": { + "passed": false, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [ + "The vehicle is heavily cropped on the right side and rear, exceeding minor edge cropping.", + "Severe masking artifacts are present on the hood, windshield, and roof of the car." + ], + "notes": "The image shows only the front-left portion of the car and suffers from poor background removal that creates holes in the subject itself." + } + }, + { + "attempt": 2, + "raw_ref_image": "references/raw_ref_parked_dark_car_attempt_02.png", + "candidate_ref_image": "references/candidate_ref_parked_dark_car_attempt_02.png", + "candidate_sam_mask": "references/candidate_sam_mask_parked_dark_car_attempt_02.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_parked_dark_car_attempt_02.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_ref_parked_dark_car_attempt_02.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_sam_mask_parked_dark_car_attempt_02.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 17.0, + 427.0, + 1006.0, + 796.0 + ], + "mask_score": 3.312519, + "mask_area_ratio": 0.186911, + "elapsed_seconds": 8.4991 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete dark sedan isolated on a white background, perfectly matching the subject." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/reference_verify_pedestrian.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/reference_verify_pedestrian.json new file mode 100644 index 0000000000000000000000000000000000000000..dce90cf4fa11144bd0f7384aa5623e2e773bb823 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/reference_verify_pedestrian.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_pedestrian_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_ref_pedestrian_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/candidate_sam_mask_pedestrian_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 40.0, + 693.0, + 999.0 + ], + "mask_score": 3.438137, + "mask_area_ratio": 0.157722, + "elapsed_seconds": 8.8351 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete full-body shot of a single person on a clean white background with no cropping." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/sam_mask_metal_barrier.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/sam_mask_metal_barrier.png new file mode 100644 index 0000000000000000000000000000000000000000..6cfd515b868de7bb948dfeee28cb6ece26558447 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/sam_mask_metal_barrier.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/sam_mask_parked_dark_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/sam_mask_parked_dark_car.png new file mode 100644 index 0000000000000000000000000000000000000000..72f0478d637822d4d1d84ae8c24a3105f79167f7 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/sam_mask_parked_dark_car.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/sam_mask_pedestrian.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/sam_mask_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..415251e79529852b993206678a9ac88924f3062d Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/sam_mask_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/row.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/row.json new file mode 100644 index 0000000000000000000000000000000000000000..b58924ea676d48cf68e5a36b20fecd5e98653637 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/row.json @@ -0,0 +1,164 @@ +{ + "sample_id": "sample_000001", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 3, + "n_detected": 3, + "n_subjects": 3, + "subjects": [ + { + "name": "pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", + "source_name": "pedestrian", + "source_description": "A person wearing a dark coat and trousers. Source dataset: CrowdHuman. Scene context: A bustling city street lined with trees showcasing vibrant yellow autumn foliage, with many pedestrians walking in both directions.", + "sub_caption": "pedestrian: A person wearing a dark coat and trousers.. Scene role: Walking on the sidewalk alongside the street.", + "measured_bbox": [ + 0.177, + 0.0, + 0.3091, + 0.4552 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian.png", + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_pedestrian_attempt_01.png", + "output": "references/ref_pedestrian.png", + "mask": "references/sam_mask_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 40.0, + 693.0, + 999.0 + ], + "mask_score": 3.438137, + "mask_area_ratio": 0.157722, + "elapsed_seconds": 8.8351 + } + }, + { + "name": "parked_dark_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9", + "source_name": "parked dark car", + "source_description": "A dark-colored sedan parked on the right side of the street, behind the silver car. Source dataset: BDD100K. Scene context: A large white New York City bus is driving down a city street on an overcast day with other cars parked and driving around it.", + "sub_caption": "parked dark car: A dark-colored sedan.. Scene role: Parked on the right side of the street next to the curb.", + "measured_bbox": [ + 0.5856, + 0.0522, + 0.9973, + 0.6586 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_parked_dark_car.png", + "raw_ref_image": "references/raw_ref_parked_dark_car_attempt_02.png", + "reference_verify": "references/reference_verify_parked_dark_car.json", + "reference_verify_passed": true, + "reference_attempts": 2, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_parked_dark_car_attempt_02.png", + "output": "references/ref_parked_dark_car.png", + "mask": "references/sam_mask_parked_dark_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 17.0, + 427.0, + 1006.0, + 796.0 + ], + "mask_score": 3.312519, + "mask_area_ratio": 0.186911, + "elapsed_seconds": 8.4991 + } + }, + { + "name": "metal_barrier", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", + "source_name": "metal barrier", + "source_description": "silver metal barricade placed near the entrance Source dataset: CrowdHuman. Scene context: People are gathered outside the entrance of a stone building with arched doorways and large windows.", + "sub_caption": "metal barrier: A silver metal barricade.. Scene role: Placed along the edge of the sidewalk near the parked car, separating the walkway from the street.", + "measured_bbox": [ + 0.0, + 0.1355, + 0.6068, + 0.558 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_metal_barrier.png", + "raw_ref_image": "references/raw_ref_metal_barrier_attempt_01.png", + "reference_verify": "references/reference_verify_metal_barrier.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_metal_barrier_attempt_01.png", + "output": "references/ref_metal_barrier.png", + "mask": "references/sam_mask_metal_barrier.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 37.0, + 201.0, + 1011.0, + 889.0 + ], + "mask_score": 2.936982, + "mask_area_ratio": 0.305722, + "elapsed_seconds": 9.8709 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/vocab_task.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..f4747140e18fe3ab140d40f1cd193eccaabcb8cf --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/vocab_task.json @@ -0,0 +1,56 @@ +{ + "task_id": "sample_000001", + "sample_id": "sample_000001", + "sample_index": 1, + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 13962, + "image_id": "CrowdHuman:data/data_13/282555,65d1d00050480dce.jpg:person:2", + "name": "toddler", + "description": "A young child with short brown hair, wearing a light blue patterned sweater, being carried by the woman in blue. Source dataset: CrowdHuman. Scene context: Two women, one carrying a toddler and the other walking hand-in-hand with a young girl, are crossing a street with parked cars in the background." + }, + { + "candidate_index": 1, + "source_offset": 171812, + "image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3", + "name": "pedestrian", + "description": "A person wearing a dark coat and trousers. Source dataset: CrowdHuman. Scene context: A bustling city street lined with trees showcasing vibrant yellow autumn foliage, with many pedestrians walking in both directions." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 129279, + "image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5", + "name": "metal barrier", + "description": "silver metal barricade placed near the entrance Source dataset: CrowdHuman. Scene context: People are gathered outside the entrance of a stone building with arched doorways and large windows." + }, + { + "candidate_index": 1, + "source_offset": 182609, + "image_id": "BDD100K:bcb356f6-520dd65c:object:9", + "name": "parked dark car", + "description": "A dark-colored sedan parked on the right side of the street, behind the silver car. Source dataset: BDD100K. Scene context: A large white New York City bus is driving down a city street on an overcast day with other cars parked and driving around it." + }, + { + "candidate_index": 2, + "source_offset": 92980, + "image_id": "CrowdHuman:data/data_56/273278,97d7f00040d24761.jpg:object:4", + "name": "paved path", + "description": "A textured paved walkway the group is walking on. Source dataset: CrowdHuman. Scene context: A group of five adults walking together and conversing in a park-like outdoor setting." + }, + { + "candidate_index": 3, + "source_offset": 83796, + "image_id": "CrowdHuman:data/data_51/273278,23a1a000c26da45e.jpg:object:0", + "name": "plastic bag", + "description": "White plastic shopping bag held by the seated woman. Source dataset: CrowdHuman. Scene context: People are standing and sitting inside a brightly lit train or subway car." + } + ], + "rng_seed": 1782032722, + "created_at": 1782223460.188576 +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/bbox_overlay.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..43f9fe946f0f887773e0deab97227f98de3ac913 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92f8922574263aaedc713b37b52f628ff62482d0cd076dfddc5c5921ec8cf8de +size 1382507 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/compose_prompt.txt b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2b91187ec99a4dd2ff10b308a45ab76397fd069 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/compose_prompt.txt @@ -0,0 +1,159 @@ +Render the following JSON scene specification as a photorealistic 1280x720 image using a true 16:9 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "A multi-lane city street at twilight, viewed from a dashcam inside a moving vehicle.", + "activity": "Vehicles are driving and parked along the road while pedestrians walk near illuminated storefronts under a fading sky.", + "composition": "Wide-angle perspective originating from the vehicle dashboard at the bottom frame. The road stretches into the center, separated by double yellow lines, with depth built through receding cars, pedestrians, street lights, and trees framing the sides.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_right", + "source_index": 0, + "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2", + "source_name": "pedestrian", + "description": "Person walking away from the camera on the right sidewalk, wearing dark clothing.", + "role_in_scene": "Walking along the right-hand sidewalk under the trees." + }, + { + "name": "pedestrian_left", + "source_index": 2, + "source_image_id": "BDD100K:b714a088-861a043b:person:2", + "source_name": "pedestrian", + "description": "A person near a shop entrance on the left, partially obscured by shadows.", + "role_in_scene": "Standing near a building entrance on the left side of the street." + } + ], + "objects": [ + { + "name": "city_buildings", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", + "source_name": "building", + "description": "Various city buildings of different heights forming the urban landscape along the street.", + "role_in_scene": "Lining the street and forming the architectural background on both sides." + }, + { + "name": "pink_scooter", + "source_index": 5, + "source_image_id": "CrowdHuman:data/data_66/283991,a60c0001c08f63b.jpg:object:3", + "source_name": "pink scooter", + "description": "A prominent pink step-through style motor scooter.", + "role_in_scene": "Parked on the right sidewalk near the street signs." + }, + { + "name": "street_signs", + "source_index": 6, + "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8", + "source_name": "street signs", + "description": "Various street signs attached to a metal pole on the right side of the street.", + "role_in_scene": "Mounted on a pole alongside the road on the right." + }, + { + "name": "storefront_sign", + "source_index": 7, + "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", + "source_name": "storefront sign", + "description": "A dark, illuminated sign structure above a shop entrance.", + "role_in_scene": "Hanging above a shop entrance on the left side of the street, illuminating the adjacent pedestrian." + }, + { + "name": "parked_suv_right", + "source_index": 11, + "source_image_id": "BDD100K:c54441e6-400c221e:object:4", + "source_name": "parked SUV", + "description": "Dark-colored SUV parked on the right side of the road.", + "role_in_scene": "Parked parallel to the curb on the right side of the street." + }, + { + "name": "dark_car_left", + "source_index": 12, + "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", + "source_name": "car", + "description": "A dark car parked along the left curb further ahead.", + "role_in_scene": "Parked alongside the left curb." + }, + { + "name": "dark_suv_driving", + "source_index": 14, + "source_image_id": "BDD100K:c889c950-865ca5b6:object:0", + "source_name": "dark SUV", + "description": "A dark SUV traveling in the left lane, with visible glowing red taillights.", + "role_in_scene": "Driving ahead of the camera vehicle in the opposing or adjacent left lane." + }, + { + "name": "street_light", + "source_index": 15, + "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2", + "source_name": "street light", + "description": "Tall pole with a bright light on top, illuminating the road.", + "role_in_scene": "Casting warm light onto the street from the right-hand sidewalk." + }, + { + "name": "vehicle_dashboard", + "source_index": 16, + "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9", + "source_name": "dashboard", + "description": "The dark, reflective dashboard and lower windshield area of the camera vehicle.", + "role_in_scene": "Occupying the bottom foreground of the image, establishing the perspective from inside the car." + }, + { + "name": "white_car_ahead", + "source_index": 20, + "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", + "source_name": "white car", + "description": "A white car visible further down the road.", + "role_in_scene": "Driving away in the right lane, further in the distance." + }, + { + "name": "double_yellow_lines", + "source_index": 22, + "source_image_id": "BDD100K:c417a291-7802692d:object:8", + "source_name": "yellow lines", + "description": "Double yellow painted lines separating opposite directions of traffic.", + "role_in_scene": "Running down the center of the road, receding into the distance." + }, + { + "name": "street_trees", + "source_index": 23, + "source_image_id": "BDD100K:c4891df0-24371ae1:object:3", + "source_name": "trees", + "description": "Numerous trees with dense foliage lining both sides of the road.", + "role_in_scene": "Planted along the sidewalks, softening the urban environment and framing the street." + }, + { + "name": "twilight_sky", + "source_index": 25, + "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9", + "source_name": "sky", + "description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top.", + "role_in_scene": "Providing the atmospheric backdrop above the buildings and street." + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_city_buildings.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_city_buildings.png new file mode 100644 index 0000000000000000000000000000000000000000..2fb0a0c48a38c1078585dd0732985c2e87e4af53 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_city_buildings.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_dark_car_left.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_dark_car_left.png new file mode 100644 index 0000000000000000000000000000000000000000..db3c681a492b957c64318f32b3fb5e48259fda7f Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_dark_car_left.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_dark_suv_driving.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_dark_suv_driving.png new file mode 100644 index 0000000000000000000000000000000000000000..60dc46e5d2d93b5c37cd8b4eaeb47e6e00044580 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_dark_suv_driving.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_double_yellow_lines.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_double_yellow_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..d2232b8adc68e6451e1f62c547cd9fbabf97e68c Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_double_yellow_lines.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_parked_suv_right.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_parked_suv_right.png new file mode 100644 index 0000000000000000000000000000000000000000..ea6311cb29fedf2f4383d4e6702dd893c8a7a7d8 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_parked_suv_right.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_pedestrian_left.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_pedestrian_left.png new file mode 100644 index 0000000000000000000000000000000000000000..ec8cfd8357762700a06461e27b383c89d4165afc Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_pedestrian_left.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_pedestrian_right.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_pedestrian_right.png new file mode 100644 index 0000000000000000000000000000000000000000..75c587d22b432644e208c46145cf786c9eb97c4b Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_pedestrian_right.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_pink_scooter.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_pink_scooter.png new file mode 100644 index 0000000000000000000000000000000000000000..ca8bf7789448bb77c2f2f021beec6a548da79ca3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_pink_scooter.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_storefront_sign.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_storefront_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..c81e664090aa2deb8322f68604d9929bab75c8e3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_storefront_sign.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_street_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..1308f1ad9f1d49a052f3c676f7c9d542c16493bb Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_street_signs.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_street_signs.png new file mode 100644 index 0000000000000000000000000000000000000000..5311c898b143258f6c4421724836da58016f318a Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_street_signs.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_street_trees.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_street_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..5a625a32a92066ba5fcf52bd7254dbe4fde7cf1b --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_street_trees.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d41a45f30fffb52bcd099edf6547b26d89397ee26f8e133a60f3209d7c531c2 +size 456453 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_twilight_sky.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_twilight_sky.png new file mode 100644 index 0000000000000000000000000000000000000000..1492b7d7c3578fa83e6375f920cf78530ffe3e8c --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_twilight_sky.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30d8f74bcba4a8814550f9a27eac23b65b97d7e4e037da8c8adf893125c5bc48 +size 889258 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_vehicle_dashboard.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_vehicle_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..8739da93b769a565fd4bf563fc601bd6e629e860 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_vehicle_dashboard.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b94217a22582105bd0585d90f8b864a60d485ba4c6d598d075c3524adb79b971 +size 300755 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_white_car_ahead.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_white_car_ahead.png new file mode 100644 index 0000000000000000000000000000000000000000..8c415b7f87c0ae8847b25cd4cc8a052e4640a846 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_white_car_ahead.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_city_buildings.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_city_buildings.png new file mode 100644 index 0000000000000000000000000000000000000000..cf5f0ad868cdbbfddb9bf375445eb74d61e00f77 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_city_buildings.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_dark_car_left.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_dark_car_left.png new file mode 100644 index 0000000000000000000000000000000000000000..c9b1e77d71c58dd65a018f97bbd0729b723c340d Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_dark_car_left.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_dark_suv_driving.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_dark_suv_driving.png new file mode 100644 index 0000000000000000000000000000000000000000..947e335ba8f51a97b4dfc64ff8ef4720e2024121 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_dark_suv_driving.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_double_yellow_lines.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_double_yellow_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..140b534ff9c125ca7e63bd5f38039b718c8f9218 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_double_yellow_lines.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:902f03a03972a998c14115a3bb844f071fde86f4f4ad39a241cb93b66f72ac7f +size 127646 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_parked_suv_right.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_parked_suv_right.png new file mode 100644 index 0000000000000000000000000000000000000000..806fee321b2b9cd2c7b1fd404be78cadf7a2a2ac Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_parked_suv_right.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_pedestrian_left.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_pedestrian_left.png new file mode 100644 index 0000000000000000000000000000000000000000..51007b371056cb972d18a4f385e72bedd50760d8 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_pedestrian_left.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_pedestrian_right.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_pedestrian_right.png new file mode 100644 index 0000000000000000000000000000000000000000..d6bc4606643f3597ba8bc8b9714bcaed86e05de5 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_pedestrian_right.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_pink_scooter.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_pink_scooter.png new file mode 100644 index 0000000000000000000000000000000000000000..4a48e2dd8d745222e84913674b425235d056139e Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_pink_scooter.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_storefront_sign.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_storefront_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..34c5c8850ef5bdcc18f214423a7e793bb0cb6a60 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_storefront_sign.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_street_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..7e2e0f0b528465a123ce31120675264a84953571 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_street_signs.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_street_signs.png new file mode 100644 index 0000000000000000000000000000000000000000..b81c7dba0cca6228fcf6fa134d95e3201df0af84 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_street_signs.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_street_trees.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_street_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..093756de52d227e79beafc404037ef583c85876f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_street_trees.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76c93152f35cb83cc978d24187c3ff38cd9c200a512951ef673987935b2fc35d +size 520923 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_twilight_sky.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_twilight_sky.png new file mode 100644 index 0000000000000000000000000000000000000000..ba4396fcacf7958acf8e24830e1a4ec46acb497c --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_twilight_sky.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4f7b132f60324c11ee6e440090be5f548e2b57ab509a3f1320a7e69af8f42a8 +size 764555 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_vehicle_dashboard.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_vehicle_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..2ba01dcf53facb484d227a6805478871dd8883fb Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_vehicle_dashboard.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_white_car_ahead.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_white_car_ahead.png new file mode 100644 index 0000000000000000000000000000000000000000..2d19d9ba3cfdee3e891dde8c177dc7166ecacb19 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_white_car_ahead.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/detections.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..d59fd1b08ab32207d8aa93fa774ee85b97e97bba --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/detections.json @@ -0,0 +1,287 @@ +[ + { + "name": "pedestrian_right", + "present": true, + "bbox": [ + 0.8872, + 0.491, + 0.9451, + 0.6701 + ], + "confidence": 0.95, + "notes": "Tight bounding box around the person walking in dark clothing.", + "coarse_bbox": [ + 0.888, + 0.493, + 0.946, + 0.671 + ], + "refine_crop": "crops/detect_refine_pedestrian_right.png" + }, + { + "name": "pedestrian_left", + "present": true, + "bbox": [ + 0.1301, + 0.5154, + 0.1517, + 0.611 + ], + "confidence": 0.95, + "notes": "The cropped image shows a pedestrian matching the description standing near an entrance.", + "coarse_bbox": [ + 0.13, + 0.512, + 0.152, + 0.609 + ], + "refine_crop": "crops/detect_refine_pedestrian_left.png" + }, + { + "name": "city_buildings", + "present": true, + "bbox": [ + 0.3358, + 0.3425, + 0.4929, + 0.5277 + ], + "confidence": 0.95, + "notes": "Bounding box covers the prominent row of city buildings of various heights forming the urban landscape in the background.", + "coarse_bbox": [ + 0.332, + 0.342, + 0.493, + 0.542 + ], + "refine_crop": "crops/detect_refine_city_buildings.png" + }, + { + "name": "pink_scooter", + "present": true, + "bbox": [ + 0.677, + 0.56, + 0.7935, + 0.7095 + ], + "confidence": 0.95, + "notes": "Tight bounding box around the pink scooter.", + "coarse_bbox": [ + 0.678, + 0.574, + 0.794, + 0.711 + ], + "refine_crop": "crops/detect_refine_pink_scooter.png" + }, + { + "name": "street_signs", + "present": true, + "bbox": [ + 0.8162, + 0.2869, + 0.8575, + 0.4063 + ], + "confidence": 0.99, + "notes": "street sign, perfectly visible.", + "coarse_bbox": [ + 0.816, + 0.286, + 0.857, + 0.407 + ], + "refine_crop": "crops/detect_refine_street_signs.png" + }, + { + "name": "storefront_sign", + "present": true, + "bbox": [ + 0.1052, + 0.4218, + 0.186, + 0.4781 + ], + "confidence": 0.95, + "notes": "A prominent illuminated sign fills the majority of the crop, displaying text and a graphic. The bounding box tightly encloses the rectangular sign itself, excluding the immediate surrounding dark structure.", + "coarse_bbox": [ + 0.104, + 0.425, + 0.185, + 0.48 + ], + "refine_crop": "crops/detect_refine_storefront_sign.png" + }, + { + "name": "parked_suv_right", + "present": true, + "bbox": [ + 0.6057, + 0.5099, + 0.7451, + 0.6703 + ], + "confidence": 0.95, + "notes": "Tight bounding box around the dark-colored parked SUV.", + "coarse_bbox": [ + 0.605, + 0.509, + 0.741, + 0.675 + ], + "refine_crop": "crops/detect_refine_parked_suv_right.png" + }, + { + "name": "dark_car_left", + "present": true, + "bbox": [ + 0.2139, + 0.5323, + 0.3044, + 0.6201 + ], + "confidence": 0.95, + "notes": "Tight bounding box around the prominent dark SUV in the image.", + "coarse_bbox": [ + 0.215, + 0.535, + 0.303, + 0.616 + ], + "refine_crop": "crops/detect_refine_dark_car_left.png" + }, + { + "name": "dark_suv_driving", + "present": true, + "bbox": [ + 0.3005, + 0.5101, + 0.4179, + 0.6508 + ], + "confidence": 0.98, + "notes": "The dark SUV is clearly visible.", + "coarse_bbox": [ + 0.299, + 0.509, + 0.419, + 0.647 + ], + "refine_crop": "crops/detect_refine_dark_suv_driving.png" + }, + { + "name": "street_light", + "present": true, + "bbox": [ + 0.8171, + 0.1755, + 0.8719, + 0.2202 + ], + "confidence": 0.95, + "notes": "Street light arm and fixture on the pole.", + "coarse_bbox": [ + 0.8, + 0.17, + 0.875, + 0.678 + ], + "refine_crop": "crops/detect_refine_street_light.png" + }, + { + "name": "vehicle_dashboard", + "present": true, + "bbox": [ + 0.0, + 0.9261, + 1.0, + 1.0 + ], + "confidence": "high", + "notes": "The dark, reflective dashboard and lower windshield area of the camera vehicle is clearly visible at the bottom of the crop.", + "coarse_bbox": [ + 0.0, + 0.786, + 1.0, + 1.0 + ], + "refine_crop": "crops/detect_refine_vehicle_dashboard.png" + }, + { + "name": "white_car_ahead", + "present": true, + "bbox": [ + 0.4811, + 0.5382, + 0.5174, + 0.5915 + ], + "confidence": 0.98, + "notes": "The white car occupies the majority of the crop. Bounding box captures the visible extent.", + "coarse_bbox": [ + 0.48, + 0.539, + 0.517, + 0.589 + ], + "refine_crop": "crops/detect_refine_white_car_ahead.png" + }, + { + "name": "double_yellow_lines", + "present": true, + "bbox": [ + 0.3008, + 0.5732, + 0.4776, + 0.8029 + ], + "confidence": 0.98, + "notes": "Double yellow painted lines separating opposite directions of traffic.. Scene role: Running down the center of the road, receding into the distance.", + "coarse_bbox": [ + 0.303, + 0.565, + 0.473, + 0.804 + ], + "refine_crop": "crops/detect_refine_double_yellow_lines.png" + }, + { + "name": "street_trees", + "present": true, + "bbox": [ + 0.001, + 0.002, + 0.375, + 0.63 + ], + "confidence": 0.95, + "notes": "refine failed; using coarse bbox", + "coarse_bbox": [ + 0.001, + 0.002, + 0.375, + 0.63 + ], + "refine_crop": "crops/detect_refine_street_trees.png" + }, + { + "name": "twilight_sky", + "present": true, + "bbox": [ + 0.116, + 0.0, + 0.714, + 0.4742 + ], + "confidence": 0.95, + "notes": "Tight bounding box capturing the visible twilight sky area above the buildings and trees.", + "coarse_bbox": [ + 0.0, + 0.0, + 1.0, + 0.49 + ], + "refine_crop": "crops/detect_refine_twilight_sky.png" + } +] diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/main_image.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..2fff681d46514dc89f472b54c84c043d784c78d6 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0f4902072a5c1ed2821d1a7ccf6155d796954c75b9283c6c726a0b21b1c49e0 +size 1539722 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/plan.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..01d0e0b317ceeb04c8523e82c1322e5c4f08a403 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/plan.json @@ -0,0 +1,335 @@ +{ + "sample_id": "sample_000002", + "target_total": 15, + "target_people": 2, + "target_objects": 13, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "A multi-lane city street at twilight, viewed from a dashcam inside a moving vehicle.", + "activity": "Vehicles are driving and parked along the road while pedestrians walk near illuminated storefronts under a fading sky.", + "composition": "Wide-angle perspective originating from the vehicle dashboard at the bottom frame. The road stretches into the center, separated by double yellow lines, with depth built through receding cars, pedestrians, street lights, and trees framing the sides.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_right", + "source_index": 0, + "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2", + "source_name": "pedestrian", + "description": "Person walking away from the camera on the right sidewalk, wearing dark clothing.", + "role_in_scene": "Walking along the right-hand sidewalk under the trees." + }, + { + "name": "pedestrian_left", + "source_index": 2, + "source_image_id": "BDD100K:b714a088-861a043b:person:2", + "source_name": "pedestrian", + "description": "A person near a shop entrance on the left, partially obscured by shadows.", + "role_in_scene": "Standing near a building entrance on the left side of the street." + } + ], + "objects": [ + { + "name": "city_buildings", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", + "source_name": "building", + "description": "Various city buildings of different heights forming the urban landscape along the street.", + "role_in_scene": "Lining the street and forming the architectural background on both sides." + }, + { + "name": "pink_scooter", + "source_index": 5, + "source_image_id": "CrowdHuman:data/data_66/283991,a60c0001c08f63b.jpg:object:3", + "source_name": "pink scooter", + "description": "A prominent pink step-through style motor scooter.", + "role_in_scene": "Parked on the right sidewalk near the street signs." + }, + { + "name": "street_signs", + "source_index": 6, + "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8", + "source_name": "street signs", + "description": "Various street signs attached to a metal pole on the right side of the street.", + "role_in_scene": "Mounted on a pole alongside the road on the right." + }, + { + "name": "storefront_sign", + "source_index": 7, + "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", + "source_name": "storefront sign", + "description": "A dark, illuminated sign structure above a shop entrance.", + "role_in_scene": "Hanging above a shop entrance on the left side of the street, illuminating the adjacent pedestrian." + }, + { + "name": "parked_suv_right", + "source_index": 11, + "source_image_id": "BDD100K:c54441e6-400c221e:object:4", + "source_name": "parked SUV", + "description": "Dark-colored SUV parked on the right side of the road.", + "role_in_scene": "Parked parallel to the curb on the right side of the street." + }, + { + "name": "dark_car_left", + "source_index": 12, + "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", + "source_name": "car", + "description": "A dark car parked along the left curb further ahead.", + "role_in_scene": "Parked alongside the left curb." + }, + { + "name": "dark_suv_driving", + "source_index": 14, + "source_image_id": "BDD100K:c889c950-865ca5b6:object:0", + "source_name": "dark SUV", + "description": "A dark SUV traveling in the left lane, with visible glowing red taillights.", + "role_in_scene": "Driving ahead of the camera vehicle in the opposing or adjacent left lane." + }, + { + "name": "street_light", + "source_index": 15, + "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2", + "source_name": "street light", + "description": "Tall pole with a bright light on top, illuminating the road.", + "role_in_scene": "Casting warm light onto the street from the right-hand sidewalk." + }, + { + "name": "vehicle_dashboard", + "source_index": 16, + "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9", + "source_name": "dashboard", + "description": "The dark, reflective dashboard and lower windshield area of the camera vehicle.", + "role_in_scene": "Occupying the bottom foreground of the image, establishing the perspective from inside the car." + }, + { + "name": "white_car_ahead", + "source_index": 20, + "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", + "source_name": "white car", + "description": "A white car visible further down the road.", + "role_in_scene": "Driving away in the right lane, further in the distance." + }, + { + "name": "double_yellow_lines", + "source_index": 22, + "source_image_id": "BDD100K:c417a291-7802692d:object:8", + "source_name": "yellow lines", + "description": "Double yellow painted lines separating opposite directions of traffic.", + "role_in_scene": "Running down the center of the road, receding into the distance." + }, + { + "name": "street_trees", + "source_index": 23, + "source_image_id": "BDD100K:c4891df0-24371ae1:object:3", + "source_name": "trees", + "description": "Numerous trees with dense foliage lining both sides of the road.", + "role_in_scene": "Planted along the sidewalks, softening the urban environment and framing the street." + }, + { + "name": "twilight_sky", + "source_index": 25, + "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9", + "source_name": "sky", + "description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top.", + "role_in_scene": "Providing the atmospheric backdrop above the buildings and street." + } + ] + }, + "expected_subjects": [ + { + "name": "pedestrian_right", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2", + "source_name": "pedestrian", + "source_description": "Person walking away from the camera on the right sidewalk, wearing dark clothing. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a wet city street lined with tall buildings, with pedestrians on the sidewalk and several cars and taxis ahead in the rain.", + "sub_caption": "pedestrian: Person walking away from the camera on the right sidewalk, wearing dark clothing.. Scene role: Walking along the right-hand sidewalk under the trees.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_left", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:b714a088-861a043b:person:2", + "source_name": "pedestrian", + "source_description": "another person near the shop entrance on the left, partially obscured Source dataset: BDD100K. Scene context: a city street scene from the perspective of a vehicle, showing multiple cars in motion, buildings lining the road, and pedestrians on the sidewalks during dusk or early evening", + "sub_caption": "pedestrian: A person near a shop entrance on the left, partially obscured by shadows.. Scene role: Standing near a building entrance on the left side of the street.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "city_buildings", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", + "source_name": "building", + "source_description": "Various city buildings of different heights forming the skyline in the background. Source dataset: CrowdHuman. Scene context: People are walking and resting on the wooden walkway of a large suspension bridge with a city skyline in the background.", + "sub_caption": "building: Various city buildings of different heights forming the urban landscape along the street.. Scene role: Lining the street and forming the architectural background on both sides.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "pink_scooter", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_66/283991,a60c0001c08f63b.jpg:object:3", + "source_name": "pink scooter", + "source_description": "A prominent pink step-through style motor scooter. Source dataset: CrowdHuman. Scene context: A busy city intersection with many people riding scooters and some cars in the background.", + "sub_caption": "pink scooter: A prominent pink step-through style motor scooter.. Scene role: Parked on the right sidewalk near the street signs.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "street_signs", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8", + "source_name": "street signs", + "source_description": "Various street signs attached to a pole on the right side of the street, including a speed limit sign. Source dataset: BDD100K. Scene context: A narrow city street with several parked and moving box trucks and vans, bordered by multi-story buildings and construction barriers.", + "sub_caption": "street signs: Various street signs attached to a metal pole on the right side of the street.. Scene role: Mounted on a pole alongside the road on the right.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "storefront_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", + "source_name": "storefront sign", + "source_description": "A dark sign with white text 'DELI' and a smaller red 'ATM' sign beneath it on the left. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking towards the camera, while emergency vehicles with flashing lights are visible in the background traffic.", + "sub_caption": "storefront sign: A dark, illuminated sign structure above a shop entrance.. Scene role: Hanging above a shop entrance on the left side of the street, illuminating the adjacent pedestrian.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "parked_suv_right", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c54441e6-400c221e:object:4", + "source_name": "parked SUV", + "source_description": "Dark-colored SUV parked ahead of the sedan on the right side of the road. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with green traffic lights and parked cars on the right.", + "sub_caption": "parked SUV: Dark-colored SUV parked on the right side of the road.. Scene role: Parked parallel to the curb on the right side of the street.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "dark_car_left", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", + "source_name": "car", + "source_description": "A dark car parked along the left curb further ahead. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a multi-lane road on a partly cloudy day, with a few other cars and pedestrians on the sidewalk.", + "sub_caption": "car: A dark car parked along the left curb further ahead.. Scene role: Parked alongside the left curb.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "dark_suv_driving", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c889c950-865ca5b6:object:0", + "source_name": "dark SUV", + "source_description": "A dark SUV traveling in the left lane, with visible red taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a highway with several cars visible ahead, illuminated mainly by taillights and headlights.", + "sub_caption": "dark SUV: A dark SUV traveling in the left lane, with visible glowing red taillights.. Scene role: Driving ahead of the camera vehicle in the opposing or adjacent left lane.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2", + "source_name": "street light", + "source_description": "Tall pole with a bright light on top, illuminating the road from the right side. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane highway with streetlights illuminating the dark road ahead.", + "sub_caption": "street light: Tall pole with a bright light on top, illuminating the road.. Scene role: Casting warm light onto the street from the right-hand sidewalk.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "vehicle_dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9", + "source_name": "dashboard", + "source_description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential city street lined with trees, parked cars, and multi-story brick buildings.", + "sub_caption": "dashboard: The dark, reflective dashboard and lower windshield area of the camera vehicle.. Scene role: Occupying the bottom foreground of the image, establishing the perspective from inside the car.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "white_car_ahead", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", + "source_name": "white car", + "source_description": "A white car visible further down the road in the right lane. Source dataset: BDD100K. Scene context: Nighttime driving scene in a city with cars stopped in traffic and a construction site on the right.", + "sub_caption": "white car: A white car visible further down the road.. Scene role: Driving away in the right lane, further in the distance.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "double_yellow_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c417a291-7802692d:object:8", + "source_name": "yellow lines", + "source_description": "Double yellow painted lines separating opposite directions of traffic. Source dataset: BDD100K. Scene context: A pedestrian crossing a street at a crosswalk with several cars stopped around them, with storefronts visible in the background.", + "sub_caption": "yellow lines: Double yellow painted lines separating opposite directions of traffic.. Scene role: Running down the center of the road, receding into the distance.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "street_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c4891df0-24371ae1:object:3", + "source_name": "trees", + "source_description": "Numerous trees with dense green and yellowish foliage lining both sides of the road. Source dataset: BDD100K. Scene context: A dark SUV drives ahead on a wet road lined with trees and a fence under a cloudy sky.", + "sub_caption": "trees: Numerous trees with dense foliage lining both sides of the road.. Scene role: Planted along the sidewalks, softening the urban environment and framing the street.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "twilight_sky", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9", + "source_name": "sky", + "source_description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top. Source dataset: BDD100K. Scene context: View from a moving vehicle looking down a multi-lane city street lined with parked cars and large trees under a clear twilight sky.", + "sub_caption": "sky: Clear twilight sky transitioning from bright near the horizon to dark blue at the top.. Scene role: Providing the atmospheric backdrop above the buildings and street.", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000002/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references.json new file mode 100644 index 0000000000000000000000000000000000000000..ccbc9b3fb1d06e0b0a642b4af04f8b8ceb9f1649 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references.json @@ -0,0 +1,485 @@ +{ + "references": [ + { + "name": "pedestrian_right", + "ref_image": "references/ref_pedestrian_right.png", + "raw_ref_image": "references/raw_ref_pedestrian_right_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_right.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_right_attempt_01.png", + "output": "references/ref_pedestrian_right.png", + "mask": "references/sam_mask_pedestrian_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 345.0, + 55.0, + 678.0, + 982.0 + ], + "mask_score": 3.462354, + "mask_area_ratio": 0.14014, + "elapsed_seconds": 8.2387 + }, + "reference_verify": "references/reference_verify_pedestrian_right.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pedestrian_left", + "ref_image": "references/ref_pedestrian_left.png", + "raw_ref_image": "references/raw_ref_pedestrian_left_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_left.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_left_attempt_01.png", + "output": "references/ref_pedestrian_left.png", + "mask": "references/sam_mask_pedestrian_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 67.0, + 681.0, + 996.0 + ], + "mask_score": 3.481605, + "mask_area_ratio": 0.150858, + "elapsed_seconds": 8.1403 + }, + "reference_verify": "references/reference_verify_pedestrian_left.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "city_buildings", + "ref_image": "references/ref_city_buildings.png", + "raw_ref_image": "references/raw_ref_city_buildings_attempt_01.png", + "diversify_input": "crops/diversify_input_city_buildings.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_city_buildings_attempt_01.png", + "output": "references/ref_city_buildings.png", + "mask": "references/sam_mask_city_buildings.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 27.0, + 178.0, + 996.0, + 865.0 + ], + "mask_score": 3.420089, + "mask_area_ratio": 0.463421, + "elapsed_seconds": 8.2735 + }, + "reference_verify": "references/reference_verify_city_buildings.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pink_scooter", + "ref_image": "references/ref_pink_scooter.png", + "raw_ref_image": "references/raw_ref_pink_scooter_attempt_01.png", + "diversify_input": "crops/diversify_input_pink_scooter.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pink_scooter_attempt_01.png", + "output": "references/ref_pink_scooter.png", + "mask": "references/sam_mask_pink_scooter.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 41.0, + 120.0, + 982.0, + 920.0 + ], + "mask_score": 3.414017, + "mask_area_ratio": 0.259921, + "elapsed_seconds": 8.1841 + }, + "reference_verify": "references/reference_verify_pink_scooter.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "street_signs", + "ref_image": "references/ref_street_signs.png", + "raw_ref_image": "references/raw_ref_street_signs_attempt_01.png", + "diversify_input": "crops/diversify_input_street_signs.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_signs_attempt_01.png", + "output": "references/ref_street_signs.png", + "mask": "references/sam_mask_street_signs.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 226.0, + 71.0, + 707.0, + 967.0 + ], + "mask_score": 3.475593, + "mask_area_ratio": 0.25818, + "elapsed_seconds": 9.8621 + }, + "reference_verify": "references/reference_verify_street_signs.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "storefront_sign", + "ref_image": "references/ref_storefront_sign.png", + "raw_ref_image": "references/raw_ref_storefront_sign_attempt_01.png", + "diversify_input": "crops/diversify_input_storefront_sign.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_storefront_sign_attempt_01.png", + "output": "references/ref_storefront_sign.png", + "mask": "references/sam_mask_storefront_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 107.0, + 274.0, + 900.0, + 749.0 + ], + "mask_score": 3.354337, + "mask_area_ratio": 0.167885, + "elapsed_seconds": 8.1782 + }, + "reference_verify": "references/reference_verify_storefront_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "parked_suv_right", + "ref_image": "references/ref_parked_suv_right.png", + "raw_ref_image": "references/raw_ref_parked_suv_right_attempt_01.png", + "diversify_input": "crops/diversify_input_parked_suv_right.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_suv_right_attempt_01.png", + "output": "references/ref_parked_suv_right.png", + "mask": "references/sam_mask_parked_suv_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 52.0, + 216.0, + 993.0, + 835.0 + ], + "mask_score": 3.459027, + "mask_area_ratio": 0.361156, + "elapsed_seconds": 10.1865 + }, + "reference_verify": "references/reference_verify_parked_suv_right.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "dark_car_left", + "ref_image": "references/ref_dark_car_left.png", + "raw_ref_image": "references/raw_ref_dark_car_left_attempt_01.png", + "diversify_input": "crops/diversify_input_dark_car_left.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_car_left_attempt_01.png", + "output": "references/ref_dark_car_left.png", + "mask": "references/sam_mask_dark_car_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 42.0, + 237.0, + 982.0, + 794.0 + ], + "mask_score": 3.479099, + "mask_area_ratio": 0.30617, + "elapsed_seconds": 8.2274 + }, + "reference_verify": "references/reference_verify_dark_car_left.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "dark_suv_driving", + "ref_image": "references/ref_dark_suv_driving.png", + "raw_ref_image": "references/raw_ref_dark_suv_driving_attempt_01.png", + "diversify_input": "crops/diversify_input_dark_suv_driving.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_suv_driving_attempt_01.png", + "output": "references/ref_dark_suv_driving.png", + "mask": "references/sam_mask_dark_suv_driving.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 96.0, + 294.0, + 928.0, + 812.0 + ], + "mask_score": 3.455576, + "mask_area_ratio": 0.251452, + "elapsed_seconds": 9.8494 + }, + "reference_verify": "references/reference_verify_dark_suv_driving.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "street_light", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "diversify_input": "crops/diversify_input_street_light.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 71.0, + 277.0, + 913.0, + 727.0 + ], + "mask_score": 3.350243, + "mask_area_ratio": 0.068855, + "elapsed_seconds": 8.2963 + }, + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "vehicle_dashboard", + "ref_image": "references/ref_vehicle_dashboard.png", + "raw_ref_image": "references/raw_ref_vehicle_dashboard_attempt_01.png", + "diversify_input": "crops/diversify_input_vehicle_dashboard.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_vehicle_dashboard_attempt_01.png", + "output": "references/ref_vehicle_dashboard.png", + "mask": "references/sam_mask_vehicle_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 24.0, + 317.0, + 1001.0, + 706.0 + ], + "mask_score": 2.942001, + "mask_area_ratio": 0.133658, + "elapsed_seconds": 8.3645 + }, + "reference_verify": "references/reference_verify_vehicle_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "white_car_ahead", + "ref_image": "references/ref_white_car_ahead.png", + "raw_ref_image": "references/raw_ref_white_car_ahead_attempt_01.png", + "diversify_input": "crops/diversify_input_white_car_ahead.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_white_car_ahead_attempt_01.png", + "output": "references/ref_white_car_ahead.png", + "mask": "references/sam_mask_white_car_ahead.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 112.0, + 168.0, + 912.0, + 855.0 + ], + "mask_score": 3.412999, + "mask_area_ratio": 0.338258, + "elapsed_seconds": 8.3339 + }, + "reference_verify": "references/reference_verify_white_car_ahead.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "double_yellow_lines", + "ref_image": "references/ref_double_yellow_lines.png", + "raw_ref_image": "references/raw_ref_double_yellow_lines_attempt_01.png", + "diversify_input": "crops/diversify_input_double_yellow_lines.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_double_yellow_lines_attempt_01.png", + "output": "references/ref_double_yellow_lines.png", + "mask": "references/sam_mask_double_yellow_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 0.0, + 1023.0, + 1023.0 + ], + "mask_score": 2.141169, + "mask_area_ratio": 0.667065, + "elapsed_seconds": 8.2719 + }, + "reference_verify": "references/reference_verify_double_yellow_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "street_trees", + "ref_image": "references/ref_street_trees.png", + "raw_ref_image": "references/raw_ref_street_trees_attempt_01.png", + "diversify_input": "crops/diversify_input_street_trees.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_trees_attempt_01.png", + "output": "references/ref_street_trees.png", + "mask": "references/sam_mask_street_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 65.0, + 64.0, + 958.0, + 969.0 + ], + "mask_score": 3.478968, + "mask_area_ratio": 0.365667, + "elapsed_seconds": 8.231 + }, + "reference_verify": "references/reference_verify_street_trees.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "twilight_sky", + "ref_image": "references/ref_twilight_sky.png", + "raw_ref_image": "references/raw_ref_twilight_sky_attempt_01.png", + "diversify_input": "crops/diversify_input_twilight_sky.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_twilight_sky_attempt_01.png", + "output": "references/ref_twilight_sky.png", + "mask": "references/sam_mask_twilight_sky.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 208.0, + 1023.0, + 814.0 + ], + "mask_score": 2.437955, + "mask_area_ratio": 0.529621, + "elapsed_seconds": 9.8292 + }, + "reference_verify": "references/reference_verify_twilight_sky.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_city_buildings.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_city_buildings.png new file mode 100644 index 0000000000000000000000000000000000000000..780b7e7376ea6d80d0fad562f73d458c6862798f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_city_buildings.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c027e497c165eb4db29987fd70c09938bbced506ec382d9b50ad4da3f885c17 +size 1003879 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_dark_car_left.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_dark_car_left.png new file mode 100644 index 0000000000000000000000000000000000000000..b9ece1feaa0ffffd4c2d540996fad2df5607f1f3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_dark_car_left.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73198a84921cc7c3275e73fc027843ce6192e770953344e0c560b0620fdae51a +size 564765 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_dark_suv_driving.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_dark_suv_driving.png new file mode 100644 index 0000000000000000000000000000000000000000..136f0ef3bb3b239832c159908a0b562b124fcede --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_dark_suv_driving.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8ac8fa7e530c76b7574b819b8291068b64138c1f1f12eef12ad9302bd819528 +size 468906 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_double_yellow_lines.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_double_yellow_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..18f2a6bcfe357c9dde8deb54cf2e335f5cdf5771 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_double_yellow_lines.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6944b03c836bb2840bb6033473a448bb5cb2982037ab9bbe3a0b9dfbb4c03a3 +size 1604938 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_parked_suv_right.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_parked_suv_right.png new file mode 100644 index 0000000000000000000000000000000000000000..8d3422eeb9ea4d7052327b6751e8b93d15c25279 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_parked_suv_right.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed2d9d21cf4e03b170a21dc32864ef892092ec46acf222237155cf15af66ed17 +size 676868 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pedestrian_left.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pedestrian_left.png new file mode 100644 index 0000000000000000000000000000000000000000..aaf956581be6ca3d6c00dc7331c3ba46a73370ee --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pedestrian_left.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68b921f74264a32c1480ec67c81be0a43cf44b65809d0a9f531731868a939098 +size 332592 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pedestrian_right.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pedestrian_right.png new file mode 100644 index 0000000000000000000000000000000000000000..9e291a9161bbbd41f93da498d8bf7b04282f9af3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pedestrian_right.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:958027fbe02a311709e05c8146814a95731301505c75466611eeb6491aeccdf0 +size 255550 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pink_scooter.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pink_scooter.png new file mode 100644 index 0000000000000000000000000000000000000000..234d7939152f95777a6c6dad3d82bdabdcb810d8 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pink_scooter.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:113b044af104bba78c7ef2457d14bdfabe45cc966408676afc65c6c34b0c243f +size 551088 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_storefront_sign.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_storefront_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..380e5ed25d26cb4980a7eaf0d7ebd8130c3dc3b3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_storefront_sign.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c617e1f256ec94e64a58a6ad0272b0bc366257c16ac9c344c26f50e66aceb5d7 +size 350837 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..aa1f86bd3feedae05f1cc9fd789ff36bbf2055d8 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_light.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ca9accb54f925479c94f77239164e1864afb3a5c42087515d00c898b1f4653f +size 137906 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_signs.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_signs.png new file mode 100644 index 0000000000000000000000000000000000000000..78808098bfb1364d4f9ad1066c6349b9f012d866 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_signs.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80f9da64fee2aa367990befd5a364dac57533525e6e4710325a682d5b5377ea7 +size 493047 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_trees.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..bd1900b491104cd51516a312e1e4a3216fd21c04 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_trees.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fbb8bab41e4e2f49d71a88237c0ed17d2572f775112b8dd49f7849a7da1f00a +size 968875 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_twilight_sky.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_twilight_sky.png new file mode 100644 index 0000000000000000000000000000000000000000..4727fa0e3bec21da4b760fa7d3185d456c4a814f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_twilight_sky.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2a5edec51bd82b38816c65e5db7276d47e8fc5217e4d6caafb5d472711ef5ff +size 595141 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_vehicle_dashboard.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_vehicle_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..d49f4198343308587ec995b49c9e24269827f4f7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_vehicle_dashboard.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b60fe50bbfcd91acbd889cf06991cb402793576d67dadfc53633f69c827eb723 +size 263583 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_white_car_ahead.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_white_car_ahead.png new file mode 100644 index 0000000000000000000000000000000000000000..65c0519aae82c9f59b158884bbfb9fe0e5ab4be7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_white_car_ahead.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf837ed52a7128e78cac434d4ea85a25e1f625ca5afcf7aab2e1ab5186ca72c4 +size 544575 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_city_buildings.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_city_buildings.json new file mode 100644 index 0000000000000000000000000000000000000000..7a54aac6b7265b3fcaa1ad5c3e65fb676de792c1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_city_buildings.json @@ -0,0 +1,46 @@ +{ + "name": "city_buildings", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_city_buildings_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_city_buildings_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_city_buildings_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_city_buildings_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_city_buildings_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_city_buildings_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 27.0, + 178.0, + 996.0, + 865.0 + ], + "mask_score": 3.420089, + "mask_area_ratio": 0.463421, + "elapsed_seconds": 8.2735 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a representative crop of city buildings isolated on a white background. As a large environmental feature/scene background element, this representative crop is acceptable." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_dark_car_left.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_dark_car_left.json new file mode 100644 index 0000000000000000000000000000000000000000..8b016f26a0e210628e535c176b55dc9f38230a74 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_dark_car_left.json @@ -0,0 +1,46 @@ +{ + "name": "dark_car_left", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_dark_car_left_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_dark_car_left_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_dark_car_left_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_car_left_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_dark_car_left_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_dark_car_left_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 42.0, + 237.0, + 982.0, + 794.0 + ], + "mask_score": 3.479099, + "mask_area_ratio": 0.30617, + "elapsed_seconds": 8.2274 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete, isolated dark car on a white background, which perfectly meets the criteria." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_dark_suv_driving.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_dark_suv_driving.json new file mode 100644 index 0000000000000000000000000000000000000000..a0cc24405a9224b00d071275c275459ce7d736f9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_dark_suv_driving.json @@ -0,0 +1,46 @@ +{ + "name": "dark_suv_driving", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_dark_suv_driving_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_dark_suv_driving_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_dark_suv_driving_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_suv_driving_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_dark_suv_driving_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_dark_suv_driving_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 96.0, + 294.0, + 928.0, + 812.0 + ], + "mask_score": 3.455576, + "mask_area_ratio": 0.251452, + "elapsed_seconds": 9.8494 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The dark SUV is fully visible, complete, isolated on a white background, and clearly recognizable as the intended subject." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_double_yellow_lines.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_double_yellow_lines.json new file mode 100644 index 0000000000000000000000000000000000000000..76ae2582927208c5101058c3b61a3ad33825231e --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_double_yellow_lines.json @@ -0,0 +1,46 @@ +{ + "name": "double_yellow_lines", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_double_yellow_lines_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_double_yellow_lines_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_double_yellow_lines_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_double_yellow_lines_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_double_yellow_lines_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_double_yellow_lines_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 0.0, + 1023.0, + 1023.0 + ], + "mask_score": 2.141169, + "mask_area_ratio": 0.667065, + "elapsed_seconds": 8.2719 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Representative crop of double yellow lines on asphalt on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_parked_suv_right.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_parked_suv_right.json new file mode 100644 index 0000000000000000000000000000000000000000..744f0ae48ed7230e42071e17fe0e2f838a5b95a8 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_parked_suv_right.json @@ -0,0 +1,46 @@ +{ + "name": "parked_suv_right", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_parked_suv_right_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_parked_suv_right_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_parked_suv_right_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_suv_right_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_parked_suv_right_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_parked_suv_right_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 52.0, + 216.0, + 993.0, + 835.0 + ], + "mask_score": 3.459027, + "mask_area_ratio": 0.361156, + "elapsed_seconds": 10.1865 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a dark-colored SUV isolated on a white background. It is complete, not cropped, and serves as an acceptable reference for the subject." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_pedestrian_left.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_pedestrian_left.json new file mode 100644 index 0000000000000000000000000000000000000000..6c21eda6133cb397c8c2cffeca3c207b27186608 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_pedestrian_left.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_left", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_left_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_left_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_left_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_left_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_pedestrian_left_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_pedestrian_left_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 67.0, + 681.0, + 996.0 + ], + "mask_score": 3.481605, + "mask_area_ratio": 0.150858, + "elapsed_seconds": 8.1403 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image meets all requirements for a person reference image. The full body is visible with no cropping, there is a single main subject, and the background is white." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_pedestrian_right.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_pedestrian_right.json new file mode 100644 index 0000000000000000000000000000000000000000..be2e0a9b7c3abcc4a83a138ac86d047bf8eef193 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_pedestrian_right.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_right", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_right_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_right_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_right_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_right_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_pedestrian_right_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_pedestrian_right_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 345.0, + 55.0, + 678.0, + 982.0 + ], + "mask_score": 3.462354, + "mask_area_ratio": 0.14014, + "elapsed_seconds": 8.2387 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Full body visible with white margin around." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_pink_scooter.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_pink_scooter.json new file mode 100644 index 0000000000000000000000000000000000000000..7f34d4bc7283bf2ff339781f21d742eecbf01b7b --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_pink_scooter.json @@ -0,0 +1,46 @@ +{ + "name": "pink_scooter", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pink_scooter_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pink_scooter_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pink_scooter_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pink_scooter_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_pink_scooter_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_pink_scooter_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 41.0, + 120.0, + 982.0, + 920.0 + ], + "mask_score": 3.414017, + "mask_area_ratio": 0.259921, + "elapsed_seconds": 8.1841 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The pink scooter is completely visible, isolated, and cleanly presented on a white background with no cropping." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_storefront_sign.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_storefront_sign.json new file mode 100644 index 0000000000000000000000000000000000000000..bd049a9291d0f670413366d66722fccba715ea14 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_storefront_sign.json @@ -0,0 +1,46 @@ +{ + "name": "storefront_sign", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_storefront_sign_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_storefront_sign_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_storefront_sign_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_storefront_sign_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_storefront_sign_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_storefront_sign_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 107.0, + 274.0, + 900.0, + 749.0 + ], + "mask_score": 3.354337, + "mask_area_ratio": 0.167885, + "elapsed_seconds": 8.1782 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a storefront sign on a white background. It is complete, not cropped, and is the single main subject." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_street_light.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_street_light.json new file mode 100644 index 0000000000000000000000000000000000000000..9b6e47753cd48861f4ae30a241e1940a269b2f85 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_street_light.json @@ -0,0 +1,46 @@ +{ + "name": "street_light", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_light_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_light_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_light_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_street_light_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_street_light_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 71.0, + 277.0, + 913.0, + 727.0 + ], + "mask_score": 3.350243, + "mask_area_ratio": 0.068855, + "elapsed_seconds": 8.2963 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows the lamp head of the street light. The pole is truncated, which is acceptable for large environmental objects where the head is the primary functional/recognizable feature." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_street_signs.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_street_signs.json new file mode 100644 index 0000000000000000000000000000000000000000..870ef437ed786bfe272c5fe61ee7336f3625e4eb --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_street_signs.json @@ -0,0 +1,46 @@ +{ + "name": "street_signs", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_signs_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_signs_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_signs_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_signs_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_street_signs_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_street_signs_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 226.0, + 71.0, + 707.0, + 967.0 + ], + "mask_score": 3.475593, + "mask_area_ratio": 0.25818, + "elapsed_seconds": 9.8621 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The street sign and pole are fully visible, clearly separated from the background, and meet all requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_street_trees.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_street_trees.json new file mode 100644 index 0000000000000000000000000000000000000000..1b3e10ca6ae231bad6c02e364a6f84a2b5bea5fa --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_street_trees.json @@ -0,0 +1,46 @@ +{ + "name": "street_trees", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_trees_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_trees_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_trees_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_trees_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_street_trees_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_street_trees_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 65.0, + 64.0, + 958.0, + 969.0 + ], + "mask_score": 3.478968, + "mask_area_ratio": 0.365667, + "elapsed_seconds": 8.231 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "A single, isolated tree is shown on a white background, which is an acceptable representative specimen for the requested subject." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_twilight_sky.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_twilight_sky.json new file mode 100644 index 0000000000000000000000000000000000000000..89003bcfee56632c1dc9321ff958ca05cfaa57c0 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_twilight_sky.json @@ -0,0 +1,46 @@ +{ + "name": "twilight_sky", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_twilight_sky_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_twilight_sky_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_twilight_sky_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_twilight_sky_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_twilight_sky_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_twilight_sky_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 208.0, + 1023.0, + 814.0 + ], + "mask_score": 2.437955, + "mask_area_ratio": 0.529621, + "elapsed_seconds": 9.8292 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a representative crop of a twilight sky transitioning from bright near the horizon to darker blue. As a continuous environmental feature, a representative specimen is acceptable." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_vehicle_dashboard.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_vehicle_dashboard.json new file mode 100644 index 0000000000000000000000000000000000000000..4fe4ea3639b39872d69a6fa79b621697e2b6eb40 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_vehicle_dashboard.json @@ -0,0 +1,46 @@ +{ + "name": "vehicle_dashboard", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_vehicle_dashboard_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_vehicle_dashboard_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_vehicle_dashboard_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_vehicle_dashboard_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_vehicle_dashboard_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_vehicle_dashboard_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 24.0, + 317.0, + 1001.0, + 706.0 + ], + "mask_score": 2.942001, + "mask_area_ratio": 0.133658, + "elapsed_seconds": 8.3645 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The main recognizable form of the dashboard is visible and isolated on a white background. There are some floating masking artifacts above the main shape, but the subject remains clearly identifiable and useful as a reference." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_white_car_ahead.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_white_car_ahead.json new file mode 100644 index 0000000000000000000000000000000000000000..c130564270b76f43c0568c73dbaeceabf106775f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/reference_verify_white_car_ahead.json @@ -0,0 +1,46 @@ +{ + "name": "white_car_ahead", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_white_car_ahead_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_white_car_ahead_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_white_car_ahead_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_white_car_ahead_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_ref_white_car_ahead_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/candidate_sam_mask_white_car_ahead_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 112.0, + 168.0, + 912.0, + 855.0 + ], + "mask_score": 3.412999, + "mask_area_ratio": 0.338258, + "elapsed_seconds": 8.3339 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete rear view of a white car on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_city_buildings.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_city_buildings.png new file mode 100644 index 0000000000000000000000000000000000000000..c974f80f0e84797e44ea8b67a38cbd91f2206e9a Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_city_buildings.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_dark_car_left.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_dark_car_left.png new file mode 100644 index 0000000000000000000000000000000000000000..c8584156c25822ff556ab8a2ef0e40c904bb9037 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_dark_car_left.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_dark_suv_driving.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_dark_suv_driving.png new file mode 100644 index 0000000000000000000000000000000000000000..01968ed799103860857846675b4d78f6d997c2f7 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_dark_suv_driving.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_double_yellow_lines.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_double_yellow_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..04c1eb11e284420d2793225be6816409378b1733 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_double_yellow_lines.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_parked_suv_right.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_parked_suv_right.png new file mode 100644 index 0000000000000000000000000000000000000000..31b2de461babd24aabb3e358cbc5227cc521bd28 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_parked_suv_right.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_pedestrian_left.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_pedestrian_left.png new file mode 100644 index 0000000000000000000000000000000000000000..e2b073773da7f51fc8ba0085b3d38933b6729069 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_pedestrian_left.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_pedestrian_right.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_pedestrian_right.png new file mode 100644 index 0000000000000000000000000000000000000000..b5f7c60ce12038344c430475481c6f6da943f823 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_pedestrian_right.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_pink_scooter.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_pink_scooter.png new file mode 100644 index 0000000000000000000000000000000000000000..566c40afc0ae039abad72d95e326a24c0824e494 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_pink_scooter.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_storefront_sign.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_storefront_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..9346d3cc5ea17c675d39bcb184c3f0987ffbb259 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_storefront_sign.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_street_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..efdc3da6a8614535c56669e762ee9b0a250333ed Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_street_signs.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_street_signs.png new file mode 100644 index 0000000000000000000000000000000000000000..ce952f6620731a406dad9251b4244c4049512061 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_street_signs.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_street_trees.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_street_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..3f58627a5a7bf1ea6b7c823213502c184002885f Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_street_trees.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_twilight_sky.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_twilight_sky.png new file mode 100644 index 0000000000000000000000000000000000000000..b523ed79d399b4706e24258d8a6debbf33101928 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_twilight_sky.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_vehicle_dashboard.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_vehicle_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..5bae6f4c4aefe1207f69c9680e3785eeb8dd1d33 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_vehicle_dashboard.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_white_car_ahead.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_white_car_ahead.png new file mode 100644 index 0000000000000000000000000000000000000000..458dd0278d33504f3ad859222c0e490cb7780b85 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/sam_mask_white_car_ahead.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/row.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/row.json new file mode 100644 index 0000000000000000000000000000000000000000..6a67e01ff3fd6dc56330bcfa2185344802267159 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/row.json @@ -0,0 +1,716 @@ +{ + "sample_id": "sample_000002", + "target_total": 15, + "target_people": 2, + "target_objects": 13, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 15, + "n_detected": 15, + "n_subjects": 15, + "subjects": [ + { + "name": "pedestrian_right", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2", + "source_name": "pedestrian", + "source_description": "Person walking away from the camera on the right sidewalk, wearing dark clothing. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a wet city street lined with tall buildings, with pedestrians on the sidewalk and several cars and taxis ahead in the rain.", + "sub_caption": "pedestrian: Person walking away from the camera on the right sidewalk, wearing dark clothing.. Scene role: Walking along the right-hand sidewalk under the trees.", + "measured_bbox": [ + 0.8872, + 0.491, + 0.9451, + 0.6701 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_right.png", + "raw_ref_image": "references/raw_ref_pedestrian_right_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_right.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_right_attempt_01.png", + "output": "references/ref_pedestrian_right.png", + "mask": "references/sam_mask_pedestrian_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 345.0, + 55.0, + 678.0, + 982.0 + ], + "mask_score": 3.462354, + "mask_area_ratio": 0.14014, + "elapsed_seconds": 8.2387 + } + }, + { + "name": "pedestrian_left", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "BDD100K:b714a088-861a043b:person:2", + "source_name": "pedestrian", + "source_description": "another person near the shop entrance on the left, partially obscured Source dataset: BDD100K. Scene context: a city street scene from the perspective of a vehicle, showing multiple cars in motion, buildings lining the road, and pedestrians on the sidewalks during dusk or early evening", + "sub_caption": "pedestrian: A person near a shop entrance on the left, partially obscured by shadows.. Scene role: Standing near a building entrance on the left side of the street.", + "measured_bbox": [ + 0.1301, + 0.5154, + 0.1517, + 0.611 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_left.png", + "raw_ref_image": "references/raw_ref_pedestrian_left_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_left.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_left_attempt_01.png", + "output": "references/ref_pedestrian_left.png", + "mask": "references/sam_mask_pedestrian_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 67.0, + 681.0, + 996.0 + ], + "mask_score": 3.481605, + "mask_area_ratio": 0.150858, + "elapsed_seconds": 8.1403 + } + }, + { + "name": "city_buildings", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", + "source_name": "building", + "source_description": "Various city buildings of different heights forming the skyline in the background. Source dataset: CrowdHuman. Scene context: People are walking and resting on the wooden walkway of a large suspension bridge with a city skyline in the background.", + "sub_caption": "building: Various city buildings of different heights forming the urban landscape along the street.. Scene role: Lining the street and forming the architectural background on both sides.", + "measured_bbox": [ + 0.3358, + 0.3425, + 0.4929, + 0.5277 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_city_buildings.png", + "raw_ref_image": "references/raw_ref_city_buildings_attempt_01.png", + "reference_verify": "references/reference_verify_city_buildings.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_city_buildings_attempt_01.png", + "output": "references/ref_city_buildings.png", + "mask": "references/sam_mask_city_buildings.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 27.0, + 178.0, + 996.0, + 865.0 + ], + "mask_score": 3.420089, + "mask_area_ratio": 0.463421, + "elapsed_seconds": 8.2735 + } + }, + { + "name": "pink_scooter", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_66/283991,a60c0001c08f63b.jpg:object:3", + "source_name": "pink scooter", + "source_description": "A prominent pink step-through style motor scooter. Source dataset: CrowdHuman. Scene context: A busy city intersection with many people riding scooters and some cars in the background.", + "sub_caption": "pink scooter: A prominent pink step-through style motor scooter.. Scene role: Parked on the right sidewalk near the street signs.", + "measured_bbox": [ + 0.677, + 0.56, + 0.7935, + 0.7095 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_pink_scooter.png", + "raw_ref_image": "references/raw_ref_pink_scooter_attempt_01.png", + "reference_verify": "references/reference_verify_pink_scooter.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pink_scooter_attempt_01.png", + "output": "references/ref_pink_scooter.png", + "mask": "references/sam_mask_pink_scooter.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 41.0, + 120.0, + 982.0, + 920.0 + ], + "mask_score": 3.414017, + "mask_area_ratio": 0.259921, + "elapsed_seconds": 8.1841 + } + }, + { + "name": "street_signs", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8", + "source_name": "street signs", + "source_description": "Various street signs attached to a pole on the right side of the street, including a speed limit sign. Source dataset: BDD100K. Scene context: A narrow city street with several parked and moving box trucks and vans, bordered by multi-story buildings and construction barriers.", + "sub_caption": "street signs: Various street signs attached to a metal pole on the right side of the street.. Scene role: Mounted on a pole alongside the road on the right.", + "measured_bbox": [ + 0.8162, + 0.2869, + 0.8575, + 0.4063 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_signs.png", + "raw_ref_image": "references/raw_ref_street_signs_attempt_01.png", + "reference_verify": "references/reference_verify_street_signs.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_signs_attempt_01.png", + "output": "references/ref_street_signs.png", + "mask": "references/sam_mask_street_signs.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 226.0, + 71.0, + 707.0, + 967.0 + ], + "mask_score": 3.475593, + "mask_area_ratio": 0.25818, + "elapsed_seconds": 9.8621 + } + }, + { + "name": "storefront_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", + "source_name": "storefront sign", + "source_description": "A dark sign with white text 'DELI' and a smaller red 'ATM' sign beneath it on the left. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking towards the camera, while emergency vehicles with flashing lights are visible in the background traffic.", + "sub_caption": "storefront sign: A dark, illuminated sign structure above a shop entrance.. Scene role: Hanging above a shop entrance on the left side of the street, illuminating the adjacent pedestrian.", + "measured_bbox": [ + 0.1052, + 0.4218, + 0.186, + 0.4781 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_storefront_sign.png", + "raw_ref_image": "references/raw_ref_storefront_sign_attempt_01.png", + "reference_verify": "references/reference_verify_storefront_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_storefront_sign_attempt_01.png", + "output": "references/ref_storefront_sign.png", + "mask": "references/sam_mask_storefront_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 107.0, + 274.0, + 900.0, + 749.0 + ], + "mask_score": 3.354337, + "mask_area_ratio": 0.167885, + "elapsed_seconds": 8.1782 + } + }, + { + "name": "parked_suv_right", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c54441e6-400c221e:object:4", + "source_name": "parked SUV", + "source_description": "Dark-colored SUV parked ahead of the sedan on the right side of the road. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with green traffic lights and parked cars on the right.", + "sub_caption": "parked SUV: Dark-colored SUV parked on the right side of the road.. Scene role: Parked parallel to the curb on the right side of the street.", + "measured_bbox": [ + 0.6057, + 0.5099, + 0.7451, + 0.6703 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_parked_suv_right.png", + "raw_ref_image": "references/raw_ref_parked_suv_right_attempt_01.png", + "reference_verify": "references/reference_verify_parked_suv_right.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_suv_right_attempt_01.png", + "output": "references/ref_parked_suv_right.png", + "mask": "references/sam_mask_parked_suv_right.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 52.0, + 216.0, + 993.0, + 835.0 + ], + "mask_score": 3.459027, + "mask_area_ratio": 0.361156, + "elapsed_seconds": 10.1865 + } + }, + { + "name": "dark_car_left", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", + "source_name": "car", + "source_description": "A dark car parked along the left curb further ahead. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a multi-lane road on a partly cloudy day, with a few other cars and pedestrians on the sidewalk.", + "sub_caption": "car: A dark car parked along the left curb further ahead.. Scene role: Parked alongside the left curb.", + "measured_bbox": [ + 0.2139, + 0.5323, + 0.3044, + 0.6201 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_car_left.png", + "raw_ref_image": "references/raw_ref_dark_car_left_attempt_01.png", + "reference_verify": "references/reference_verify_dark_car_left.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_car_left_attempt_01.png", + "output": "references/ref_dark_car_left.png", + "mask": "references/sam_mask_dark_car_left.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 42.0, + 237.0, + 982.0, + 794.0 + ], + "mask_score": 3.479099, + "mask_area_ratio": 0.30617, + "elapsed_seconds": 8.2274 + } + }, + { + "name": "dark_suv_driving", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c889c950-865ca5b6:object:0", + "source_name": "dark SUV", + "source_description": "A dark SUV traveling in the left lane, with visible red taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a highway with several cars visible ahead, illuminated mainly by taillights and headlights.", + "sub_caption": "dark SUV: A dark SUV traveling in the left lane, with visible glowing red taillights.. Scene role: Driving ahead of the camera vehicle in the opposing or adjacent left lane.", + "measured_bbox": [ + 0.3005, + 0.5101, + 0.4179, + 0.6508 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_suv_driving.png", + "raw_ref_image": "references/raw_ref_dark_suv_driving_attempt_01.png", + "reference_verify": "references/reference_verify_dark_suv_driving.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_suv_driving_attempt_01.png", + "output": "references/ref_dark_suv_driving.png", + "mask": "references/sam_mask_dark_suv_driving.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 96.0, + 294.0, + 928.0, + 812.0 + ], + "mask_score": 3.455576, + "mask_area_ratio": 0.251452, + "elapsed_seconds": 9.8494 + } + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2", + "source_name": "street light", + "source_description": "Tall pole with a bright light on top, illuminating the road from the right side. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane highway with streetlights illuminating the dark road ahead.", + "sub_caption": "street light: Tall pole with a bright light on top, illuminating the road.. Scene role: Casting warm light onto the street from the right-hand sidewalk.", + "measured_bbox": [ + 0.8171, + 0.1755, + 0.8719, + 0.2202 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 71.0, + 277.0, + 913.0, + 727.0 + ], + "mask_score": 3.350243, + "mask_area_ratio": 0.068855, + "elapsed_seconds": 8.2963 + } + }, + { + "name": "vehicle_dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9", + "source_name": "dashboard", + "source_description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential city street lined with trees, parked cars, and multi-story brick buildings.", + "sub_caption": "dashboard: The dark, reflective dashboard and lower windshield area of the camera vehicle.. Scene role: Occupying the bottom foreground of the image, establishing the perspective from inside the car.", + "measured_bbox": [ + 0.0, + 0.9261, + 1.0, + 1.0 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_vehicle_dashboard.png", + "raw_ref_image": "references/raw_ref_vehicle_dashboard_attempt_01.png", + "reference_verify": "references/reference_verify_vehicle_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_vehicle_dashboard_attempt_01.png", + "output": "references/ref_vehicle_dashboard.png", + "mask": "references/sam_mask_vehicle_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 24.0, + 317.0, + 1001.0, + 706.0 + ], + "mask_score": 2.942001, + "mask_area_ratio": 0.133658, + "elapsed_seconds": 8.3645 + } + }, + { + "name": "white_car_ahead", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", + "source_name": "white car", + "source_description": "A white car visible further down the road in the right lane. Source dataset: BDD100K. Scene context: Nighttime driving scene in a city with cars stopped in traffic and a construction site on the right.", + "sub_caption": "white car: A white car visible further down the road.. Scene role: Driving away in the right lane, further in the distance.", + "measured_bbox": [ + 0.4811, + 0.5382, + 0.5174, + 0.5915 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_car_ahead.png", + "raw_ref_image": "references/raw_ref_white_car_ahead_attempt_01.png", + "reference_verify": "references/reference_verify_white_car_ahead.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_white_car_ahead_attempt_01.png", + "output": "references/ref_white_car_ahead.png", + "mask": "references/sam_mask_white_car_ahead.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 112.0, + 168.0, + 912.0, + 855.0 + ], + "mask_score": 3.412999, + "mask_area_ratio": 0.338258, + "elapsed_seconds": 8.3339 + } + }, + { + "name": "double_yellow_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c417a291-7802692d:object:8", + "source_name": "yellow lines", + "source_description": "Double yellow painted lines separating opposite directions of traffic. Source dataset: BDD100K. Scene context: A pedestrian crossing a street at a crosswalk with several cars stopped around them, with storefronts visible in the background.", + "sub_caption": "yellow lines: Double yellow painted lines separating opposite directions of traffic.. Scene role: Running down the center of the road, receding into the distance.", + "measured_bbox": [ + 0.3008, + 0.5732, + 0.4776, + 0.8029 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_double_yellow_lines.png", + "raw_ref_image": "references/raw_ref_double_yellow_lines_attempt_01.png", + "reference_verify": "references/reference_verify_double_yellow_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_double_yellow_lines_attempt_01.png", + "output": "references/ref_double_yellow_lines.png", + "mask": "references/sam_mask_double_yellow_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 0.0, + 1023.0, + 1023.0 + ], + "mask_score": 2.141169, + "mask_area_ratio": 0.667065, + "elapsed_seconds": 8.2719 + } + }, + { + "name": "street_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c4891df0-24371ae1:object:3", + "source_name": "trees", + "source_description": "Numerous trees with dense green and yellowish foliage lining both sides of the road. Source dataset: BDD100K. Scene context: A dark SUV drives ahead on a wet road lined with trees and a fence under a cloudy sky.", + "sub_caption": "trees: Numerous trees with dense foliage lining both sides of the road.. Scene role: Planted along the sidewalks, softening the urban environment and framing the street.", + "measured_bbox": [ + 0.001, + 0.002, + 0.375, + 0.63 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_trees.png", + "raw_ref_image": "references/raw_ref_street_trees_attempt_01.png", + "reference_verify": "references/reference_verify_street_trees.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_trees_attempt_01.png", + "output": "references/ref_street_trees.png", + "mask": "references/sam_mask_street_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 65.0, + 64.0, + 958.0, + 969.0 + ], + "mask_score": 3.478968, + "mask_area_ratio": 0.365667, + "elapsed_seconds": 8.231 + } + }, + { + "name": "twilight_sky", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9", + "source_name": "sky", + "source_description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top. Source dataset: BDD100K. Scene context: View from a moving vehicle looking down a multi-lane city street lined with parked cars and large trees under a clear twilight sky.", + "sub_caption": "sky: Clear twilight sky transitioning from bright near the horizon to dark blue at the top.. Scene role: Providing the atmospheric backdrop above the buildings and street.", + "measured_bbox": [ + 0.116, + 0.0, + 0.714, + 0.4742 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_twilight_sky.png", + "raw_ref_image": "references/raw_ref_twilight_sky_attempt_01.png", + "reference_verify": "references/reference_verify_twilight_sky.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_twilight_sky_attempt_01.png", + "output": "references/ref_twilight_sky.png", + "mask": "references/sam_mask_twilight_sky.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 208.0, + 1023.0, + 814.0 + ], + "mask_score": 2.437955, + "mask_area_ratio": 0.529621, + "elapsed_seconds": 9.8292 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/vocab_task.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..4fe59f83d71c72750ea6b87f03795437c02e862d --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/vocab_task.json @@ -0,0 +1,224 @@ +{ + "task_id": "sample_000002", + "sample_id": "sample_000002", + "sample_index": 2, + "target_total": 15, + "target_people": 2, + "target_objects": 13, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 200197, + "image_id": "BDD100K:c5d864fa-b0b2380b:person:2", + "name": "pedestrian", + "description": "Person walking away from the camera on the right sidewalk, wearing dark clothing. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a wet city street lined with tall buildings, with pedestrians on the sidewalk and several cars and taxis ahead in the rain." + }, + { + "candidate_index": 1, + "source_offset": 21685, + "image_id": "CrowdHuman:data/data_16/273278,13c1f3000bd2d0bbe.jpg:person:1", + "name": "shopper", + "description": "Woman holding a bright yellow shopping bag, wearing a white top and grey pants, walking alongside a companion. Source dataset: CrowdHuman. Scene context: A bustling city street at night, lined with brightly lit storefronts and large illuminated signs, filled with pedestrians walking and shopping at street vendor stalls." + }, + { + "candidate_index": 2, + "source_offset": 196602, + "image_id": "BDD100K:b714a088-861a043b:person:2", + "name": "pedestrian", + "description": "another person near the shop entrance on the left, partially obscured Source dataset: BDD100K. Scene context: a city street scene from the perspective of a vehicle, showing multiple cars in motion, buildings lining the road, and pedestrians on the sidewalks during dusk or early evening" + }, + { + "candidate_index": 3, + "source_offset": 38044, + "image_id": "CrowdHuman:data/data_21/283554,1b06b00011302479.jpg:person:26", + "name": "person standing", + "description": "Person standing behind the right bench, wearing light clothing. Source dataset: CrowdHuman. Scene context: A group of people standing and sitting on benches near a city street, with trees and large buildings in the background." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 97568, + "image_id": "CrowdHuman:data/data_59/273271,1f2b0000eb60e942.jpg:object:0", + "name": "champagne glass", + "description": "A tall, slender glass holding a light-colored bubbly liquid, held by the woman in the black dress. Source dataset: CrowdHuman. Scene context: A group of five adults at a party, three sitting on a couch and two on a confetti-covered floor, holding up champagne glasses in celebration." + }, + { + "candidate_index": 1, + "source_offset": 132505, + "image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9", + "name": "building", + "description": "Various city buildings of different heights forming the skyline in the background. Source dataset: CrowdHuman. Scene context: People are walking and resting on the wooden walkway of a large suspension bridge with a city skyline in the background." + }, + { + "candidate_index": 2, + "source_offset": 75108, + "image_id": "CrowdHuman:data/data_46/283991,1979e0006e725ed3.jpg:object:11", + "name": "bicycle wheel", + "description": "the front wheel of a bicycle visible amidst the crowd Source dataset: CrowdHuman. Scene context: A crowd of people gathers in a city square for a protest or demonstration, with many holding flags and signs." + }, + { + "candidate_index": 3, + "source_offset": 107106, + "image_id": "CrowdHuman:data/data_63/273278,6f9fc00027f3f324.jpg:object:7", + "name": "bag", + "description": "A crumpled white plastic bag lying on the sidewalk near the edge of the street. Source dataset: CrowdHuman. Scene context: People wait at a covered bus stop beside a street with a bus parked behind them, while a large advertisement sign stands prominently in the foreground." + }, + { + "candidate_index": 4, + "source_offset": 89231, + "image_id": "CrowdHuman:data/data_54/273278,126226000f9ec04e2.jpg:object:3", + "name": "backpack", + "description": "A gray backpack with darker straps and details, worn by the pedestrian in the red jacket. Source dataset: CrowdHuman. Scene context: A bustling tree-lined pedestrian street with people walking and browsing souvenir stalls." + }, + { + "candidate_index": 5, + "source_offset": 113102, + "image_id": "CrowdHuman:data/data_66/283991,a60c0001c08f63b.jpg:object:3", + "name": "pink scooter", + "description": "A prominent pink step-through style motor scooter. Source dataset: CrowdHuman. Scene context: A busy city intersection with many people riding scooters and some cars in the background." + }, + { + "candidate_index": 6, + "source_offset": 151562, + "image_id": "BDD100K:b5ab0e46-8eab4733:object:8", + "name": "street signs", + "description": "Various street signs attached to a pole on the right side of the street, including a speed limit sign. Source dataset: BDD100K. Scene context: A narrow city street with several parked and moving box trucks and vans, bordered by multi-story buildings and construction barriers." + }, + { + "candidate_index": 7, + "source_offset": 60366, + "image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7", + "name": "storefront sign", + "description": "A dark sign with white text 'DELI' and a smaller red 'ATM' sign beneath it on the left. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking towards the camera, while emergency vehicles with flashing lights are visible in the background traffic." + }, + { + "candidate_index": 8, + "source_offset": 84198, + "image_id": "CrowdHuman:data/data_51/273278,131e65000ff71f35b.jpg:object:5", + "name": "mannequin", + "description": "Another mannequin torso displaying white lingerie. Source dataset: CrowdHuman. Scene context: A bustling city street at night, brightly lit with numerous neon signs and storefronts, crowded with pedestrians walking in both directions." + }, + { + "candidate_index": 9, + "source_offset": 41160, + "image_id": "CrowdHuman:data/data_29/283647,1742900042076f47.jpg:object:8", + "name": "car", + "description": "The front portion of a dark-colored car visible in the lower right corner, appearing blurred. Source dataset: CrowdHuman. Scene context: A grand, multi-towered stone building, possibly a cathedral or important civic structure, overlooks a public plaza or square where several pedestrians are walking or congregating around a dark fountain." + }, + { + "candidate_index": 10, + "source_offset": 57282, + "image_id": "CrowdHuman:data/data_38/273278,d172d0001ac1c7d0.jpg:object:7", + "name": "stairs", + "description": "Concrete stairs the students are walking up. Source dataset: CrowdHuman. Scene context: A group of young female students walking up some stairs, some carrying backpacks, folders, or papers, with a textured brown wall in the background." + }, + { + "candidate_index": 11, + "source_offset": 222565, + "image_id": "BDD100K:c54441e6-400c221e:object:4", + "name": "parked SUV", + "description": "Dark-colored SUV parked ahead of the sedan on the right side of the road. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with green traffic lights and parked cars on the right." + }, + { + "candidate_index": 12, + "source_offset": 175127, + "image_id": "BDD100K:bb2e43e4-5e7a7129:object:6", + "name": "car", + "description": "A dark car parked along the left curb further ahead. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a multi-lane road on a partly cloudy day, with a few other cars and pedestrians on the sidewalk." + }, + { + "candidate_index": 13, + "source_offset": 150969, + "image_id": "BDD100K:b58436bb-5790dfd3:object:5", + "name": "overhead sign 2", + "description": "A second rectangular green highway sign with white text and arrows, mounted next to the first sign. Source dataset: BDD100K. Scene context: View from a moving vehicle driving on a bridge at dusk, with other cars and a yellow taxi ahead, and highway signs overhead." + }, + { + "candidate_index": 14, + "source_offset": 238259, + "image_id": "BDD100K:c889c950-865ca5b6:object:0", + "name": "dark SUV", + "description": "A dark SUV traveling in the left lane, with visible red taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a highway with several cars visible ahead, illuminated mainly by taillights and headlights." + }, + { + "candidate_index": 15, + "source_offset": 237241, + "image_id": "BDD100K:c84f848e-2a5e0737:object:2", + "name": "street light", + "description": "Tall pole with a bright light on top, illuminating the road from the right side. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane highway with streetlights illuminating the dark road ahead." + }, + { + "candidate_index": 16, + "source_offset": 148362, + "image_id": "BDD100K:b5032e1d-dad95b60:object:9", + "name": "dashboard", + "description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential city street lined with trees, parked cars, and multi-story brick buildings." + }, + { + "candidate_index": 17, + "source_offset": 147478, + "image_id": "BDD100K:b4d0e72d-3b208072:object:16", + "name": "drainage grate", + "description": "A metal drainage grate on the edge of the road on the right. Source dataset: BDD100K. Scene context: A driving scene on a multi-lane highway with a dark red minivan in the left lane, under a partly cloudy sky." + }, + { + "candidate_index": 18, + "source_offset": 47380, + "image_id": "CrowdHuman:data/data_32/283992,1008000116a704e.jpg:object:0", + "name": "police car", + "description": "The front portion of a white police vehicle on the left, with bright blue flashing lights on its roof. Source dataset: CrowdHuman. Scene context: A tense street scene with a police officer and plainclothes armed men moving urgently through a cordoned-off area, with an ambulance in the background." + }, + { + "candidate_index": 19, + "source_offset": 46976, + "image_id": "CrowdHuman:data/data_32/282555,34e4900001d063c5.jpg:object:1", + "name": "curved canopy structure", + "description": "An arched structural framework with beams crossing diagonally, visible above the fence. Source dataset: CrowdHuman. Scene context: A group of eight people pose for a photo at night on a walkway enclosed by wire mesh fencing under a curved structural canopy." + }, + { + "candidate_index": 20, + "source_offset": 142061, + "image_id": "BDD100K:b3a102ed-6ef54f5e:object:3", + "name": "white car", + "description": "A white car visible further down the road in the right lane. Source dataset: BDD100K. Scene context: Nighttime driving scene in a city with cars stopped in traffic and a construction site on the right." + }, + { + "candidate_index": 21, + "source_offset": 33363, + "image_id": "CrowdHuman:data/data_26/273271,1ee5700005ba3c28.jpg:object:6", + "name": "fan", + "description": "A black oscillating fan standing in the background. Source dataset: CrowdHuman. Scene context: A group of young adults are gathered in a dimly lit room, many of them sitting at computers and appearing to be engaged in a LAN party or gaming event." + }, + { + "candidate_index": 22, + "source_offset": 218458, + "image_id": "BDD100K:c417a291-7802692d:object:8", + "name": "yellow lines", + "description": "Double yellow painted lines separating opposite directions of traffic. Source dataset: BDD100K. Scene context: A pedestrian crossing a street at a crosswalk with several cars stopped around them, with storefronts visible in the background." + }, + { + "candidate_index": 23, + "source_offset": 218847, + "image_id": "BDD100K:c4891df0-24371ae1:object:3", + "name": "trees", + "description": "Numerous trees with dense green and yellowish foliage lining both sides of the road. Source dataset: BDD100K. Scene context: A dark SUV drives ahead on a wet road lined with trees and a fence under a cloudy sky." + }, + { + "candidate_index": 24, + "source_offset": 126497, + "image_id": "CrowdHuman:data/data_72/283647,12bd000f875dc20.jpg:object:0", + "name": "billboard", + "description": "a large, bright sign on a building on the left side of the street Source dataset: CrowdHuman. Scene context: A wide city street is filled with numerous pedestrians walking, with tall buildings adorned with vibrant billboards lining the sides and a prominent green overpass structure in the distance." + }, + { + "candidate_index": 25, + "source_offset": 203204, + "image_id": "BDD100K:c13c0d1f-00dfd075:object:9", + "name": "sky", + "description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top. Source dataset: BDD100K. Scene context: View from a moving vehicle looking down a multi-lane city street lined with parked cars and large trees under a clear twilight sky." + } + ], + "rng_seed": 1782137451, + "created_at": 1782223460.2311263 +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/bbox_overlay.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..89924a3848652abca6a7c813770015219eadf281 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9423132c2840dc214a4a5f458134bc9c2ec84338b6bfdd1dbe0825fc42439e5 +size 1666519 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/compose_prompt.txt b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ae3a8c86530da9d074fe2aaeb45312db99d7fc9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/compose_prompt.txt @@ -0,0 +1,63 @@ +Render the following JSON scene specification as a photorealistic 1248x832 image using a true 3:2 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "A city intersection crosswalk on a busy multi-lane street during the day", + "activity": "A pedestrian wearing dark clothing waits patiently at the edge of the crosswalk while a black sedan and a silver car drive past", + "composition": "Medium-wide street level shot, placing the pedestrian in the foreground right, while the cars traverse the middle ground. Good depth separation between the sidewalk and the street", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "shopper", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", + "source_name": "shopper", + "description": "A person standing and waiting, wearing a dark top and dark pants", + "role_in_scene": "waiting at the crosswalk curb" + } + ], + "objects": [ + { + "name": "black_sedan", + "source_index": 1, + "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7", + "source_name": "black sedan", + "description": "A dark, modern black sedan", + "role_in_scene": "driving in the nearest lane on the street" + }, + { + "name": "silver_car", + "source_index": 3, + "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2", + "source_name": "silver car", + "description": "A sleek silver car", + "role_in_scene": "driving in the adjacent lane slightly ahead of the black sedan" + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/detect_refine_black_sedan.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/detect_refine_black_sedan.png new file mode 100644 index 0000000000000000000000000000000000000000..b05b6a371989eb24d3176896925d4a520a4c8a67 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/detect_refine_black_sedan.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1995ceb8b64841063b9e221a2805df200faf7856a208388b8eb6cdc2c0700e7c +size 194800 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/detect_refine_shopper.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/detect_refine_shopper.png new file mode 100644 index 0000000000000000000000000000000000000000..897e31e0f85a32205ad129329967515f94056c8e Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/detect_refine_shopper.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/detect_refine_silver_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/detect_refine_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..6c1997bb2a23f3ff200272d411a403f68eddea8b --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/detect_refine_silver_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ad1dd9eff2e986a54fef2a16e417e613f5f942f5c751d18d9a981abf1314f20 +size 122978 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_black_sedan.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_black_sedan.png new file mode 100644 index 0000000000000000000000000000000000000000..acce3157634f50f12ddba1e913f5bb1cd4ed00b0 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_black_sedan.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:087835be9dbfa972874597ea435025ac2fd20c7118ae808dbd3a232a4c83950e +size 248640 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_shopper.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_shopper.png new file mode 100644 index 0000000000000000000000000000000000000000..ea55f55407d64778d15d83eee31c5a4e0be17d1a --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_shopper.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb492b83b07fe5e390c98674f107d4dddab5888b30e72166913d13e8f4290171 +size 191445 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_silver_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..e2a71ea28a3d6e61e14c255644e771ebbff0bd63 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_silver_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3916805d8fa1e1e694b76b59c63dc49ddcb171fd27ce174b456bd4b296bbf72f +size 164621 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/detections.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..25ce0067ba8445c4de54be4a8532a5955407f8c8 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/detections.json @@ -0,0 +1,59 @@ +[ + { + "name": "shopper", + "present": true, + "bbox": [ + 0.7364, + 0.2825, + 0.8267, + 0.7222 + ], + "confidence": 100, + "notes": "The person standing at the crosswalk wearing a dark top and pants is prominently visible in the center.", + "coarse_bbox": [ + 0.736, + 0.286, + 0.826, + 0.722 + ], + "refine_crop": "crops/detect_refine_shopper.png" + }, + { + "name": "black_sedan", + "present": true, + "bbox": [ + 0.0883, + 0.2514, + 0.5002, + 0.449 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the prominent black sedan driving in the nearest lane.", + "coarse_bbox": [ + 0.091, + 0.254, + 0.5, + 0.455 + ], + "refine_crop": "crops/detect_refine_black_sedan.png" + }, + { + "name": "silver_car", + "present": true, + "bbox": [ + 0.3669, + 0.2463, + 0.7048, + 0.409 + ], + "confidence": 0.99, + "notes": "The sleek silver car.", + "coarse_bbox": [ + 0.372, + 0.248, + 0.704, + 0.405 + ], + "refine_crop": "crops/detect_refine_silver_car.png" + } +] diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/main_image.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..0ac4a3e5c622feab583f5193c6a761783d49e287 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07f65eb234804a98576a94d150c0a640addc3f86cbf5dd2c8f5fd3507fc1fa99 +size 1695037 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/plan.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..e5e1b8ba4a3b5288fddedb9629be5e9bc29113bd --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/plan.json @@ -0,0 +1,107 @@ +{ + "sample_id": "sample_000003", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "A city intersection crosswalk on a busy multi-lane street during the day", + "activity": "A pedestrian wearing dark clothing waits patiently at the edge of the crosswalk while a black sedan and a silver car drive past", + "composition": "Medium-wide street level shot, placing the pedestrian in the foreground right, while the cars traverse the middle ground. Good depth separation between the sidewalk and the street", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "shopper", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", + "source_name": "shopper", + "description": "A person standing and waiting, wearing a dark top and dark pants", + "role_in_scene": "waiting at the crosswalk curb" + } + ], + "objects": [ + { + "name": "black_sedan", + "source_index": 1, + "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7", + "source_name": "black sedan", + "description": "A dark, modern black sedan", + "role_in_scene": "driving in the nearest lane on the street" + }, + { + "name": "silver_car", + "source_index": 3, + "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2", + "source_name": "silver car", + "description": "A sleek silver car", + "role_in_scene": "driving in the adjacent lane slightly ahead of the black sedan" + } + ] + }, + "expected_subjects": [ + { + "name": "shopper", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing on the top level, wearing a dark top and dark pants. Source dataset: CrowdHuman. Scene context: A multi-level outdoor shopping mall with various people walking and shopping.", + "sub_caption": "shopper: A person standing and waiting, wearing a dark top and dark pants. Scene role: waiting at the crosswalk curb", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "black_sedan", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7", + "source_name": "black sedan", + "source_description": "A black sedan driving in the right lane ahead of the red hatchback. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a multi-lane city street flanked by tall buildings, with various cars moving in the same direction and parked along the side.", + "sub_caption": "black sedan: A dark, modern black sedan. Scene role: driving in the nearest lane on the street", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2", + "source_name": "silver car", + "source_description": "A silver car driving in the adjacent right lane, slightly ahead of the white car. Source dataset: BDD100K. Scene context: View from inside a vehicle driving in heavy traffic on a multi-lane road under a clear sky.", + "sub_caption": "silver car: A sleek silver car. Scene role: driving in the adjacent lane slightly ahead of the black sedan", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000003/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references.json new file mode 100644 index 0000000000000000000000000000000000000000..ffee46630cc2ea96bc79b005fd2ce0385a29941b --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references.json @@ -0,0 +1,101 @@ +{ + "references": [ + { + "name": "shopper", + "ref_image": "references/ref_shopper.png", + "raw_ref_image": "references/raw_ref_shopper_attempt_01.png", + "diversify_input": "crops/diversify_input_shopper.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_shopper_attempt_01.png", + "output": "references/ref_shopper.png", + "mask": "references/sam_mask_shopper.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 348.0, + 80.0, + 678.0, + 995.0 + ], + "mask_score": 3.467753, + "mask_area_ratio": 0.132874, + "elapsed_seconds": 49.4008 + }, + "reference_verify": "references/reference_verify_shopper.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "black_sedan", + "ref_image": "references/ref_black_sedan.png", + "raw_ref_image": "references/raw_ref_black_sedan_attempt_01.png", + "diversify_input": "crops/diversify_input_black_sedan.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_black_sedan_attempt_01.png", + "output": "references/ref_black_sedan.png", + "mask": "references/sam_mask_black_sedan.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 321.0, + 1023.0, + 700.0 + ], + "mask_score": 2.52477, + "mask_area_ratio": 0.559944, + "elapsed_seconds": 8.5091 + }, + "reference_verify": "references/reference_verify_black_sedan.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "silver_car", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "diversify_input": "crops/diversify_input_silver_car.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 369.0, + 1006.0, + 693.0 + ], + "mask_score": 3.457475, + "mask_area_ratio": 0.178123, + "elapsed_seconds": 9.7472 + }, + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_black_sedan.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_black_sedan.png new file mode 100644 index 0000000000000000000000000000000000000000..e3ef5cf75c6ee9a67678f5d5f1fb59ec5106c1a4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_black_sedan.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccf701b924cff916c97b1ec16fde669dda56209da2498f2023f3c4446d2b499f +size 512509 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_shopper.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_shopper.png new file mode 100644 index 0000000000000000000000000000000000000000..d1d8947cbafcab5e5fe4e6330dbbe6a65afabe67 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_shopper.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00acd7374e54db42b9b6a0b1445b2fd347dc528c670e008c8b4349b19a50dcef +size 258947 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_silver_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..e222cf22e6c15b7b60abe4b6428632db9621d60e --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_silver_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd7d88aeda3f22eb06c25d0939a517cfc4f5864ad9cce98a8b46317da475772c +size 341823 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/reference_verify_black_sedan.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/reference_verify_black_sedan.json new file mode 100644 index 0000000000000000000000000000000000000000..755acd71da394923e921b41d49969648364c9b69 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/reference_verify_black_sedan.json @@ -0,0 +1,46 @@ +{ + "name": "black_sedan", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_black_sedan_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_black_sedan_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_black_sedan_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_black_sedan_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/candidate_ref_black_sedan_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/candidate_sam_mask_black_sedan_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 321.0, + 1023.0, + 700.0 + ], + "mask_score": 2.52477, + "mask_area_ratio": 0.559944, + "elapsed_seconds": 8.5091 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete, isolated black sedan on a white background, serving as an excellent reference." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/reference_verify_shopper.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/reference_verify_shopper.json new file mode 100644 index 0000000000000000000000000000000000000000..64172a6092d9bdf92771414ee6430c233026ee44 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/reference_verify_shopper.json @@ -0,0 +1,46 @@ +{ + "name": "shopper", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_shopper_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_shopper_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_shopper_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_shopper_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/candidate_ref_shopper_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/candidate_sam_mask_shopper_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 348.0, + 80.0, + 678.0, + 995.0 + ], + "mask_score": 3.467753, + "mask_area_ratio": 0.132874, + "elapsed_seconds": 49.4008 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a single person wearing a dark top and dark pants on a white background with sufficient margin." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/reference_verify_silver_car.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/reference_verify_silver_car.json new file mode 100644 index 0000000000000000000000000000000000000000..3c4aa4d9f9f5b990292e4e57046b5af862973dec --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/reference_verify_silver_car.json @@ -0,0 +1,46 @@ +{ + "name": "silver_car", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_silver_car_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_silver_car_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_silver_car_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/candidate_ref_silver_car_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/candidate_sam_mask_silver_car_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 369.0, + 1006.0, + 693.0 + ], + "mask_score": 3.457475, + "mask_area_ratio": 0.178123, + "elapsed_seconds": 9.7472 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete, isolated silver car against a white background, which meets all the requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/sam_mask_black_sedan.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/sam_mask_black_sedan.png new file mode 100644 index 0000000000000000000000000000000000000000..cacc487cb9c2bd9184f55625084a4a71ac9d5275 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/sam_mask_black_sedan.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/sam_mask_shopper.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/sam_mask_shopper.png new file mode 100644 index 0000000000000000000000000000000000000000..d307de57ce84a7cadef273429798ced7ca9c71c6 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/sam_mask_shopper.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/sam_mask_silver_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/sam_mask_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..ffaaa1127155b6a7b06bf8bd78d0c389c1b8683a Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/sam_mask_silver_car.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/row.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/row.json new file mode 100644 index 0000000000000000000000000000000000000000..3324bd0ee4b849f963ea0d2908ef069c23619477 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/row.json @@ -0,0 +1,164 @@ +{ + "sample_id": "sample_000003", + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 3, + "n_detected": 3, + "n_subjects": 3, + "subjects": [ + { + "name": "shopper", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing on the top level, wearing a dark top and dark pants. Source dataset: CrowdHuman. Scene context: A multi-level outdoor shopping mall with various people walking and shopping.", + "sub_caption": "shopper: A person standing and waiting, wearing a dark top and dark pants. Scene role: waiting at the crosswalk curb", + "measured_bbox": [ + 0.7364, + 0.2825, + 0.8267, + 0.7222 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_shopper.png", + "raw_ref_image": "references/raw_ref_shopper_attempt_01.png", + "reference_verify": "references/reference_verify_shopper.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_shopper_attempt_01.png", + "output": "references/ref_shopper.png", + "mask": "references/sam_mask_shopper.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 348.0, + 80.0, + 678.0, + 995.0 + ], + "mask_score": 3.467753, + "mask_area_ratio": 0.132874, + "elapsed_seconds": 49.4008 + } + }, + { + "name": "black_sedan", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7", + "source_name": "black sedan", + "source_description": "A black sedan driving in the right lane ahead of the red hatchback. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a multi-lane city street flanked by tall buildings, with various cars moving in the same direction and parked along the side.", + "sub_caption": "black sedan: A dark, modern black sedan. Scene role: driving in the nearest lane on the street", + "measured_bbox": [ + 0.0883, + 0.2514, + 0.5002, + 0.449 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_black_sedan.png", + "raw_ref_image": "references/raw_ref_black_sedan_attempt_01.png", + "reference_verify": "references/reference_verify_black_sedan.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_black_sedan_attempt_01.png", + "output": "references/ref_black_sedan.png", + "mask": "references/sam_mask_black_sedan.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 321.0, + 1023.0, + 700.0 + ], + "mask_score": 2.52477, + "mask_area_ratio": 0.559944, + "elapsed_seconds": 8.5091 + } + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2", + "source_name": "silver car", + "source_description": "A silver car driving in the adjacent right lane, slightly ahead of the white car. Source dataset: BDD100K. Scene context: View from inside a vehicle driving in heavy traffic on a multi-lane road under a clear sky.", + "sub_caption": "silver car: A sleek silver car. Scene role: driving in the adjacent lane slightly ahead of the black sedan", + "measured_bbox": [ + 0.3669, + 0.2463, + 0.7048, + 0.409 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 369.0, + 1006.0, + 693.0 + ], + "mask_score": 3.457475, + "mask_area_ratio": 0.178123, + "elapsed_seconds": 9.7472 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/vocab_task.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..02b6561de918da038a2c8786a72b9d0cc0961452 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/vocab_task.json @@ -0,0 +1,56 @@ +{ + "task_id": "sample_000003", + "sample_id": "sample_000003", + "sample_index": 3, + "target_total": 3, + "target_people": 1, + "target_objects": 2, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 11490, + "image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13", + "name": "shopper", + "description": "A person standing on the top level, wearing a dark top and dark pants. Source dataset: CrowdHuman. Scene context: A multi-level outdoor shopping mall with various people walking and shopping." + }, + { + "candidate_index": 1, + "source_offset": 10082, + "image_id": "CrowdHuman:data/data_12/273275,a967f0002c2986ab.jpg:person:10", + "name": "uniformed attendee", + "description": "Person in a dark military-style uniform with a beret, standing near the seated man. Source dataset: CrowdHuman. Scene context: A group of people, including flag bearers in uniform, are gathered outdoors on a dirt path next to a stone building and trees for a ceremony." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 69825, + "image_id": "CrowdHuman:data/data_43/273278,b8637000bf6cd3e6.jpg:object:3", + "name": "sunglasses", + "description": "Dark-lensed sunglasses worn by a fan in the front right. Source dataset: CrowdHuman. Scene context: A large crowd of enthusiastic fans in a stadium bleacher section, many wearing matching blue apparel and holding signs, cheering animatedly." + }, + { + "candidate_index": 1, + "source_offset": 202163, + "image_id": "BDD100K:c0c183ff-1b24f541:object:7", + "name": "black sedan", + "description": "A black sedan driving in the right lane ahead of the red hatchback. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a multi-lane city street flanked by tall buildings, with various cars moving in the same direction and parked along the side." + }, + { + "candidate_index": 2, + "source_offset": 190614, + "image_id": "BDD100K:be6b4502-e0c95034:object:4", + "name": "distant vehicles", + "description": "Several indistinct vehicles with headlights and taillights visible further down the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle approaching an intersection with a crosswalk and green traffic lights, with several cars ahead." + }, + { + "candidate_index": 3, + "source_offset": 190043, + "image_id": "BDD100K:be49ae7a-1ffaa683:object:2", + "name": "silver car", + "description": "A silver car driving in the adjacent right lane, slightly ahead of the white car. Source dataset: BDD100K. Scene context: View from inside a vehicle driving in heavy traffic on a multi-lane road under a clear sky." + } + ], + "rng_seed": 1782242180, + "created_at": 1782223460.2559462 +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/bbox_overlay.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..f9f65d32a28090ca706f84d35e0905db60dc634a --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dfa8285d93461cc4eba64cd70a07a923781ce5abce988c01b2b16ee37804bb0 +size 1582207 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/compose_prompt.txt b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf4f5b4ede45c89e84ff1095cb2b71edfd996042 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/compose_prompt.txt @@ -0,0 +1,79 @@ +Render the following JSON scene specification as a photorealistic 1248x832 image using a true 3:2 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "An urban city street intersection during daytime from a driver's perspective.", + "activity": "A man is walking near a crosswalk at an intersection with a red traffic light, while a delivery truck and a dark car sit parked along the side of the road.", + "composition": "The camera is positioned from a vehicle driver's eye level looking forward down the street. Bright, natural daylight. The street lines lead the viewer's eye toward the crosswalk and the suspended red traffic light. The dark car and large white box truck are parked along the right curb, providing depth. The pedestrian is positioned near the corner of the crosswalk.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_walker", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", + "source_name": "walker", + "description": "A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.", + "role_in_scene": "Walking near the crosswalk on the side of the street." + } + ], + "objects": [ + { + "name": "red_traffic_light", + "source_index": 0, + "source_image_id": "BDD100K:b6df605f-51c158b8:object:6", + "source_name": "traffic light", + "description": "A traffic signal suspended over the intersection, illuminated with a bright red light.", + "role_in_scene": "Hanging high above the center of the intersection in the driver's line of sight." + }, + { + "name": "plain_delivery_truck", + "source_index": 2, + "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", + "source_name": "delivery truck", + "description": "A large, plain white box truck without any visible markings or graphics.", + "role_in_scene": "Parked alongside the right edge of the street curb." + }, + { + "name": "dark_parked_car", + "source_index": 3, + "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5", + "source_name": "dark parked car", + "description": "A dark-colored passenger vehicle.", + "role_in_scene": "Parked parallel to the curb directly behind the delivery truck." + }, + { + "name": "street_lines", + "source_index": 4, + "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", + "source_name": "street lines", + "description": "Double yellow center lines separating traffic directions, and solid white painted lines forming a distinct crosswalk.", + "role_in_scene": "Painted on the asphalt, guiding traffic and defining the pedestrian crossing area in the foreground." + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_dark_parked_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_dark_parked_car.png new file mode 100644 index 0000000000000000000000000000000000000000..34354b177dd4fa1858a9ce7a4527180fed7c1552 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_dark_parked_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57390ea29c6003d9c68c08d15f7f8195a45634244f7edcbbaf46f9af0e8bb4cb +size 115619 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_pedestrian_walker.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_pedestrian_walker.png new file mode 100644 index 0000000000000000000000000000000000000000..c0347a76d9480a5a879be89f0a32821152d3cc5a Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_pedestrian_walker.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_plain_delivery_truck.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_plain_delivery_truck.png new file mode 100644 index 0000000000000000000000000000000000000000..8d6d82b8cfd7695fa0779113daa3111a1deed10b --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_plain_delivery_truck.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a8239b949ce31a3a0917a578041463e616ea31b6f2a77b2a3ca4be1963fe2b +size 303974 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_red_traffic_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_red_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..c5322eddf61b0ceb08c5bf511935ec76c6667962 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_red_traffic_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_street_lines.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_street_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..755420ad05253b5c1dfbd1697e71519c4759f974 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_street_lines.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46afd3406b0b5c52e36fd701d6d7b4a6d4523a6e3ca8cfc40e6cb812c338e2b0 +size 736732 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_dark_parked_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_dark_parked_car.png new file mode 100644 index 0000000000000000000000000000000000000000..832fea744d6dfff54766c4d800fd374a51739d26 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_dark_parked_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f587dd47852a1f223356b5730f0c8302c0aeb1ab6fc182293b917aefffefef6b +size 135843 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_pedestrian_walker.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_pedestrian_walker.png new file mode 100644 index 0000000000000000000000000000000000000000..965ec53119bc3aac18e3c3b4004df3ce5a4f99b1 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_pedestrian_walker.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_plain_delivery_truck.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_plain_delivery_truck.png new file mode 100644 index 0000000000000000000000000000000000000000..ef72a2ca6058baf7dee74366db04740fe2acb35e --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_plain_delivery_truck.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a14ef0cabd2e6c945db9908dbf319ebb18fa1d775e8ae4a52381ed92e4d84b6 +size 405300 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_red_traffic_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_red_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..5c4208c3ba9eef9239bfbbecd7e8b345cad4598d Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_red_traffic_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_street_lines.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_street_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..7713d3114a799013ba1d582e24d6d03f04884801 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_street_lines.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14c439726a7b13919141da03a3e68a98f95f2a9646d1ffb767fc6d63334e92d8 +size 828726 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/detections.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..8d716a608a48950285efbd0e8dbcf6d35a76e3a7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/detections.json @@ -0,0 +1,97 @@ +[ + { + "name": "pedestrian_walker", + "present": true, + "bbox": [ + 0.5948, + 0.3939, + 0.6378, + 0.5698 + ], + "confidence": 0.98, + "notes": "Walker matches description: man walking, blue hoodie, grey pants, dark shoes.", + "coarse_bbox": [ + 0.594, + 0.391, + 0.638, + 0.57 + ], + "refine_crop": "crops/detect_refine_pedestrian_walker.png" + }, + { + "name": "red_traffic_light", + "present": true, + "bbox": [ + 0.4668, + 0.0722, + 0.5093, + 0.1896 + ], + "confidence": 0.95, + "notes": "Bounded the entire traffic light assembly including the mounting hardware and visors.", + "coarse_bbox": [ + 0.466, + 0.072, + 0.508, + 0.188 + ], + "refine_crop": "crops/detect_refine_red_traffic_light.png" + }, + { + "name": "plain_delivery_truck", + "present": true, + "bbox": [ + 0.6504, + 0.2022, + 0.966, + 0.6212 + ], + "confidence": 0.99, + "notes": "The plain white delivery truck is prominently visible in the center of the image.", + "coarse_bbox": [ + 0.668, + 0.203, + 0.968, + 0.62 + ], + "refine_crop": "crops/detect_refine_plain_delivery_truck.png" + }, + { + "name": "dark_parked_car", + "present": true, + "bbox": [ + 0.8339, + 0.4566, + 0.9965, + 0.7781 + ], + "confidence": 0.98, + "notes": "A dark-colored passenger vehicle is visible parked behind the delivery truck.", + "coarse_bbox": [ + 0.833, + 0.459, + 0.996, + 0.774 + ], + "refine_crop": "crops/detect_refine_dark_parked_car.png" + }, + { + "name": "street_lines", + "present": true, + "bbox": [ + 0.003, + 0.432, + 0.971, + 0.794 + ], + "confidence": 0.95, + "notes": "refine failed; using coarse bbox", + "coarse_bbox": [ + 0.003, + 0.432, + 0.971, + 0.794 + ], + "refine_crop": "crops/detect_refine_street_lines.png" + } +] diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/main_image.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..5935f6251eed500679349f5c8d2819a0cdbde049 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a664cdc836cc3ebbc59524c967b1a76df39be8027a9867dfba55c287c9505e0 +size 1652896 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/plan.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..609cca39065cb744162a5b159958d36c5da61e4c --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/plan.json @@ -0,0 +1,145 @@ +{ + "sample_id": "sample_000004", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "An urban city street intersection during daytime from a driver's perspective.", + "activity": "A man is walking near a crosswalk at an intersection with a red traffic light, while a delivery truck and a dark car sit parked along the side of the road.", + "composition": "The camera is positioned from a vehicle driver's eye level looking forward down the street. Bright, natural daylight. The street lines lead the viewer's eye toward the crosswalk and the suspended red traffic light. The dark car and large white box truck are parked along the right curb, providing depth. The pedestrian is positioned near the corner of the crosswalk.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_walker", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", + "source_name": "walker", + "description": "A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.", + "role_in_scene": "Walking near the crosswalk on the side of the street." + } + ], + "objects": [ + { + "name": "red_traffic_light", + "source_index": 0, + "source_image_id": "BDD100K:b6df605f-51c158b8:object:6", + "source_name": "traffic light", + "description": "A traffic signal suspended over the intersection, illuminated with a bright red light.", + "role_in_scene": "Hanging high above the center of the intersection in the driver's line of sight." + }, + { + "name": "plain_delivery_truck", + "source_index": 2, + "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", + "source_name": "delivery truck", + "description": "A large, plain white box truck without any visible markings or graphics.", + "role_in_scene": "Parked alongside the right edge of the street curb." + }, + { + "name": "dark_parked_car", + "source_index": 3, + "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5", + "source_name": "dark parked car", + "description": "A dark-colored passenger vehicle.", + "role_in_scene": "Parked parallel to the curb directly behind the delivery truck." + }, + { + "name": "street_lines", + "source_index": 4, + "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", + "source_name": "street lines", + "description": "Double yellow center lines separating traffic directions, and solid white painted lines forming a distinct crosswalk.", + "role_in_scene": "Painted on the asphalt, guiding traffic and defining the pedestrian crossing area in the foreground." + } + ] + }, + "expected_subjects": [ + { + "name": "pedestrian_walker", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", + "source_name": "walker", + "source_description": "A man walking, wearing a blue hoodie with 'EMO' written on it, grey pants, and dark shoes. Source dataset: CrowdHuman. Scene context: A group of people, some wearing matching white t-shirts, are walking and jogging along a paved path next to a road, with a grey SUV parked on a grassy hill in the background.", + "sub_caption": "walker: A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.. Scene role: Walking near the crosswalk on the side of the street.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "red_traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b6df605f-51c158b8:object:6", + "source_name": "traffic light", + "source_description": "A set of traffic lights suspended over the intersection, showing a red light. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a city street on a sunny day with a U-Haul truck in the opposite lane, parked cars along the right curb, and pedestrians crossing a crosswalk.", + "sub_caption": "traffic light: A traffic signal suspended over the intersection, illuminated with a bright red light.. Scene role: Hanging high above the center of the intersection in the driver's line of sight.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "plain_delivery_truck", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", + "source_name": "delivery truck", + "source_description": "A large white box truck parked behind the fence on the right, with visible green and purple logos. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a wide, paved road flanked by an industrial area with fences and delivery trucks under a cloudy sky.", + "sub_caption": "delivery truck: A large, plain white box truck without any visible markings or graphics.. Scene role: Parked alongside the right edge of the street curb.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "dark_parked_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5", + "source_name": "dark parked car", + "source_description": "Another dark-colored car parked on the right side of the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving down a multi-lane city street with parked cars on the right and oncoming traffic on the left.", + "sub_caption": "dark parked car: A dark-colored passenger vehicle.. Scene role: Parked parallel to the curb directly behind the delivery truck.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "street_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", + "source_name": "street lines", + "source_description": "Double yellow lines separating traffic directions and white painted lines indicating lanes and crosswalks. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching an intersection underneath an elevated railway structure.", + "sub_caption": "street lines: Double yellow center lines separating traffic directions, and solid white painted lines forming a distinct crosswalk.. Scene role: Painted on the asphalt, guiding traffic and defining the pedestrian crossing area in the foreground.", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000004/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references.json new file mode 100644 index 0000000000000000000000000000000000000000..e6e13f955ef2256343fbac7b8fc99e0be8f5db1d --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references.json @@ -0,0 +1,165 @@ +{ + "references": [ + { + "name": "pedestrian_walker", + "ref_image": "references/ref_pedestrian_walker.png", + "raw_ref_image": "references/raw_ref_pedestrian_walker_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_walker.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_pedestrian_walker_attempt_01.png", + "output": "references/ref_pedestrian_walker.png", + "mask": "references/sam_mask_pedestrian_walker.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 334.0, + 56.0, + 706.0, + 996.0 + ], + "mask_score": 3.43302, + "mask_area_ratio": 0.160827, + "elapsed_seconds": 9.8914 + }, + "reference_verify": "references/reference_verify_pedestrian_walker.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "red_traffic_light", + "ref_image": "references/ref_red_traffic_light.png", + "raw_ref_image": "references/raw_ref_red_traffic_light_attempt_01.png", + "diversify_input": "crops/diversify_input_red_traffic_light.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_red_traffic_light_attempt_01.png", + "output": "references/ref_red_traffic_light.png", + "mask": "references/sam_mask_red_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 286.0, + 103.0, + 1023.0, + 893.0 + ], + "mask_score": 3.25218, + "mask_area_ratio": 0.200515, + "elapsed_seconds": 8.1927 + }, + "reference_verify": "references/reference_verify_red_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "plain_delivery_truck", + "ref_image": "references/ref_plain_delivery_truck.png", + "raw_ref_image": "references/raw_ref_plain_delivery_truck_attempt_01.png", + "diversify_input": "crops/diversify_input_plain_delivery_truck.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_plain_delivery_truck_attempt_01.png", + "output": "references/ref_plain_delivery_truck.png", + "mask": "references/sam_mask_plain_delivery_truck.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 9.0, + 166.0, + 1017.0, + 852.0 + ], + "mask_score": 3.45107, + "mask_area_ratio": 0.437578, + "elapsed_seconds": 10.0386 + }, + "reference_verify": "references/reference_verify_plain_delivery_truck.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "dark_parked_car", + "ref_image": "references/ref_dark_parked_car.png", + "raw_ref_image": "references/raw_ref_dark_parked_car_attempt_01.png", + "diversify_input": "crops/diversify_input_dark_parked_car.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_dark_parked_car_attempt_01.png", + "output": "references/ref_dark_parked_car.png", + "mask": "references/sam_mask_dark_parked_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 301.0, + 1023.0, + 694.0 + ], + "mask_score": 3.113868, + "mask_area_ratio": 0.207836, + "elapsed_seconds": 8.5697 + }, + "reference_verify": "references/reference_verify_dark_parked_car.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "street_lines", + "ref_image": "references/ref_street_lines.png", + "raw_ref_image": "references/raw_ref_street_lines_attempt_01.png", + "diversify_input": "crops/diversify_input_street_lines.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_street_lines_attempt_01.png", + "output": "references/ref_street_lines.png", + "mask": "references/sam_mask_street_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 384.0, + 98.0, + 639.0, + 925.0 + ], + "mask_score": 3.44596, + "mask_area_ratio": 0.067441, + "elapsed_seconds": 8.1646 + }, + "reference_verify": "references/reference_verify_street_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_dark_parked_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_dark_parked_car.png new file mode 100644 index 0000000000000000000000000000000000000000..2c8f053fc306775c4106e0821298e15cb311086b --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_dark_parked_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0949771600749c094297b5eedaa5e7aab382b6a87ffd51ee9cb71d4fc7ed9764 +size 386517 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_pedestrian_walker.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_pedestrian_walker.png new file mode 100644 index 0000000000000000000000000000000000000000..88c7a4fd27419a74c8a35ba9a62d83f8b2a38044 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_pedestrian_walker.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84f7cb7d8235ac158501e74a1a42dd8d75e235bce5a4b844beb773e72173aaa5 +size 316951 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_plain_delivery_truck.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_plain_delivery_truck.png new file mode 100644 index 0000000000000000000000000000000000000000..a4048b7a3b07b090471e394ef21cf7e789fe6150 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_plain_delivery_truck.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5557d81d4ea0552fdd91d421c93eb4ac5a6aa3bdd7a307ec284cfd6e75675a69 +size 678110 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_red_traffic_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_red_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..c00621f36af57884d465e1928bc84f697a94dfec --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_red_traffic_light.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2582f76566ca4e88601314a654c5b21d5960ec7ec608143fc1fab50a737fb5fa +size 387238 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_street_lines.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_street_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..b47e1dc2d77052bb57bee4d563d8355be835ec47 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_street_lines.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2dfa3a9b564ec301be6ea9d1ba978e9a251ce07ccf643e909e12bcfe9abfd21 +size 169083 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_dark_parked_car.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_dark_parked_car.json new file mode 100644 index 0000000000000000000000000000000000000000..a66b863d51f101a949b2538c7f42bd1727f2ccb3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_dark_parked_car.json @@ -0,0 +1,46 @@ +{ + "name": "dark_parked_car", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_dark_parked_car_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_dark_parked_car_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_dark_parked_car_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_dark_parked_car_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_ref_dark_parked_car_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_sam_mask_dark_parked_car_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 301.0, + 1023.0, + 694.0 + ], + "mask_score": 3.113868, + "mask_area_ratio": 0.207836, + "elapsed_seconds": 8.5697 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The dark passenger vehicle is fully visible, completely isolated on a white background, and satisfies all requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_pedestrian_walker.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_pedestrian_walker.json new file mode 100644 index 0000000000000000000000000000000000000000..efbe5c8288f5cf129c5046fad1d9a4e9c12601b0 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_pedestrian_walker.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_walker", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_walker_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_walker_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_walker_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_pedestrian_walker_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_ref_pedestrian_walker_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_sam_mask_pedestrian_walker_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 334.0, + 56.0, + 706.0, + 996.0 + ], + "mask_score": 3.43302, + "mask_area_ratio": 0.160827, + "elapsed_seconds": 9.8914 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows the full body of the person isolated on a white background with sufficient margin and no parts cropped." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_plain_delivery_truck.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_plain_delivery_truck.json new file mode 100644 index 0000000000000000000000000000000000000000..de9b0bc69856d3347c184a31aacf3efdbcd070c9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_plain_delivery_truck.json @@ -0,0 +1,46 @@ +{ + "name": "plain_delivery_truck", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_plain_delivery_truck_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_plain_delivery_truck_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_plain_delivery_truck_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_plain_delivery_truck_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_ref_plain_delivery_truck_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_sam_mask_plain_delivery_truck_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 9.0, + 166.0, + 1017.0, + 852.0 + ], + "mask_score": 3.45107, + "mask_area_ratio": 0.437578, + "elapsed_seconds": 10.0386 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a plain white delivery truck with no visible markings or graphics on a white background. The truck is fully visible and not cropped." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_red_traffic_light.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_red_traffic_light.json new file mode 100644 index 0000000000000000000000000000000000000000..caff0aa770e858654c166178a611875061138540 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_red_traffic_light.json @@ -0,0 +1,46 @@ +{ + "name": "red_traffic_light", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_red_traffic_light_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_red_traffic_light_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_red_traffic_light_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_red_traffic_light_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_ref_red_traffic_light_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_sam_mask_red_traffic_light_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 286.0, + 103.0, + 1023.0, + 893.0 + ], + "mask_score": 3.25218, + "mask_area_ratio": 0.200515, + "elapsed_seconds": 8.1927 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The traffic light is fully visible and isolated on a white background. The mounting arm extends out of frame, but the main subject is intact." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_street_lines.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_street_lines.json new file mode 100644 index 0000000000000000000000000000000000000000..f6da0cfe6db321762aac7d4fdc2dc8a04caa0e09 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/reference_verify_street_lines.json @@ -0,0 +1,46 @@ +{ + "name": "street_lines", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_lines_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_lines_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_lines_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_street_lines_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_ref_street_lines_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/candidate_sam_mask_street_lines_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 384.0, + 98.0, + 639.0, + 925.0 + ], + "mask_score": 3.44596, + "mask_area_ratio": 0.067441, + "elapsed_seconds": 8.1646 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Shows double yellow street lines isolated on a white background. Missing the white crosswalk lines mentioned in the caption, but acceptable as a representative crop of the continuous subject." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_dark_parked_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_dark_parked_car.png new file mode 100644 index 0000000000000000000000000000000000000000..2872de93dbb1cc5b5533960910e62ebc38408232 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_dark_parked_car.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_pedestrian_walker.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_pedestrian_walker.png new file mode 100644 index 0000000000000000000000000000000000000000..1f188bbb58c352f6cb2499028d65f96dbc36e9d2 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_pedestrian_walker.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_plain_delivery_truck.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_plain_delivery_truck.png new file mode 100644 index 0000000000000000000000000000000000000000..5829042d6f8c379df9a6853f3718651a678259ad Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_plain_delivery_truck.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_red_traffic_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_red_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..88ee209eef6a9643d6b557f4244d53b4f06ddf96 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_red_traffic_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_street_lines.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_street_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..c31948ab970ccc492ec31303e14baf7bbc4fa70f Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/sam_mask_street_lines.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/row.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/row.json new file mode 100644 index 0000000000000000000000000000000000000000..e1a3d24315d3196b0908cdbb4bc5a2418ad63ab1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/row.json @@ -0,0 +1,256 @@ +{ + "sample_id": "sample_000004", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 5, + "n_detected": 5, + "n_subjects": 5, + "subjects": [ + { + "name": "pedestrian_walker", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", + "source_name": "walker", + "source_description": "A man walking, wearing a blue hoodie with 'EMO' written on it, grey pants, and dark shoes. Source dataset: CrowdHuman. Scene context: A group of people, some wearing matching white t-shirts, are walking and jogging along a paved path next to a road, with a grey SUV parked on a grassy hill in the background.", + "sub_caption": "walker: A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.. Scene role: Walking near the crosswalk on the side of the street.", + "measured_bbox": [ + 0.5948, + 0.3939, + 0.6378, + 0.5698 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walker.png", + "raw_ref_image": "references/raw_ref_pedestrian_walker_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walker.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_pedestrian_walker_attempt_01.png", + "output": "references/ref_pedestrian_walker.png", + "mask": "references/sam_mask_pedestrian_walker.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 334.0, + 56.0, + 706.0, + 996.0 + ], + "mask_score": 3.43302, + "mask_area_ratio": 0.160827, + "elapsed_seconds": 9.8914 + } + }, + { + "name": "red_traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b6df605f-51c158b8:object:6", + "source_name": "traffic light", + "source_description": "A set of traffic lights suspended over the intersection, showing a red light. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a city street on a sunny day with a U-Haul truck in the opposite lane, parked cars along the right curb, and pedestrians crossing a crosswalk.", + "sub_caption": "traffic light: A traffic signal suspended over the intersection, illuminated with a bright red light.. Scene role: Hanging high above the center of the intersection in the driver's line of sight.", + "measured_bbox": [ + 0.4668, + 0.0722, + 0.5093, + 0.1896 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_red_traffic_light.png", + "raw_ref_image": "references/raw_ref_red_traffic_light_attempt_01.png", + "reference_verify": "references/reference_verify_red_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_red_traffic_light_attempt_01.png", + "output": "references/ref_red_traffic_light.png", + "mask": "references/sam_mask_red_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 286.0, + 103.0, + 1023.0, + 893.0 + ], + "mask_score": 3.25218, + "mask_area_ratio": 0.200515, + "elapsed_seconds": 8.1927 + } + }, + { + "name": "plain_delivery_truck", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", + "source_name": "delivery truck", + "source_description": "A large white box truck parked behind the fence on the right, with visible green and purple logos. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a wide, paved road flanked by an industrial area with fences and delivery trucks under a cloudy sky.", + "sub_caption": "delivery truck: A large, plain white box truck without any visible markings or graphics.. Scene role: Parked alongside the right edge of the street curb.", + "measured_bbox": [ + 0.6504, + 0.2022, + 0.966, + 0.6212 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_plain_delivery_truck.png", + "raw_ref_image": "references/raw_ref_plain_delivery_truck_attempt_01.png", + "reference_verify": "references/reference_verify_plain_delivery_truck.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_plain_delivery_truck_attempt_01.png", + "output": "references/ref_plain_delivery_truck.png", + "mask": "references/sam_mask_plain_delivery_truck.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 9.0, + 166.0, + 1017.0, + 852.0 + ], + "mask_score": 3.45107, + "mask_area_ratio": 0.437578, + "elapsed_seconds": 10.0386 + } + }, + { + "name": "dark_parked_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5", + "source_name": "dark parked car", + "source_description": "Another dark-colored car parked on the right side of the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving down a multi-lane city street with parked cars on the right and oncoming traffic on the left.", + "sub_caption": "dark parked car: A dark-colored passenger vehicle.. Scene role: Parked parallel to the curb directly behind the delivery truck.", + "measured_bbox": [ + 0.8339, + 0.4566, + 0.9965, + 0.7781 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dark_parked_car.png", + "raw_ref_image": "references/raw_ref_dark_parked_car_attempt_01.png", + "reference_verify": "references/reference_verify_dark_parked_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_dark_parked_car_attempt_01.png", + "output": "references/ref_dark_parked_car.png", + "mask": "references/sam_mask_dark_parked_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 301.0, + 1023.0, + 694.0 + ], + "mask_score": 3.113868, + "mask_area_ratio": 0.207836, + "elapsed_seconds": 8.5697 + } + }, + { + "name": "street_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", + "source_name": "street lines", + "source_description": "Double yellow lines separating traffic directions and white painted lines indicating lanes and crosswalks. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching an intersection underneath an elevated railway structure.", + "sub_caption": "street lines: Double yellow center lines separating traffic directions, and solid white painted lines forming a distinct crosswalk.. Scene role: Painted on the asphalt, guiding traffic and defining the pedestrian crossing area in the foreground.", + "measured_bbox": [ + 0.003, + 0.432, + 0.971, + 0.794 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_lines.png", + "raw_ref_image": "references/raw_ref_street_lines_attempt_01.png", + "reference_verify": "references/reference_verify_street_lines.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_street_lines_attempt_01.png", + "output": "references/ref_street_lines.png", + "mask": "references/sam_mask_street_lines.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 384.0, + 98.0, + 639.0, + 925.0 + ], + "mask_score": 3.44596, + "mask_area_ratio": 0.067441, + "elapsed_seconds": 8.1646 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/vocab_task.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..7e207e0b938f36d9ecb9415fdcb824a6f5ec3e1f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/vocab_task.json @@ -0,0 +1,84 @@ +{ + "task_id": "sample_000004", + "sample_id": "sample_000004", + "sample_index": 4, + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 122464, + "image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10", + "name": "walker", + "description": "A man walking, wearing a blue hoodie with 'EMO' written on it, grey pants, and dark shoes. Source dataset: CrowdHuman. Scene context: A group of people, some wearing matching white t-shirts, are walking and jogging along a paved path next to a road, with a grey SUV parked on a grassy hill in the background." + }, + { + "candidate_index": 1, + "source_offset": 38483, + "image_id": "CrowdHuman:data/data_21/283554,2385f000b2018513.jpg:person:44", + "name": "pedestrian", + "description": "Walking in the background, mostly obscured. Source dataset: CrowdHuman. Scene context: A crowded pedestrian area with various shops and a large KFC billboard." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 156913, + "image_id": "BDD100K:b6df605f-51c158b8:object:6", + "name": "traffic light", + "description": "A set of traffic lights suspended over the intersection, showing a red light. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a city street on a sunny day with a U-Haul truck in the opposite lane, parked cars along the right curb, and pedestrians crossing a crosswalk." + }, + { + "candidate_index": 1, + "source_offset": 181614, + "image_id": "BDD100K:bc692855-7c087cf6:object:5", + "name": "turn only sign", + "description": "A white rectangular sign with a black curved arrow indicating 'ONLY' for the lane. Source dataset: BDD100K. Scene context: Nighttime driving scene on a multi-lane city street with several cars and illuminated buildings." + }, + { + "candidate_index": 2, + "source_offset": 141997, + "image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2", + "name": "delivery truck", + "description": "A large white box truck parked behind the fence on the right, with visible green and purple logos. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a wide, paved road flanked by an industrial area with fences and delivery trucks under a cloudy sky." + }, + { + "candidate_index": 3, + "source_offset": 186080, + "image_id": "BDD100K:bcd37eef-1b958ae3:object:5", + "name": "dark parked car", + "description": "Another dark-colored car parked on the right side of the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving down a multi-lane city street with parked cars on the right and oncoming traffic on the left." + }, + { + "candidate_index": 4, + "source_offset": 200940, + "image_id": "BDD100K:c0c9ec9a-d3638a82:object:6", + "name": "street lines", + "description": "Double yellow lines separating traffic directions and white painted lines indicating lanes and crosswalks. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching an intersection underneath an elevated railway structure." + }, + { + "candidate_index": 5, + "source_offset": 225017, + "image_id": "BDD100K:c5b2506d-6e15c4c3:object:8", + "name": "traffic light", + "description": "A traffic signal visible far ahead at an intersection. Source dataset: BDD100K. Scene context: A view from a car driving down a multi-lane city street flanked by tall buildings, trees, and other vehicles." + }, + { + "candidate_index": 6, + "source_offset": 148655, + "image_id": "BDD100K:b5172858-da5e71cc:object:5", + "name": "red advertisement signs", + "description": "Two rectangular red signs with white text, positioned on the sidewalk near the storefront entrance. Source dataset: BDD100K. Scene context: A nighttime city street scene with cars parked and driving, and a lit storefront on the right where people are standing." + }, + { + "candidate_index": 7, + "source_offset": 238418, + "image_id": "BDD100K:c80cf60a-8bb33a63:object:10", + "name": "street light pole", + "description": "tall metal pole for street lighting along the highway Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a multi-lane highway, with a flatbed tow truck carrying a street sweeper prominently visible on the right." + } + ], + "rng_seed": 1782346909, + "created_at": 1782223460.2807603 +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/bbox_overlay.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..5f0cd1862e12d259b66cc9ce40ce1338ea598289 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8cf57163d05cd2fcb09dc2879b3ecb18ca0601148f182b01bee33984774fae8 +size 1820714 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/compose_prompt.txt b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..a448e894bc9d35bff9d46931a08ee92692b30694 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/compose_prompt.txt @@ -0,0 +1,87 @@ +Render the following JSON scene specification as a photorealistic 1248x832 image using a true 3:2 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "A bustling city street intersection on a clear day.", + "activity": "A silver car waits at a red traffic light while pedestrians, including a person in a bright yellow top, a young girl, and a man in a suit, cross the street in front of it.", + "composition": "First-person driver's perspective from a vehicle stopped directly behind the silver car. The silver car is centered in the foreground. The crosswalk and pedestrians are in the midground. The traffic light hangs prominently above the scene. Trees line the sidewalks in the background on both sides, framing the urban environment.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "person_yellow_top", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_24/282555,49a4e000d44469c6.jpg:person:49", + "source_name": "person", + "description": "Standing, wearing a bright yellow top", + "role_in_scene": "Crossing the street on the crosswalk in front of the stopped silver car" + }, + { + "name": "man_in_suit", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", + "source_name": "crowd member", + "description": "A person wearing a professional suit.", + "role_in_scene": "Walking alongside the other pedestrians across the crosswalk" + }, + { + "name": "young_girl", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_51/282555,57b9f000182048e6.jpg:person:13", + "source_name": "pedestrian", + "description": "Young girl with brown hair, wearing a blue patterned top.", + "role_in_scene": "Walking across the intersection near the person in the yellow top" + } + ], + "objects": [ + { + "name": "traffic_light", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", + "source_name": "traffic light", + "description": "A black multi-lens traffic light fixture mounted on a pole above the street.", + "role_in_scene": "Suspended over the intersection, showing a red light to halt the vehicles" + }, + { + "name": "street_trees", + "source_index": 4, + "source_image_id": "CrowdHuman:data/data_37/282555,a4aef000d9c10933.jpg:object:3", + "source_name": "trees", + "description": "Various green trees and shrubs lining the pathway and visible in the background gardens.", + "role_in_scene": "Planted along the sidewalks on both sides of the street, providing urban greenery" + }, + { + "name": "silver_car", + "source_index": 5, + "source_image_id": "BDD100K:be3d3a81-326a032d:object:0", + "source_name": "silver car", + "description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on.", + "role_in_scene": "Stopped in the traffic lane in the foreground, waiting for the pedestrians to cross" + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_man_in_suit.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_man_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..84868599110e99cc6e0b70a9aa23baa1c1b6c7c7 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_man_in_suit.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_person_yellow_top.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_person_yellow_top.png new file mode 100644 index 0000000000000000000000000000000000000000..61da851eda1978d4f35ebb43f821246c279a64ce Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_person_yellow_top.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_silver_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..0ff025acfec2b1544987c3caa1afc7381a15eb85 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_silver_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4f8b4fa2e507713847babc4a42f9a52772dc841492e9af9d6d79ad0c0c276f6 +size 201403 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_street_trees.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_street_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..e491e38f71b7eba33ce6c11eb5d5191c13f677a9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_street_trees.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dafe486422e301d258da005941140f16e4fcd7539f09de40185da6bf9e6799a +size 1580563 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_traffic_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..d9d17ee4ce4f97c95da9eb3a124dad2f54c3ec1b Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_traffic_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_young_girl.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_young_girl.png new file mode 100644 index 0000000000000000000000000000000000000000..1965887b646529ca13b364c15098d0dcb490eacd Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_young_girl.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_man_in_suit.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_man_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..583c41e3b438ad4c94cdada1d8e6aa0322fe3b49 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_man_in_suit.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_person_yellow_top.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_person_yellow_top.png new file mode 100644 index 0000000000000000000000000000000000000000..38d3a1b8dc9db82a24f5fac96048da53a53ad44a Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_person_yellow_top.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_silver_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..81ebdb3b0a9a57c2a9646bdf110fe0faed6c162f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_silver_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2323a8b05c520d3b8ab4c1deb9ecf64ae0533f991c920787c799e55eac29ca03 +size 289497 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_street_trees.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_street_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..b9dffc96732165c9d2712144e51adda9c9c898c9 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_street_trees.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_traffic_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..ea909d6054468d0d1a0248157130777cdf91f860 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_traffic_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_young_girl.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_young_girl.png new file mode 100644 index 0000000000000000000000000000000000000000..ddab19a930c495551bef6a11db7505524b330be8 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_young_girl.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/detections.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..82454b65da892cbf0fec4f7d0fe01a704e5025dd --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/detections.json @@ -0,0 +1,116 @@ +[ + { + "name": "person_yellow_top", + "present": true, + "bbox": [ + 0.5309, + 0.4516, + 0.5607, + 0.6301 + ], + "confidence": 0.95, + "notes": "Person in a yellow top and dark pants is clearly visible walking across the crosswalk.", + "coarse_bbox": [ + 0.526, + 0.451, + 0.561, + 0.628 + ], + "refine_crop": "crops/detect_refine_person_yellow_top.png" + }, + { + "name": "man_in_suit", + "present": true, + "bbox": [ + 0.5767, + 0.4388, + 0.6397, + 0.6278 + ], + "confidence": 0.99, + "notes": "A man wearing a grey suit walking across a crosswalk.", + "coarse_bbox": [ + 0.581, + 0.439, + 0.636, + 0.625 + ], + "refine_crop": "crops/detect_refine_man_in_suit.png" + }, + { + "name": "young_girl", + "present": true, + "bbox": [ + 0.6354, + 0.4889, + 0.6677, + 0.6337 + ], + "confidence": 0.98, + "notes": "Young girl with brown hair and blue patterned top clearly visible.", + "coarse_bbox": [ + 0.639, + 0.487, + 0.668, + 0.632 + ], + "refine_crop": "crops/detect_refine_young_girl.png" + }, + { + "name": "traffic_light", + "present": true, + "bbox": [ + 0.5513, + 0.0408, + 0.6462, + 0.1518 + ], + "confidence": 0.99, + "notes": "A black multi-lens traffic light fixture mounted on a pole above the street.", + "coarse_bbox": [ + 0.552, + 0.035, + 0.647, + 0.152 + ], + "refine_crop": "crops/detect_refine_traffic_light.png" + }, + { + "name": "street_trees", + "present": true, + "bbox": [ + 0.542, + 0.2363, + 0.636, + 0.493 + ], + "confidence": 0.9, + "notes": "Tight bounding box around the group of trees on the right side of the street.", + "coarse_bbox": [ + 0.0, + 0.002, + 1.0, + 0.709 + ], + "refine_crop": "crops/detect_refine_street_trees.png" + }, + { + "name": "silver_car", + "present": true, + "bbox": [ + 0.3062, + 0.4281, + 0.5436, + 0.7674 + ], + "confidence": 0.99, + "notes": "The silver car occupies the central foreground.", + "coarse_bbox": [ + 0.303, + 0.455, + 0.543, + 0.769 + ], + "refine_crop": "crops/detect_refine_silver_car.png" + } +] diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/main_image.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..29a15d22cd42c0334e01c14cfc3f5296fda3e7da --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be1b75db32f566024b8fde24c286d4049ec26a747bbede2d967154e251e3ce2b +size 1873766 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/plan.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..fb3faccfdedc443eaf0ad5a703226bd232a3fdbc --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/plan.json @@ -0,0 +1,164 @@ +{ + "sample_id": "sample_000005", + "target_total": 6, + "target_people": 3, + "target_objects": 3, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "A bustling city street intersection on a clear day.", + "activity": "A silver car waits at a red traffic light while pedestrians, including a person in a bright yellow top, a young girl, and a man in a suit, cross the street in front of it.", + "composition": "First-person driver's perspective from a vehicle stopped directly behind the silver car. The silver car is centered in the foreground. The crosswalk and pedestrians are in the midground. The traffic light hangs prominently above the scene. Trees line the sidewalks in the background on both sides, framing the urban environment.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "person_yellow_top", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_24/282555,49a4e000d44469c6.jpg:person:49", + "source_name": "person", + "description": "Standing, wearing a bright yellow top", + "role_in_scene": "Crossing the street on the crosswalk in front of the stopped silver car" + }, + { + "name": "man_in_suit", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", + "source_name": "crowd member", + "description": "A person wearing a professional suit.", + "role_in_scene": "Walking alongside the other pedestrians across the crosswalk" + }, + { + "name": "young_girl", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_51/282555,57b9f000182048e6.jpg:person:13", + "source_name": "pedestrian", + "description": "Young girl with brown hair, wearing a blue patterned top.", + "role_in_scene": "Walking across the intersection near the person in the yellow top" + } + ], + "objects": [ + { + "name": "traffic_light", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", + "source_name": "traffic light", + "description": "A black multi-lens traffic light fixture mounted on a pole above the street.", + "role_in_scene": "Suspended over the intersection, showing a red light to halt the vehicles" + }, + { + "name": "street_trees", + "source_index": 4, + "source_image_id": "CrowdHuman:data/data_37/282555,a4aef000d9c10933.jpg:object:3", + "source_name": "trees", + "description": "Various green trees and shrubs lining the pathway and visible in the background gardens.", + "role_in_scene": "Planted along the sidewalks on both sides of the street, providing urban greenery" + }, + { + "name": "silver_car", + "source_index": 5, + "source_image_id": "BDD100K:be3d3a81-326a032d:object:0", + "source_name": "silver car", + "description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on.", + "role_in_scene": "Stopped in the traffic lane in the foreground, waiting for the pedestrians to cross" + } + ] + }, + "expected_subjects": [ + { + "name": "person_yellow_top", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_24/282555,49a4e000d44469c6.jpg:person:49", + "source_name": "person", + "source_description": "Standing, wearing a bright yellow top Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered in front of the Louvre museum and its iconic glass pyramid on a sunny day.", + "sub_caption": "person: Standing, wearing a bright yellow top. Scene role: Crossing the street on the crosswalk in front of the stopped silver car", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "man_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", + "source_name": "crowd member", + "source_description": "A person far right in the background wearing a suit. Source dataset: CrowdHuman. Scene context: A crowd of people gathers in front of an old stone building with a prominent arched doorway and two large animal statues.", + "sub_caption": "crowd member: A person wearing a professional suit.. Scene role: Walking alongside the other pedestrians across the crosswalk", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "young_girl", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_51/282555,57b9f000182048e6.jpg:person:13", + "source_name": "pedestrian", + "source_description": "Young girl with brown hair, wearing a blue patterned top. Source dataset: CrowdHuman. Scene context: A sunny outdoor scene featuring the red entrance arch to Navy Pier Beer Garden and a tall brick tower, with a diverse crowd of people walking along the waterfront promenade.", + "sub_caption": "pedestrian: Young girl with brown hair, wearing a blue patterned top.. Scene role: Walking across the intersection near the person in the yellow top", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", + "source_name": "traffic light", + "source_description": "A black multi-lens traffic light fixture mounted on a pole above the street. Source dataset: CrowdHuman. Scene context: A male tour guide is speaking to a group of people standing on a city sidewalk next to a road crossing.", + "sub_caption": "traffic light: A black multi-lens traffic light fixture mounted on a pole above the street.. Scene role: Suspended over the intersection, showing a red light to halt the vehicles", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "street_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_37/282555,a4aef000d9c10933.jpg:object:3", + "source_name": "trees", + "source_description": "Various green trees and shrubs lining the pathway and visible in the background gardens. Source dataset: CrowdHuman. Scene context: A large crowd of tourists walks along the pathway towards the Taj Mahal on a clear day.", + "sub_caption": "trees: Various green trees and shrubs lining the pathway and visible in the background gardens.. Scene role: Planted along the sidewalks on both sides of the street, providing urban greenery", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be3d3a81-326a032d:object:0", + "source_name": "silver car", + "source_description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on. Source dataset: BDD100K. Scene context: A rainy street scene showing cars waiting at an intersection surrounded by tall buildings and urban infrastructure.", + "sub_caption": "silver car: A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on.. Scene role: Stopped in the traffic lane in the foreground, waiting for the pedestrians to cross", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000005/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references.json new file mode 100644 index 0000000000000000000000000000000000000000..b636cd242919532d2673bcfafffac7fff4a5eda4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references.json @@ -0,0 +1,197 @@ +{ + "references": [ + { + "name": "person_yellow_top", + "ref_image": "references/ref_person_yellow_top.png", + "raw_ref_image": "references/raw_ref_person_yellow_top_attempt_01.png", + "diversify_input": "crops/diversify_input_person_yellow_top.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_person_yellow_top_attempt_01.png", + "output": "references/ref_person_yellow_top.png", + "mask": "references/sam_mask_person_yellow_top.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 329.0, + 42.0, + 701.0, + 1012.0 + ], + "mask_score": 3.348943, + "mask_area_ratio": 0.150169, + "elapsed_seconds": 8.25 + }, + "reference_verify": "references/reference_verify_person_yellow_top.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "man_in_suit", + "ref_image": "references/ref_man_in_suit.png", + "raw_ref_image": "references/raw_ref_man_in_suit_attempt_01.png", + "diversify_input": "crops/diversify_input_man_in_suit.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_man_in_suit_attempt_01.png", + "output": "references/ref_man_in_suit.png", + "mask": "references/sam_mask_man_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 351.0, + 27.0, + 671.0, + 1004.0 + ], + "mask_score": 3.48496, + "mask_area_ratio": 0.144686, + "elapsed_seconds": 9.7885 + }, + "reference_verify": "references/reference_verify_man_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "young_girl", + "ref_image": "references/ref_young_girl.png", + "raw_ref_image": "references/raw_ref_young_girl_attempt_01.png", + "diversify_input": "crops/diversify_input_young_girl.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_young_girl_attempt_01.png", + "output": "references/ref_young_girl.png", + "mask": "references/sam_mask_young_girl.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 369.0, + 52.0, + 661.0, + 1003.0 + ], + "mask_score": 3.482282, + "mask_area_ratio": 0.133298, + "elapsed_seconds": 8.3216 + }, + "reference_verify": "references/reference_verify_young_girl.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "traffic_light", + "ref_image": "references/ref_traffic_light.png", + "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", + "diversify_input": "crops/diversify_input_traffic_light.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_traffic_light_attempt_01.png", + "output": "references/ref_traffic_light.png", + "mask": "references/sam_mask_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 113.0, + 201.0, + 923.0, + 826.0 + ], + "mask_score": 3.467034, + "mask_area_ratio": 0.289252, + "elapsed_seconds": 9.874 + }, + "reference_verify": "references/reference_verify_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "street_trees", + "ref_image": "references/ref_street_trees.png", + "raw_ref_image": "references/raw_ref_street_trees_attempt_03.png", + "diversify_input": "crops/diversify_input_street_trees.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_street_trees_attempt_03.png", + "output": "references/ref_street_trees.png", + "mask": "references/sam_mask_street_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 22.0, + 60.0, + 1003.0, + 968.0 + ], + "mask_score": 3.301958, + "mask_area_ratio": 0.393952, + "elapsed_seconds": 8.2223 + }, + "reference_verify": "references/reference_verify_street_trees.json", + "reference_verify_passed": true, + "reference_attempts": 3 + }, + { + "name": "silver_car", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "diversify_input": "crops/diversify_input_silver_car.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 80.0, + 167.0, + 957.0, + 937.0 + ], + "mask_score": 3.434142, + "mask_area_ratio": 0.414005, + "elapsed_seconds": 8.3073 + }, + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_man_in_suit.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_man_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..c7e41f74c4c096715515c8aae7542b8752a67ed3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_man_in_suit.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4cfce9f423d69ea5ba9a4908e45b015594cb8ebed1b959cd8f10181988fe55a +size 287446 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_person_yellow_top.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_person_yellow_top.png new file mode 100644 index 0000000000000000000000000000000000000000..b004d61c6ec87a6dc6dfd3efd955672adbc1f94b --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_person_yellow_top.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aab4219e648e104ec779a12731e20475450d82cf38e3aa071de0d01a3adfdedf +size 275936 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_silver_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..fad4049cc5535ca7acca8d33d56b6e5a65c3d4b5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_silver_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c6c5c67192a3a1057088614882a8da90e98adb3bf148e13aaa871c5c26abe07 +size 688327 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_street_trees.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_street_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..5684caf5a6979a5abd9f5346bd998341b7e17805 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_street_trees.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccaee2c5fac51ed0331176e6c14180547b14db07681304610de1203df3b093ca +size 1146709 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_traffic_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..fd62e540c402097f9129fc3689322b0daca650d4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_traffic_light.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:845adf3043c0f26b3c5ef62456ed719eb46ede77e0f2ff19f0bd706cbf9b12cb +size 523096 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_young_girl.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_young_girl.png new file mode 100644 index 0000000000000000000000000000000000000000..8a5c5ae93ec68d7c4c52b93b4abe99f9250d19a1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_young_girl.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12960f84bbb6a17dcf069ee11e6f43017c36e06e9111eb26a0e835dcace02fec +size 300115 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_man_in_suit.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_man_in_suit.json new file mode 100644 index 0000000000000000000000000000000000000000..df463a42ea80027e5c0acffe9a90d096990bfb71 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_man_in_suit.json @@ -0,0 +1,46 @@ +{ + "name": "man_in_suit", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_man_in_suit_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_man_in_suit_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_man_in_suit_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_man_in_suit_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_man_in_suit_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_man_in_suit_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 351.0, + 27.0, + 671.0, + 1004.0 + ], + "mask_score": 3.48496, + "mask_area_ratio": 0.144686, + "elapsed_seconds": 9.7885 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The man in the suit is fully visible from head to toe on a clean white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_person_yellow_top.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_person_yellow_top.json new file mode 100644 index 0000000000000000000000000000000000000000..3e765551b03322e5bf4685a3ae84a3d88b7e13d7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_person_yellow_top.json @@ -0,0 +1,46 @@ +{ + "name": "person_yellow_top", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_person_yellow_top_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_person_yellow_top_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_person_yellow_top_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_person_yellow_top_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_person_yellow_top_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_person_yellow_top_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 329.0, + 42.0, + 701.0, + 1012.0 + ], + "mask_score": 3.348943, + "mask_area_ratio": 0.150169, + "elapsed_seconds": 8.25 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Full body visible, isolated on white background with sufficient margin." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_silver_car.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_silver_car.json new file mode 100644 index 0000000000000000000000000000000000000000..25d3758a36887d38c567cf7e77bfa6d2ce926e32 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_silver_car.json @@ -0,0 +1,46 @@ +{ + "name": "silver_car", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_silver_car_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_silver_car_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_silver_car_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_silver_car_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_silver_car_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 80.0, + 167.0, + 957.0, + 937.0 + ], + "mask_score": 3.434142, + "mask_area_ratio": 0.414005, + "elapsed_seconds": 8.3073 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The car is shown from the rear rather than facing forward, but it is a complete, well-isolated reference image of the silver car." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_street_trees.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_street_trees.json new file mode 100644 index 0000000000000000000000000000000000000000..3bc436f8f37fa89c330cebae28ab80e0d1d3a497 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_street_trees.json @@ -0,0 +1,130 @@ +{ + "name": "street_trees", + "passed": true, + "accepted_attempt": 3, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_trees_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_trees_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_trees_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_street_trees_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_street_trees_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_street_trees_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 319.0, + 0.0, + 732.0, + 1023.0 + ], + "mask_score": 1.881913, + "mask_area_ratio": 0.009517, + "elapsed_seconds": 8.1823 + }, + "verify": { + "passed": false, + "subject_visible": false, + "complete_subject": false, + "cropped_or_truncated": false, + "single_main_subject": false, + "white_background": true, + "failure_reasons": [ + "Intended subject (street trees) is absent.", + "Image contains a completely unrecognizable fragmented vertical artifact instead of trees or shrubs." + ], + "notes": "The image shows a thin, broken vertical line on a white background, which does not resemble trees or shrubs in any way." + } + }, + { + "attempt": 2, + "raw_ref_image": "references/raw_ref_street_trees_attempt_02.png", + "candidate_ref_image": "references/candidate_ref_street_trees_attempt_02.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_trees_attempt_02.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_street_trees_attempt_02.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_street_trees_attempt_02.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_street_trees_attempt_02.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 366.0, + 0.0, + 675.0, + 1023.0 + ], + "mask_score": 3.209849, + "mask_area_ratio": 0.022717, + "elapsed_seconds": 9.8095 + }, + "verify": { + "passed": false, + "subject_visible": false, + "complete_subject": false, + "cropped_or_truncated": false, + "single_main_subject": false, + "white_background": true, + "failure_reasons": [ + "The intended subject (street trees) is absent.", + "The image shows a streetlamp instead of trees." + ], + "notes": "The image features an isolated streetlamp on a white background, completely lacking the requested street trees." + } + }, + { + "attempt": 3, + "raw_ref_image": "references/raw_ref_street_trees_attempt_03.png", + "candidate_ref_image": "references/candidate_ref_street_trees_attempt_03.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_trees_attempt_03.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_street_trees_attempt_03.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_street_trees_attempt_03.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_street_trees_attempt_03.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 22.0, + 60.0, + 1003.0, + 968.0 + ], + "mask_score": 3.301958, + "mask_area_ratio": 0.393952, + "elapsed_seconds": 8.2223 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a representative group of trees isolated on a white background, which is perfectly acceptable as a reference for 'street_trees'." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_traffic_light.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_traffic_light.json new file mode 100644 index 0000000000000000000000000000000000000000..635e7a81faa1e2d0f558fa50767374e36d47d681 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_traffic_light.json @@ -0,0 +1,46 @@ +{ + "name": "traffic_light", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_traffic_light_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_traffic_light_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_traffic_light_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_traffic_light_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_traffic_light_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 113.0, + 201.0, + 923.0, + 826.0 + ], + "mask_score": 3.467034, + "mask_area_ratio": 0.289252, + "elapsed_seconds": 9.874 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete, isolated traffic light fixture on a white background, which meets all requirements for a non-person subject reference image." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_young_girl.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_young_girl.json new file mode 100644 index 0000000000000000000000000000000000000000..1488207cba53ab9e4002f570f08e2b3b560f7222 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/reference_verify_young_girl.json @@ -0,0 +1,46 @@ +{ + "name": "young_girl", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_young_girl_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_young_girl_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_young_girl_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_young_girl_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_ref_young_girl_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/candidate_sam_mask_young_girl_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 369.0, + 52.0, + 661.0, + 1003.0 + ], + "mask_score": 3.482282, + "mask_area_ratio": 0.133298, + "elapsed_seconds": 8.3216 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a young girl standing isolated on a white background with no cropping." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_man_in_suit.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_man_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..78b1e83edbf51bf605d1df4143e1472a3eaed890 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_man_in_suit.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_person_yellow_top.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_person_yellow_top.png new file mode 100644 index 0000000000000000000000000000000000000000..e7eac5d19332234a207623c19ab0f1289473cbf4 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_person_yellow_top.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_silver_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..b3bc75ce2dd9335fa4618332f7f6130ace6bfabb Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_silver_car.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_street_trees.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_street_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..e5abedd77be679b154f972414f33db5211942d4b Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_street_trees.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_traffic_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_traffic_light.png new file mode 100644 index 0000000000000000000000000000000000000000..278725944904f84962e62088ee6de748ce6d8136 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_traffic_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_young_girl.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_young_girl.png new file mode 100644 index 0000000000000000000000000000000000000000..3e7ea217678c762302713bb20d40a2b9f95ca0fc Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/sam_mask_young_girl.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/row.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/row.json new file mode 100644 index 0000000000000000000000000000000000000000..030ad1a12b026f0f651944838179229027727a84 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/row.json @@ -0,0 +1,302 @@ +{ + "sample_id": "sample_000005", + "target_total": 6, + "target_people": 3, + "target_objects": 3, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 6, + "n_detected": 6, + "n_subjects": 6, + "subjects": [ + { + "name": "person_yellow_top", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_24/282555,49a4e000d44469c6.jpg:person:49", + "source_name": "person", + "source_description": "Standing, wearing a bright yellow top Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered in front of the Louvre museum and its iconic glass pyramid on a sunny day.", + "sub_caption": "person: Standing, wearing a bright yellow top. Scene role: Crossing the street on the crosswalk in front of the stopped silver car", + "measured_bbox": [ + 0.5309, + 0.4516, + 0.5607, + 0.6301 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_person_yellow_top.png", + "raw_ref_image": "references/raw_ref_person_yellow_top_attempt_01.png", + "reference_verify": "references/reference_verify_person_yellow_top.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_person_yellow_top_attempt_01.png", + "output": "references/ref_person_yellow_top.png", + "mask": "references/sam_mask_person_yellow_top.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 329.0, + 42.0, + 701.0, + 1012.0 + ], + "mask_score": 3.348943, + "mask_area_ratio": 0.150169, + "elapsed_seconds": 8.25 + } + }, + { + "name": "man_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", + "source_name": "crowd member", + "source_description": "A person far right in the background wearing a suit. Source dataset: CrowdHuman. Scene context: A crowd of people gathers in front of an old stone building with a prominent arched doorway and two large animal statues.", + "sub_caption": "crowd member: A person wearing a professional suit.. Scene role: Walking alongside the other pedestrians across the crosswalk", + "measured_bbox": [ + 0.5767, + 0.4388, + 0.6397, + 0.6278 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_man_in_suit.png", + "raw_ref_image": "references/raw_ref_man_in_suit_attempt_01.png", + "reference_verify": "references/reference_verify_man_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_man_in_suit_attempt_01.png", + "output": "references/ref_man_in_suit.png", + "mask": "references/sam_mask_man_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 351.0, + 27.0, + 671.0, + 1004.0 + ], + "mask_score": 3.48496, + "mask_area_ratio": 0.144686, + "elapsed_seconds": 9.7885 + } + }, + { + "name": "young_girl", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_51/282555,57b9f000182048e6.jpg:person:13", + "source_name": "pedestrian", + "source_description": "Young girl with brown hair, wearing a blue patterned top. Source dataset: CrowdHuman. Scene context: A sunny outdoor scene featuring the red entrance arch to Navy Pier Beer Garden and a tall brick tower, with a diverse crowd of people walking along the waterfront promenade.", + "sub_caption": "pedestrian: Young girl with brown hair, wearing a blue patterned top.. Scene role: Walking across the intersection near the person in the yellow top", + "measured_bbox": [ + 0.6354, + 0.4889, + 0.6677, + 0.6337 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_young_girl.png", + "raw_ref_image": "references/raw_ref_young_girl_attempt_01.png", + "reference_verify": "references/reference_verify_young_girl.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_young_girl_attempt_01.png", + "output": "references/ref_young_girl.png", + "mask": "references/sam_mask_young_girl.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 369.0, + 52.0, + 661.0, + 1003.0 + ], + "mask_score": 3.482282, + "mask_area_ratio": 0.133298, + "elapsed_seconds": 8.3216 + } + }, + { + "name": "traffic_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", + "source_name": "traffic light", + "source_description": "A black multi-lens traffic light fixture mounted on a pole above the street. Source dataset: CrowdHuman. Scene context: A male tour guide is speaking to a group of people standing on a city sidewalk next to a road crossing.", + "sub_caption": "traffic light: A black multi-lens traffic light fixture mounted on a pole above the street.. Scene role: Suspended over the intersection, showing a red light to halt the vehicles", + "measured_bbox": [ + 0.5513, + 0.0408, + 0.6462, + 0.1518 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_traffic_light.png", + "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png", + "reference_verify": "references/reference_verify_traffic_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_traffic_light_attempt_01.png", + "output": "references/ref_traffic_light.png", + "mask": "references/sam_mask_traffic_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 113.0, + 201.0, + 923.0, + 826.0 + ], + "mask_score": 3.467034, + "mask_area_ratio": 0.289252, + "elapsed_seconds": 9.874 + } + }, + { + "name": "street_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_37/282555,a4aef000d9c10933.jpg:object:3", + "source_name": "trees", + "source_description": "Various green trees and shrubs lining the pathway and visible in the background gardens. Source dataset: CrowdHuman. Scene context: A large crowd of tourists walks along the pathway towards the Taj Mahal on a clear day.", + "sub_caption": "trees: Various green trees and shrubs lining the pathway and visible in the background gardens.. Scene role: Planted along the sidewalks on both sides of the street, providing urban greenery", + "measured_bbox": [ + 0.542, + 0.2363, + 0.636, + 0.493 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_trees.png", + "raw_ref_image": "references/raw_ref_street_trees_attempt_03.png", + "reference_verify": "references/reference_verify_street_trees.json", + "reference_verify_passed": true, + "reference_attempts": 3, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_street_trees_attempt_03.png", + "output": "references/ref_street_trees.png", + "mask": "references/sam_mask_street_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 22.0, + 60.0, + 1003.0, + 968.0 + ], + "mask_score": 3.301958, + "mask_area_ratio": 0.393952, + "elapsed_seconds": 8.2223 + } + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be3d3a81-326a032d:object:0", + "source_name": "silver car", + "source_description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on. Source dataset: BDD100K. Scene context: A rainy street scene showing cars waiting at an intersection surrounded by tall buildings and urban infrastructure.", + "sub_caption": "silver car: A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on.. Scene role: Stopped in the traffic lane in the foreground, waiting for the pedestrians to cross", + "measured_bbox": [ + 0.3062, + 0.4281, + 0.5436, + 0.7674 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 80.0, + 167.0, + 957.0, + 937.0 + ], + "mask_score": 3.434142, + "mask_area_ratio": 0.414005, + "elapsed_seconds": 8.3073 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/vocab_task.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..4cf0dccdf419d38552a93cb5d19a68345dcad56f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/vocab_task.json @@ -0,0 +1,98 @@ +{ + "task_id": "sample_000005", + "sample_id": "sample_000005", + "sample_index": 5, + "target_total": 6, + "target_people": 3, + "target_objects": 3, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 46975, + "image_id": "CrowdHuman:data/data_24/282555,49a4e000d44469c6.jpg:person:49", + "name": "person", + "description": "Standing, wearing a bright yellow top Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered in front of the Louvre museum and its iconic glass pyramid on a sunny day." + }, + { + "candidate_index": 1, + "source_offset": 55111, + "image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19", + "name": "crowd member", + "description": "A person far right in the background wearing a suit. Source dataset: CrowdHuman. Scene context: A crowd of people gathers in front of an old stone building with a prominent arched doorway and two large animal statues." + }, + { + "candidate_index": 2, + "source_offset": 123738, + "image_id": "CrowdHuman:data/data_51/282555,57b9f000182048e6.jpg:person:13", + "name": "pedestrian", + "description": "Young girl with brown hair, wearing a blue patterned top. Source dataset: CrowdHuman. Scene context: A sunny outdoor scene featuring the red entrance arch to Navy Pier Beer Garden and a tall brick tower, with a diverse crowd of people walking along the waterfront promenade." + }, + { + "candidate_index": 3, + "source_offset": 32630, + "image_id": "CrowdHuman:data/data_2/282555,e0af90003451118a.jpg:person:8", + "name": "firefighter", + "description": "Wearing a dark uniform with yellow reflective stripes and a white helmet, standing facing away near the fire truck. Source dataset: CrowdHuman. Scene context: Emergency response personnel, including firefighters and ambulance crew, are gathered outside a large classical building with pillars and banners, accompanied by emergency vehicles." + }, + { + "candidate_index": 4, + "source_offset": 2956, + "image_id": "CrowdHuman:data/data_1/273275,f68c20007e0bf148.jpg:person:3", + "name": "uniformed officer", + "description": "wearing a khaki uniform and helmet, holding a baton, looking towards the left Source dataset: CrowdHuman. Scene context: A large crowd of people, including some in uniform with batons and helmets, stands in front of a red and yellow building." + }, + { + "candidate_index": 5, + "source_offset": 49906, + "image_id": "CrowdHuman:data/data_26/273278,110f89000f6dc4f9f.jpg:person:23", + "name": "pedestrian far left background", + "description": "A person walking in the background on the left. Source dataset: CrowdHuman. Scene context: A slightly elevated view of a city street and sidewalk, showing a subway entrance, pedestrians, runners, cars, and trees." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 6027, + "image_id": "CrowdHuman:data/data_10/273275,36cc4000c1fb4fda.jpg:object:3", + "name": "C'BON Cosmetics sign", + "description": "A large green 'C'BON COSMETICS' sign across the middle of the cylindrical glass building. Source dataset: CrowdHuman. Scene context: A bustling city street corner featuring several tall commercial buildings covered with prominent advertisements and brand logos." + }, + { + "candidate_index": 1, + "source_offset": 233879, + "image_id": "BDD100K:c789ecd3-819d4445:object:9", + "name": "sign", + "description": "A small diamond-shaped yellow sign mounted along the barrier. Source dataset: BDD100K. Scene context: Night driving on a highway with traffic ahead, streetlights, and a barrier wall." + }, + { + "candidate_index": 2, + "source_offset": 53140, + "image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0", + "name": "traffic light", + "description": "A black multi-lens traffic light fixture mounted on a pole above the street. Source dataset: CrowdHuman. Scene context: A male tour guide is speaking to a group of people standing on a city sidewalk next to a road crossing." + }, + { + "candidate_index": 3, + "source_offset": 242371, + "image_id": "BDD100K:c946c532-07177e0a:object:11", + "name": "concrete barrier", + "description": "A continuous low concrete wall acting as a barrier on the right side of the road. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a multi-lane highway during the day, with construction or industrial sites visible alongside." + }, + { + "candidate_index": 4, + "source_offset": 57443, + "image_id": "CrowdHuman:data/data_37/282555,a4aef000d9c10933.jpg:object:3", + "name": "trees", + "description": "Various green trees and shrubs lining the pathway and visible in the background gardens. Source dataset: CrowdHuman. Scene context: A large crowd of tourists walks along the pathway towards the Taj Mahal on a clear day." + }, + { + "candidate_index": 5, + "source_offset": 189425, + "image_id": "BDD100K:be3d3a81-326a032d:object:0", + "name": "silver car", + "description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on. Source dataset: BDD100K. Scene context: A rainy street scene showing cars waiting at an intersection surrounded by tall buildings and urban infrastructure." + } + ], + "rng_seed": 1782451638, + "created_at": 1782223460.3017533 +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/bbox_overlay.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..34ffbd4b3c03b90ab4162b58a418875baa732630 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5b11aa68a3140dc64a158548dc38c3dd69e5f030aa2f1a2ce94828a16c2734 +size 1604060 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/compose_prompt.txt b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..41727cfcb40cfeae913be8b443a4671e76153088 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/compose_prompt.txt @@ -0,0 +1,103 @@ +Render the following JSON scene specification as a photorealistic 1280x720 image using a true 16:9 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "A bustling city street on a sunny day, near a classical building and a park area.", + "activity": "A dashcam view following a white van down a road, with a pedestrian crossing ahead while an emergency vehicle flashes its lights in the distance.", + "composition": "Wide 16:9 perspective from a driving vehicle. Double solid white lines guide the eye down the center of the asphalt. A white van occupies the midground right lane. The left background features a crosswalk, an emergency vehicle, and a crossing pedestrian. The right side shows a sidewalk with a foreground trash can, blooming trees, and a building facade displaying a large banner.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", + "source_name": "pedestrian", + "description": "A person walking across the street in the distance.", + "role_in_scene": "Crossing the street near the left background." + } + ], + "objects": [ + { + "name": "emergency_vehicle", + "source_index": 1, + "source_image_id": "BDD100K:b99f250d-886111c5:object:5", + "source_name": "vehicle", + "description": "A dark-colored vehicle with blue emergency lights flashing.", + "role_in_scene": "Stopped or parked in the distant left background." + }, + { + "name": "curbside_trash_can", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_10/273275,5da3d000f5358c3c.jpg:object:9", + "source_name": "trash can", + "description": "A dark blue cylindrical bin.", + "role_in_scene": "Placed on the sidewalk curb in the lower right foreground." + }, + { + "name": "museum_banner", + "source_index": 3, + "source_image_id": "CrowdHuman:data/data_24/282555,14d900042c7b9a4.jpg:object:3", + "source_name": "exhibition banner", + "description": "A large, rectangular maroon banner hanging on a building's facade.", + "role_in_scene": "Hanging from the classical architecture on the right side of the street." + }, + { + "name": "white_panel_van", + "source_index": 5, + "source_image_id": "BDD100K:b5047c50-e1facff6:object:2", + "source_name": "white van", + "description": "A large white panel van with illuminated red taillights.", + "role_in_scene": "Driving ahead in the right lane of the road." + }, + { + "name": "double_solid_white_line", + "source_index": 6, + "source_image_id": "BDD100K:bc886d37-5b22c313:object:7", + "source_name": "double solid white line", + "description": "Two continuous white painted lines on dark asphalt.", + "role_in_scene": "Dividing the traffic lanes down the center of the street." + }, + { + "name": "blooming_trees", + "source_index": 8, + "source_image_id": "CrowdHuman:data/data_26/283991,91fa000cda68156.jpg:object:5", + "source_name": "blooming trees", + "description": "Trees featuring vibrant pink blossoms.", + "role_in_scene": "Lining the sidewalk and park area on the right side of the street." + }, + { + "name": "green_street_sign", + "source_index": 12, + "source_image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", + "source_name": "street sign", + "description": "A green rectangular street sign.", + "role_in_scene": "Mounted on a pole on the left side of the street near the crosswalk." + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_blooming_trees.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_blooming_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..7527f4f7f22dc4c85ef76f59d64d715d35abffa2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_blooming_trees.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:519802f6a19228621406e73416eba717bc1144c20f98afde23948c50bdb0b62a +size 138184 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_curbside_trash_can.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_curbside_trash_can.png new file mode 100644 index 0000000000000000000000000000000000000000..4df9a4893097f9f96bf0cc4645aeae21fd85ccc6 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_curbside_trash_can.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_double_solid_white_line.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_double_solid_white_line.png new file mode 100644 index 0000000000000000000000000000000000000000..900e3ab56ac323e6a055bf12bbcdc93cf80e85f3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_double_solid_white_line.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2951d65161a07711db21dbb511221eb569010b44f3cfdb6b3b2250f34b5114a +size 125616 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_emergency_vehicle.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_emergency_vehicle.png new file mode 100644 index 0000000000000000000000000000000000000000..b44d88cb91b97158e1e046ce619285b372b4d66c Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_emergency_vehicle.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_green_street_sign.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_green_street_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..2fe7dc69f686b0ee2857e83afc25eb414d35dea8 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_green_street_sign.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_museum_banner.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_museum_banner.png new file mode 100644 index 0000000000000000000000000000000000000000..b4eb1ea1aec655740449b6924d61fa31d75f6271 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_museum_banner.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_pedestrian.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..a95ec1ba16a0564cc92069aacb1f7b329a10217e Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_white_panel_van.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_white_panel_van.png new file mode 100644 index 0000000000000000000000000000000000000000..f3530e187ee577c41b850a2880862d28fd7d8994 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_white_panel_van.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:906651d52c4c2fcebaa97ca5139720463423fe3e0c83067d092272dff70a26f7 +size 132627 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_blooming_trees.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_blooming_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..352dc0f0372238f1e96d452e24d21ba8fafe8df4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_blooming_trees.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575f2f60f5d18a03d0f22e4b08fc3192f478318ce46f47031cacfae778a63a7a +size 209904 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_curbside_trash_can.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_curbside_trash_can.png new file mode 100644 index 0000000000000000000000000000000000000000..77ea6d4ef6b1e0027fa2007575e1a95ad6c9fc58 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_curbside_trash_can.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_double_solid_white_line.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_double_solid_white_line.png new file mode 100644 index 0000000000000000000000000000000000000000..aad495c6c8c98b907cc834648c684bb4bb541a67 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_double_solid_white_line.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f20184a49ec77fe176286d5cb82f31b0ee0f242337a2b6d958244a0fec31f11f +size 156433 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_emergency_vehicle.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_emergency_vehicle.png new file mode 100644 index 0000000000000000000000000000000000000000..5e06806aed80cfc669f684ec90626d1c462ab886 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_emergency_vehicle.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_green_street_sign.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_green_street_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..7b61ae76db39b7f2ba326367b4978263c9aea7f3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_green_street_sign.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_museum_banner.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_museum_banner.png new file mode 100644 index 0000000000000000000000000000000000000000..f6aa137aa6a9436a457cc5f75cd987a5b225ad6e Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_museum_banner.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_pedestrian.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..c8ba711a77f1052006b0a2be9e97acafefb109b2 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_white_panel_van.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_white_panel_van.png new file mode 100644 index 0000000000000000000000000000000000000000..3c06bcfa6e8d157a1dca7cace4bc869903417d54 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_white_panel_van.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fa33eb92247288ec96641140d1092557f4079df5e17e36f30c72342d2ad7bb2 +size 188509 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/detections.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..5dfa3c573008643651e776a985d7284ddd649bb1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/detections.json @@ -0,0 +1,154 @@ +[ + { + "name": "pedestrian", + "present": true, + "bbox": [ + 0.2151, + 0.4819, + 0.2507, + 0.5947 + ], + "confidence": "high", + "notes": "A person walking across the street in the distance.", + "coarse_bbox": [ + 0.215, + 0.48, + 0.25, + 0.592 + ], + "refine_crop": "crops/detect_refine_pedestrian.png" + }, + { + "name": "emergency_vehicle", + "present": true, + "bbox": [ + 0.325, + 0.4787, + 0.3786, + 0.5486 + ], + "confidence": 0.95, + "notes": "Tight bounding box around the visible portion of the dark-colored vehicle with flashing emergency lights.", + "coarse_bbox": [ + 0.324, + 0.48, + 0.375, + 0.55 + ], + "refine_crop": "crops/detect_refine_emergency_vehicle.png" + }, + { + "name": "curbside_trash_can", + "present": true, + "bbox": [ + 0.8371, + 0.5448, + 0.9204, + 0.7599 + ], + "confidence": 0.99, + "notes": "A dark blue cylindrical trash can is clearly visible, occupying most of the image. The box captures its full extent.", + "coarse_bbox": [ + 0.839, + 0.542, + 0.917, + 0.753 + ], + "refine_crop": "crops/detect_refine_curbside_trash_can.png" + }, + { + "name": "museum_banner", + "present": true, + "bbox": [ + 0.8013, + 0.0164, + 0.8543, + 0.2771 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the maroon exhibition banner.", + "coarse_bbox": [ + 0.799, + 0.014, + 0.855, + 0.278 + ], + "refine_crop": "crops/detect_refine_museum_banner.png" + }, + { + "name": "white_panel_van", + "present": true, + "bbox": [ + 0.5122, + 0.3306, + 0.6867, + 0.6936 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the visible white panel van including its wheels and mirrors.", + "coarse_bbox": [ + 0.512, + 0.332, + 0.685, + 0.675 + ], + "refine_crop": "crops/detect_refine_white_panel_van.png" + }, + { + "name": "double_solid_white_line", + "present": true, + "bbox": [ + 0.2383, + 0.6095, + 0.4221, + 0.8925 + ], + "confidence": "high", + "notes": "Double solid white line visible in the center of the road.", + "coarse_bbox": [ + 0.237, + 0.61, + 0.422, + 0.89 + ], + "refine_crop": "crops/detect_refine_double_solid_white_line.png" + }, + { + "name": "blooming_trees", + "present": true, + "bbox": [ + 0.6279, + 0.2153, + 0.8163, + 0.5163 + ], + "confidence": 0.95, + "notes": "Tight bounding box capturing the visible extent of the blooming pink trees in the background, carefully excluding as much of the foreground white van and surrounding buildings/poles as possible while still containing the entire visible bloom.", + "coarse_bbox": [ + 0.632, + 0.24, + 0.812, + 0.493 + ], + "refine_crop": "crops/detect_refine_blooming_trees.png" + }, + { + "name": "green_street_sign", + "present": true, + "bbox": [ + 0.1915, + 0.2239, + 0.2775, + 0.2533 + ], + "confidence": 0.9, + "notes": "A green rectangular street sign is visible in the center of the crop.", + "coarse_bbox": [ + 0.191, + 0.217, + 0.278, + 0.261 + ], + "refine_crop": "crops/detect_refine_green_street_sign.png" + } +] diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/main_image.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..8daa9c9e836d551fe709eb5f7c6a343b3320eab1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:196224b33b3d2cd01f14ffd54488269afcd7435eb8beff88d4711c012f125c15 +size 1712095 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/plan.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..3386febfdfada845acecc0ff1c4ef272c8bc2062 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/plan.json @@ -0,0 +1,202 @@ +{ + "sample_id": "sample_000006", + "target_total": 8, + "target_people": 1, + "target_objects": 7, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "A bustling city street on a sunny day, near a classical building and a park area.", + "activity": "A dashcam view following a white van down a road, with a pedestrian crossing ahead while an emergency vehicle flashes its lights in the distance.", + "composition": "Wide 16:9 perspective from a driving vehicle. Double solid white lines guide the eye down the center of the asphalt. A white van occupies the midground right lane. The left background features a crosswalk, an emergency vehicle, and a crossing pedestrian. The right side shows a sidewalk with a foreground trash can, blooming trees, and a building facade displaying a large banner.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", + "source_name": "pedestrian", + "description": "A person walking across the street in the distance.", + "role_in_scene": "Crossing the street near the left background." + } + ], + "objects": [ + { + "name": "emergency_vehicle", + "source_index": 1, + "source_image_id": "BDD100K:b99f250d-886111c5:object:5", + "source_name": "vehicle", + "description": "A dark-colored vehicle with blue emergency lights flashing.", + "role_in_scene": "Stopped or parked in the distant left background." + }, + { + "name": "curbside_trash_can", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_10/273275,5da3d000f5358c3c.jpg:object:9", + "source_name": "trash can", + "description": "A dark blue cylindrical bin.", + "role_in_scene": "Placed on the sidewalk curb in the lower right foreground." + }, + { + "name": "museum_banner", + "source_index": 3, + "source_image_id": "CrowdHuman:data/data_24/282555,14d900042c7b9a4.jpg:object:3", + "source_name": "exhibition banner", + "description": "A large, rectangular maroon banner hanging on a building's facade.", + "role_in_scene": "Hanging from the classical architecture on the right side of the street." + }, + { + "name": "white_panel_van", + "source_index": 5, + "source_image_id": "BDD100K:b5047c50-e1facff6:object:2", + "source_name": "white van", + "description": "A large white panel van with illuminated red taillights.", + "role_in_scene": "Driving ahead in the right lane of the road." + }, + { + "name": "double_solid_white_line", + "source_index": 6, + "source_image_id": "BDD100K:bc886d37-5b22c313:object:7", + "source_name": "double solid white line", + "description": "Two continuous white painted lines on dark asphalt.", + "role_in_scene": "Dividing the traffic lanes down the center of the street." + }, + { + "name": "blooming_trees", + "source_index": 8, + "source_image_id": "CrowdHuman:data/data_26/283991,91fa000cda68156.jpg:object:5", + "source_name": "blooming trees", + "description": "Trees featuring vibrant pink blossoms.", + "role_in_scene": "Lining the sidewalk and park area on the right side of the street." + }, + { + "name": "green_street_sign", + "source_index": 12, + "source_image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", + "source_name": "street sign", + "description": "A green rectangular street sign.", + "role_in_scene": "Mounted on a pole on the left side of the street near the crosswalk." + } + ] + }, + "expected_subjects": [ + { + "name": "pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", + "source_name": "pedestrian", + "source_description": "Another person in the distant background near the green structure. Source dataset: CrowdHuman. Scene context: A large crowd of people walking across a street with trees, streetlamps, and classic architecture in the background.", + "sub_caption": "pedestrian: A person walking across the street in the distance.. Scene role: Crossing the street near the left background.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "emergency_vehicle", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b99f250d-886111c5:object:5", + "source_name": "vehicle", + "source_description": "A dark-colored vehicle partially visible in the left background with blue emergency lights flashing. Source dataset: BDD100K. Scene context: A nighttime city street intersection showing a crosswalk, construction barriers, and illuminated traffic signals.", + "sub_caption": "vehicle: A dark-colored vehicle with blue emergency lights flashing.. Scene role: Stopped or parked in the distant left background.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "curbside_trash_can", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_10/273275,5da3d000f5358c3c.jpg:object:9", + "source_name": "trash can", + "source_description": "dark blue cylindrical bin partially visible in the foreground Source dataset: CrowdHuman. Scene context: A group of children and an adult pose for a photo in front of a roller coaster at an amusement park.", + "sub_caption": "trash can: A dark blue cylindrical bin.. Scene role: Placed on the sidewalk curb in the lower right foreground.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "museum_banner", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_24/282555,14d900042c7b9a4.jpg:object:3", + "source_name": "exhibition banner", + "source_description": "A large, rectangular banner hanging on the building's facade, featuring a maroon background and text. Source dataset: CrowdHuman. Scene context: A large, classical building, likely a museum, with many people sitting and standing on its wide front steps, while a yellow taxi speeds past in the foreground.", + "sub_caption": "exhibition banner: A large, rectangular maroon banner hanging on a building's facade.. Scene role: Hanging from the classical architecture on the right side of the street.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "white_panel_van", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5047c50-e1facff6:object:2", + "source_name": "white van", + "source_description": "A large white panel van with red taillights illuminated, driving in the right lane ahead. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a multi-lane city street on a sunny day with moderate traffic.", + "sub_caption": "white van: A large white panel van with illuminated red taillights.. Scene role: Driving ahead in the right lane of the road.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "double_solid_white_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bc886d37-5b22c313:object:7", + "source_name": "double solid white line", + "source_description": "Two continuous white painted lines on the dark asphalt road surface, separating the two lanes of traffic. Source dataset: BDD100K. Scene context: A view from inside a car driving through a brightly lit tunnel with tiled walls, following a silver SUV and a dark compact car.", + "sub_caption": "double solid white line: Two continuous white painted lines on dark asphalt.. Scene role: Dividing the traffic lanes down the center of the street.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "blooming_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_26/283991,91fa000cda68156.jpg:object:5", + "source_name": "blooming trees", + "source_description": "Trees with pink blossoms in the background park area. Source dataset: CrowdHuman. Scene context: People are walking along a sidewalk next to a street lined with trees, some in bloom, with a police officer standing near a parked car.", + "sub_caption": "blooming trees: Trees featuring vibrant pink blossoms.. Scene role: Lining the sidewalk and park area on the right side of the street.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "green_street_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", + "source_name": "street sign", + "source_description": "A green street sign with white text visible on the left side. Source dataset: CrowdHuman. Scene context: Several pedestrians are crossing a street at a crosswalk on a sunny day.", + "sub_caption": "street sign: A green rectangular street sign.. Scene role: Mounted on a pole on the left side of the street near the crosswalk.", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000006/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references.json new file mode 100644 index 0000000000000000000000000000000000000000..4511af79e7b48537b81bfe4cf0ca37469a641bdd --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references.json @@ -0,0 +1,261 @@ +{ + "references": [ + { + "name": "pedestrian", + "ref_image": "references/ref_pedestrian.png", + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_pedestrian_attempt_01.png", + "output": "references/ref_pedestrian.png", + "mask": "references/sam_mask_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 353.0, + 44.0, + 672.0, + 1013.0 + ], + "mask_score": 3.456561, + "mask_area_ratio": 0.147466, + "elapsed_seconds": 8.2841 + }, + "reference_verify": "references/reference_verify_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "emergency_vehicle", + "ref_image": "references/ref_emergency_vehicle.png", + "raw_ref_image": "references/raw_ref_emergency_vehicle_attempt_01.png", + "diversify_input": "crops/diversify_input_emergency_vehicle.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_emergency_vehicle_attempt_01.png", + "output": "references/ref_emergency_vehicle.png", + "mask": "references/sam_mask_emergency_vehicle.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 261.0, + 1023.0, + 782.0 + ], + "mask_score": 3.339466, + "mask_area_ratio": 0.300308, + "elapsed_seconds": 8.2719 + }, + "reference_verify": "references/reference_verify_emergency_vehicle.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "curbside_trash_can", + "ref_image": "references/ref_curbside_trash_can.png", + "raw_ref_image": "references/raw_ref_curbside_trash_can_attempt_01.png", + "diversify_input": "crops/diversify_input_curbside_trash_can.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_curbside_trash_can_attempt_01.png", + "output": "references/ref_curbside_trash_can.png", + "mask": "references/sam_mask_curbside_trash_can.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 175.0, + 42.0, + 837.0, + 982.0 + ], + "mask_score": 3.480803, + "mask_area_ratio": 0.406976, + "elapsed_seconds": 8.7724 + }, + "reference_verify": "references/reference_verify_curbside_trash_can.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "museum_banner", + "ref_image": "references/ref_museum_banner.png", + "raw_ref_image": "references/raw_ref_museum_banner_attempt_01.png", + "diversify_input": "crops/diversify_input_museum_banner.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_museum_banner_attempt_01.png", + "output": "references/ref_museum_banner.png", + "mask": "references/sam_mask_museum_banner.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 317.0, + 20.0, + 703.0, + 981.0 + ], + "mask_score": 3.453619, + "mask_area_ratio": 0.268547, + "elapsed_seconds": 8.1747 + }, + "reference_verify": "references/reference_verify_museum_banner.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "white_panel_van", + "ref_image": "references/ref_white_panel_van.png", + "raw_ref_image": "references/raw_ref_white_panel_van_attempt_01.png", + "diversify_input": "crops/diversify_input_white_panel_van.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_white_panel_van_attempt_01.png", + "output": "references/ref_white_panel_van.png", + "mask": "references/sam_mask_white_panel_van.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 102.0, + 91.0, + 877.0, + 937.0 + ], + "mask_score": 3.457159, + "mask_area_ratio": 0.429852, + "elapsed_seconds": 10.1474 + }, + "reference_verify": "references/reference_verify_white_panel_van.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "double_solid_white_line", + "ref_image": "references/ref_double_solid_white_line.png", + "raw_ref_image": "references/raw_ref_double_solid_white_line_attempt_01.png", + "diversify_input": "crops/diversify_input_double_solid_white_line.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_double_solid_white_line_attempt_01.png", + "output": "references/ref_double_solid_white_line.png", + "mask": "references/sam_mask_double_solid_white_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 229.0, + 72.0, + 795.0, + 951.0 + ], + "mask_score": 3.470715, + "mask_area_ratio": 0.39155, + "elapsed_seconds": 9.6388 + }, + "reference_verify": "references/reference_verify_double_solid_white_line.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "blooming_trees", + "ref_image": "references/ref_blooming_trees.png", + "raw_ref_image": "references/raw_ref_blooming_trees_attempt_01.png", + "diversify_input": "crops/diversify_input_blooming_trees.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_blooming_trees_attempt_01.png", + "output": "references/ref_blooming_trees.png", + "mask": "references/sam_mask_blooming_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 93.0, + 1023.0, + 967.0 + ], + "mask_score": 3.479366, + "mask_area_ratio": 0.423422, + "elapsed_seconds": 8.1597 + }, + "reference_verify": "references/reference_verify_blooming_trees.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "green_street_sign", + "ref_image": "references/ref_green_street_sign.png", + "raw_ref_image": "references/raw_ref_green_street_sign_attempt_01.png", + "diversify_input": "crops/diversify_input_green_street_sign.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_green_street_sign_attempt_01.png", + "output": "references/ref_green_street_sign.png", + "mask": "references/sam_mask_green_street_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 26.0, + 378.0, + 998.0, + 645.0 + ], + "mask_score": 3.473778, + "mask_area_ratio": 0.191363, + "elapsed_seconds": 9.6707 + }, + "reference_verify": "references/reference_verify_green_street_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_blooming_trees.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_blooming_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..52506e9076fe98888e7fac656ed2192d44fd6ef2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_blooming_trees.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23942453edbd4cd57d36e4a9da3e727e1f252464a4f9bf45fd6af265a3d48a74 +size 1173056 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_curbside_trash_can.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_curbside_trash_can.png new file mode 100644 index 0000000000000000000000000000000000000000..132f085082eb6250ad6f75fe58dd62906bb990f3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_curbside_trash_can.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b392558cdfd5c7347af0dbb1f0791df792b9beb6d17a4034c17af7c025736b8 +size 625599 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_double_solid_white_line.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_double_solid_white_line.png new file mode 100644 index 0000000000000000000000000000000000000000..ac2253c40d46cdad987def4b3c93427d75a55684 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_double_solid_white_line.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b63a7985c36025b08ef882eb8fbbc040d7d345e09fc4bb8e1c153eea5dadf33c +size 971366 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_emergency_vehicle.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_emergency_vehicle.png new file mode 100644 index 0000000000000000000000000000000000000000..fc2e9da955b47abeb683bc469587f1c6206eb4fc --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_emergency_vehicle.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb00c2d57fbd82aa11df107cf816ba6d5d754ee353f5608da8d90af6df2cc99e +size 578601 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_green_street_sign.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_green_street_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..dc4b3c0040596254f81def55f9848a3621f31fad --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_green_street_sign.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:377bff65dcc95ec38c62db1d69ce68d4652c0d76dea6ebe480c3bb33e924ae6c +size 185759 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_museum_banner.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_museum_banner.png new file mode 100644 index 0000000000000000000000000000000000000000..89a4458356650df479e8b7885b5fa0009a321c38 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_museum_banner.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb161833c72ad114317db238d2086aa4731961303a9b53f62ea9346cc8ffb150 +size 469254 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_pedestrian.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..0a2eccfab780d4bd02ac138aad5f0d00804e7e89 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_pedestrian.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c70105044394983b459503ee5bfdecf3fbd63c48d4ef0c382279805594d68504 +size 304942 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_white_panel_van.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_white_panel_van.png new file mode 100644 index 0000000000000000000000000000000000000000..32d2b14e0ae5bff02ebd869c29f754c47424636d --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_white_panel_van.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a6156b83277b19e471f92a15475cf020b53c90c49b8d52508539023034fa8aa +size 640865 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_blooming_trees.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_blooming_trees.json new file mode 100644 index 0000000000000000000000000000000000000000..b23bd8b31e5dcefeee6b485c3e5dfd49ae0f1359 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_blooming_trees.json @@ -0,0 +1,46 @@ +{ + "name": "blooming_trees", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_blooming_trees_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_blooming_trees_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_blooming_trees_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_blooming_trees_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_blooming_trees_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_blooming_trees_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 93.0, + 1023.0, + 967.0 + ], + "mask_score": 3.479366, + "mask_area_ratio": 0.423422, + "elapsed_seconds": 8.1597 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Isolated blooming trees with pink blossoms on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_curbside_trash_can.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_curbside_trash_can.json new file mode 100644 index 0000000000000000000000000000000000000000..614d66d11add280831abe321546c06a0ee71bb12 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_curbside_trash_can.json @@ -0,0 +1,46 @@ +{ + "name": "curbside_trash_can", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_curbside_trash_can_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_curbside_trash_can_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_curbside_trash_can_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_curbside_trash_can_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_curbside_trash_can_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_curbside_trash_can_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 175.0, + 42.0, + 837.0, + 982.0 + ], + "mask_score": 3.480803, + "mask_area_ratio": 0.406976, + "elapsed_seconds": 8.7724 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image perfectly depicts an isolated dark blue trash can against a white background, satisfying all requirements for an object reference." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_double_solid_white_line.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_double_solid_white_line.json new file mode 100644 index 0000000000000000000000000000000000000000..51f1e45123ab1e3a8ee2546f91ee385935ea27a1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_double_solid_white_line.json @@ -0,0 +1,46 @@ +{ + "name": "double_solid_white_line", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_double_solid_white_line_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_double_solid_white_line_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_double_solid_white_line_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_double_solid_white_line_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_double_solid_white_line_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_double_solid_white_line_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 229.0, + 72.0, + 795.0, + 951.0 + ], + "mask_score": 3.470715, + "mask_area_ratio": 0.39155, + "elapsed_seconds": 9.6388 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The double solid white lines are clearly visible on the dark asphalt, functioning as a representative crop for this continuous surface feature." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_emergency_vehicle.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_emergency_vehicle.json new file mode 100644 index 0000000000000000000000000000000000000000..6396b0cb564d62049f3c0a4267494ce501f6f123 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_emergency_vehicle.json @@ -0,0 +1,46 @@ +{ + "name": "emergency_vehicle", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_emergency_vehicle_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_emergency_vehicle_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_emergency_vehicle_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_emergency_vehicle_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_emergency_vehicle_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_emergency_vehicle_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 261.0, + 1023.0, + 782.0 + ], + "mask_score": 3.339466, + "mask_area_ratio": 0.300308, + "elapsed_seconds": 8.2719 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete, isolated emergency vehicle on a white background, which is perfectly acceptable as a dataset reference." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_green_street_sign.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_green_street_sign.json new file mode 100644 index 0000000000000000000000000000000000000000..f8edd812661f997daa7aaa1be069d191ae03a1bf --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_green_street_sign.json @@ -0,0 +1,46 @@ +{ + "name": "green_street_sign", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_green_street_sign_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_green_street_sign_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_green_street_sign_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_green_street_sign_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_green_street_sign_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_green_street_sign_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 26.0, + 378.0, + 998.0, + 645.0 + ], + "mask_score": 3.473778, + "mask_area_ratio": 0.191363, + "elapsed_seconds": 9.6707 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image clearly shows a green rectangular street sign isolated on a white background. It satisfies all hard requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_museum_banner.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_museum_banner.json new file mode 100644 index 0000000000000000000000000000000000000000..d5a4f370479175618a1e20c29b0471c551d76a4a --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_museum_banner.json @@ -0,0 +1,46 @@ +{ + "name": "museum_banner", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_museum_banner_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_museum_banner_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_museum_banner_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_museum_banner_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_museum_banner_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_museum_banner_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 317.0, + 20.0, + 703.0, + 981.0 + ], + "mask_score": 3.453619, + "mask_area_ratio": 0.268547, + "elapsed_seconds": 8.1747 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The exhibition banner is fully visible, isolated on a white background, and not cropped." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_pedestrian.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_pedestrian.json new file mode 100644 index 0000000000000000000000000000000000000000..314798539a1383f1e107e9ff6d17020aed8cee7c --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_pedestrian.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_pedestrian_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_pedestrian_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_pedestrian_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 353.0, + 44.0, + 672.0, + 1013.0 + ], + "mask_score": 3.456561, + "mask_area_ratio": 0.147466, + "elapsed_seconds": 8.2841 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a person on a white background with no cropping or truncation. All hard requirements are satisfied." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_white_panel_van.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_white_panel_van.json new file mode 100644 index 0000000000000000000000000000000000000000..ccc7bd7fd5babac055de28470b7ef0b0def67825 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/reference_verify_white_panel_van.json @@ -0,0 +1,46 @@ +{ + "name": "white_panel_van", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_white_panel_van_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_white_panel_van_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_white_panel_van_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_white_panel_van_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_ref_white_panel_van_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/candidate_sam_mask_white_panel_van_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 102.0, + 91.0, + 877.0, + 937.0 + ], + "mask_score": 3.457159, + "mask_area_ratio": 0.429852, + "elapsed_seconds": 10.1474 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The white panel van is fully visible, centered, and isolated on a white background without any cropping." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_blooming_trees.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_blooming_trees.png new file mode 100644 index 0000000000000000000000000000000000000000..b8858df83a826a6e849f41c825bda4a8832d3ada Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_blooming_trees.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_curbside_trash_can.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_curbside_trash_can.png new file mode 100644 index 0000000000000000000000000000000000000000..12c22f34e2b68056eddce69b080357572df22bc1 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_curbside_trash_can.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_double_solid_white_line.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_double_solid_white_line.png new file mode 100644 index 0000000000000000000000000000000000000000..724684339a9caec222e741b9d08ad4c9b64a8684 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_double_solid_white_line.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_emergency_vehicle.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_emergency_vehicle.png new file mode 100644 index 0000000000000000000000000000000000000000..b830eae7cd77920178749f3ec9e91bd75f1624f3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_emergency_vehicle.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_green_street_sign.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_green_street_sign.png new file mode 100644 index 0000000000000000000000000000000000000000..709089270e42da97e0f26e3ae19178df5cfbbed3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_green_street_sign.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_museum_banner.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_museum_banner.png new file mode 100644 index 0000000000000000000000000000000000000000..1f54e8b83e3f6d25a4fe65b9120748910c94d0c6 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_museum_banner.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_pedestrian.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..6d3197a725bc43e71037bbee806d4229d572ddee Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_white_panel_van.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_white_panel_van.png new file mode 100644 index 0000000000000000000000000000000000000000..e81a4178d59f5f34c7e50d160ed29a487399c74a Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/sam_mask_white_panel_van.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/row.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/row.json new file mode 100644 index 0000000000000000000000000000000000000000..72dab8fc148337e1f8ef4b7864be488bc394f817 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/row.json @@ -0,0 +1,394 @@ +{ + "sample_id": "sample_000006", + "target_total": 8, + "target_people": 1, + "target_objects": 7, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 8, + "n_detected": 8, + "n_subjects": 8, + "subjects": [ + { + "name": "pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", + "source_name": "pedestrian", + "source_description": "Another person in the distant background near the green structure. Source dataset: CrowdHuman. Scene context: A large crowd of people walking across a street with trees, streetlamps, and classic architecture in the background.", + "sub_caption": "pedestrian: A person walking across the street in the distance.. Scene role: Crossing the street near the left background.", + "measured_bbox": [ + 0.2151, + 0.4819, + 0.2507, + 0.5947 + ], + "detection_confidence": "high", + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian.png", + "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_pedestrian_attempt_01.png", + "output": "references/ref_pedestrian.png", + "mask": "references/sam_mask_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 353.0, + 44.0, + 672.0, + 1013.0 + ], + "mask_score": 3.456561, + "mask_area_ratio": 0.147466, + "elapsed_seconds": 8.2841 + } + }, + { + "name": "emergency_vehicle", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b99f250d-886111c5:object:5", + "source_name": "vehicle", + "source_description": "A dark-colored vehicle partially visible in the left background with blue emergency lights flashing. Source dataset: BDD100K. Scene context: A nighttime city street intersection showing a crosswalk, construction barriers, and illuminated traffic signals.", + "sub_caption": "vehicle: A dark-colored vehicle with blue emergency lights flashing.. Scene role: Stopped or parked in the distant left background.", + "measured_bbox": [ + 0.325, + 0.4787, + 0.3786, + 0.5486 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_emergency_vehicle.png", + "raw_ref_image": "references/raw_ref_emergency_vehicle_attempt_01.png", + "reference_verify": "references/reference_verify_emergency_vehicle.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_emergency_vehicle_attempt_01.png", + "output": "references/ref_emergency_vehicle.png", + "mask": "references/sam_mask_emergency_vehicle.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 261.0, + 1023.0, + 782.0 + ], + "mask_score": 3.339466, + "mask_area_ratio": 0.300308, + "elapsed_seconds": 8.2719 + } + }, + { + "name": "curbside_trash_can", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_10/273275,5da3d000f5358c3c.jpg:object:9", + "source_name": "trash can", + "source_description": "dark blue cylindrical bin partially visible in the foreground Source dataset: CrowdHuman. Scene context: A group of children and an adult pose for a photo in front of a roller coaster at an amusement park.", + "sub_caption": "trash can: A dark blue cylindrical bin.. Scene role: Placed on the sidewalk curb in the lower right foreground.", + "measured_bbox": [ + 0.8371, + 0.5448, + 0.9204, + 0.7599 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_curbside_trash_can.png", + "raw_ref_image": "references/raw_ref_curbside_trash_can_attempt_01.png", + "reference_verify": "references/reference_verify_curbside_trash_can.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_curbside_trash_can_attempt_01.png", + "output": "references/ref_curbside_trash_can.png", + "mask": "references/sam_mask_curbside_trash_can.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 175.0, + 42.0, + 837.0, + 982.0 + ], + "mask_score": 3.480803, + "mask_area_ratio": 0.406976, + "elapsed_seconds": 8.7724 + } + }, + { + "name": "museum_banner", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_24/282555,14d900042c7b9a4.jpg:object:3", + "source_name": "exhibition banner", + "source_description": "A large, rectangular banner hanging on the building's facade, featuring a maroon background and text. Source dataset: CrowdHuman. Scene context: A large, classical building, likely a museum, with many people sitting and standing on its wide front steps, while a yellow taxi speeds past in the foreground.", + "sub_caption": "exhibition banner: A large, rectangular maroon banner hanging on a building's facade.. Scene role: Hanging from the classical architecture on the right side of the street.", + "measured_bbox": [ + 0.8013, + 0.0164, + 0.8543, + 0.2771 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_museum_banner.png", + "raw_ref_image": "references/raw_ref_museum_banner_attempt_01.png", + "reference_verify": "references/reference_verify_museum_banner.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_museum_banner_attempt_01.png", + "output": "references/ref_museum_banner.png", + "mask": "references/sam_mask_museum_banner.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 317.0, + 20.0, + 703.0, + 981.0 + ], + "mask_score": 3.453619, + "mask_area_ratio": 0.268547, + "elapsed_seconds": 8.1747 + } + }, + { + "name": "white_panel_van", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b5047c50-e1facff6:object:2", + "source_name": "white van", + "source_description": "A large white panel van with red taillights illuminated, driving in the right lane ahead. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a multi-lane city street on a sunny day with moderate traffic.", + "sub_caption": "white van: A large white panel van with illuminated red taillights.. Scene role: Driving ahead in the right lane of the road.", + "measured_bbox": [ + 0.5122, + 0.3306, + 0.6867, + 0.6936 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_panel_van.png", + "raw_ref_image": "references/raw_ref_white_panel_van_attempt_01.png", + "reference_verify": "references/reference_verify_white_panel_van.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_white_panel_van_attempt_01.png", + "output": "references/ref_white_panel_van.png", + "mask": "references/sam_mask_white_panel_van.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 102.0, + 91.0, + 877.0, + 937.0 + ], + "mask_score": 3.457159, + "mask_area_ratio": 0.429852, + "elapsed_seconds": 10.1474 + } + }, + { + "name": "double_solid_white_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bc886d37-5b22c313:object:7", + "source_name": "double solid white line", + "source_description": "Two continuous white painted lines on the dark asphalt road surface, separating the two lanes of traffic. Source dataset: BDD100K. Scene context: A view from inside a car driving through a brightly lit tunnel with tiled walls, following a silver SUV and a dark compact car.", + "sub_caption": "double solid white line: Two continuous white painted lines on dark asphalt.. Scene role: Dividing the traffic lanes down the center of the street.", + "measured_bbox": [ + 0.2383, + 0.6095, + 0.4221, + 0.8925 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_double_solid_white_line.png", + "raw_ref_image": "references/raw_ref_double_solid_white_line_attempt_01.png", + "reference_verify": "references/reference_verify_double_solid_white_line.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_double_solid_white_line_attempt_01.png", + "output": "references/ref_double_solid_white_line.png", + "mask": "references/sam_mask_double_solid_white_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 229.0, + 72.0, + 795.0, + 951.0 + ], + "mask_score": 3.470715, + "mask_area_ratio": 0.39155, + "elapsed_seconds": 9.6388 + } + }, + { + "name": "blooming_trees", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_26/283991,91fa000cda68156.jpg:object:5", + "source_name": "blooming trees", + "source_description": "Trees with pink blossoms in the background park area. Source dataset: CrowdHuman. Scene context: People are walking along a sidewalk next to a street lined with trees, some in bloom, with a police officer standing near a parked car.", + "sub_caption": "blooming trees: Trees featuring vibrant pink blossoms.. Scene role: Lining the sidewalk and park area on the right side of the street.", + "measured_bbox": [ + 0.6279, + 0.2153, + 0.8163, + 0.5163 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_blooming_trees.png", + "raw_ref_image": "references/raw_ref_blooming_trees_attempt_01.png", + "reference_verify": "references/reference_verify_blooming_trees.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_blooming_trees_attempt_01.png", + "output": "references/ref_blooming_trees.png", + "mask": "references/sam_mask_blooming_trees.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 93.0, + 1023.0, + 967.0 + ], + "mask_score": 3.479366, + "mask_area_ratio": 0.423422, + "elapsed_seconds": 8.1597 + } + }, + { + "name": "green_street_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", + "source_name": "street sign", + "source_description": "A green street sign with white text visible on the left side. Source dataset: CrowdHuman. Scene context: Several pedestrians are crossing a street at a crosswalk on a sunny day.", + "sub_caption": "street sign: A green rectangular street sign.. Scene role: Mounted on a pole on the left side of the street near the crosswalk.", + "measured_bbox": [ + 0.1915, + 0.2239, + 0.2775, + 0.2533 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_green_street_sign.png", + "raw_ref_image": "references/raw_ref_green_street_sign_attempt_01.png", + "reference_verify": "references/reference_verify_green_street_sign.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000006/references/raw_ref_green_street_sign_attempt_01.png", + "output": "references/ref_green_street_sign.png", + "mask": "references/sam_mask_green_street_sign.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 26.0, + 378.0, + 998.0, + 645.0 + ], + "mask_score": 3.473778, + "mask_area_ratio": 0.191363, + "elapsed_seconds": 9.6707 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/vocab_task.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..6de0c11a11255372341292b7d0426466df93e178 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/vocab_task.json @@ -0,0 +1,126 @@ +{ + "task_id": "sample_000006", + "sample_id": "sample_000006", + "sample_index": 6, + "target_total": 8, + "target_people": 1, + "target_objects": 7, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 110679, + "image_id": "CrowdHuman:data/data_47/273278,7956e0000b6bb646.jpg:person:10", + "name": "female athlete", + "description": "Sitting in the front middle, wearing a light blue t-shirt and shorts, with a soccer ball at her feet. Source dataset: CrowdHuman. Scene context: A group portrait of college athletes in uniform, posed against a backdrop of a city skyline at dusk." + }, + { + "candidate_index": 1, + "source_offset": 186464, + "image_id": "CrowdHuman:data/data_73/283991,17cd800008079067.jpg:person:18", + "name": "pedestrian", + "description": "Another person in the distant background near the green structure. Source dataset: CrowdHuman. Scene context: A large crowd of people walking across a street with trees, streetlamps, and classic architecture in the background." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 174964, + "image_id": "BDD100K:bb1b7e42-9608265e:object:6", + "name": "street sign", + "description": "A vertical 'PARK' sign illuminated on the right side of the street, indicating a parking garage. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a city street with tall buildings on both sides, following a yellow taxi, with other cars parked and driving." + }, + { + "candidate_index": 1, + "source_offset": 168910, + "image_id": "BDD100K:b99f250d-886111c5:object:5", + "name": "vehicle", + "description": "A dark-colored vehicle partially visible in the left background with blue emergency lights flashing. Source dataset: BDD100K. Scene context: A nighttime city street intersection showing a crosswalk, construction barriers, and illuminated traffic signals." + }, + { + "candidate_index": 2, + "source_offset": 3756, + "image_id": "CrowdHuman:data/data_10/273275,5da3d000f5358c3c.jpg:object:9", + "name": "trash can", + "description": "dark blue cylindrical bin partially visible in the foreground Source dataset: CrowdHuman. Scene context: A group of children and an adult pose for a photo in front of a roller coaster at an amusement park." + }, + { + "candidate_index": 3, + "source_offset": 31444, + "image_id": "CrowdHuman:data/data_24/282555,14d900042c7b9a4.jpg:object:3", + "name": "exhibition banner", + "description": "A large, rectangular banner hanging on the building's facade, featuring a maroon background and text. Source dataset: CrowdHuman. Scene context: A large, classical building, likely a museum, with many people sitting and standing on its wide front steps, while a yellow taxi speeds past in the foreground." + }, + { + "candidate_index": 4, + "source_offset": 61592, + "image_id": "CrowdHuman:data/data_4/283991,1ec000a212ec26.jpg:object:6", + "name": "stall sign", + "description": "A bright yellow sign with red text and a drawing of a face. Source dataset: CrowdHuman. Scene context: A bustling night market scene with people walking and looking at stalls, with parked motor scooters in the foreground." + }, + { + "candidate_index": 5, + "source_offset": 148584, + "image_id": "BDD100K:b5047c50-e1facff6:object:2", + "name": "white van", + "description": "A large white panel van with red taillights illuminated, driving in the right lane ahead. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a multi-lane city street on a sunny day with moderate traffic." + }, + { + "candidate_index": 6, + "source_offset": 181315, + "image_id": "BDD100K:bc886d37-5b22c313:object:7", + "name": "double solid white line", + "description": "Two continuous white painted lines on the dark asphalt road surface, separating the two lanes of traffic. Source dataset: BDD100K. Scene context: A view from inside a car driving through a brightly lit tunnel with tiled walls, following a silver SUV and a dark compact car." + }, + { + "candidate_index": 7, + "source_offset": 236118, + "image_id": "BDD100K:c807cb19-7e09cb11:object:8", + "name": "building facade", + "description": "Dark outlines of buildings lining the street on both sides, with some lit windows. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane city street with traffic lights and vehicles ahead." + }, + { + "candidate_index": 8, + "source_offset": 34903, + "image_id": "CrowdHuman:data/data_26/283991,91fa000cda68156.jpg:object:5", + "name": "blooming trees", + "description": "Trees with pink blossoms in the background park area. Source dataset: CrowdHuman. Scene context: People are walking along a sidewalk next to a street lined with trees, some in bloom, with a police officer standing near a parked car." + }, + { + "candidate_index": 9, + "source_offset": 69573, + "image_id": "CrowdHuman:data/data_44/273278,231ab000d9efcb71.jpg:object:1", + "name": "bench", + "description": "A low, dark rectangular seating structure on the floor. Source dataset: CrowdHuman. Scene context: A grand, high-ceilinged indoor train station concourse with a polished tile floor reflecting overhead lights, lined with various small shops, kiosks, and passing pedestrians." + }, + { + "candidate_index": 10, + "source_offset": 198849, + "image_id": "BDD100K:c06d23aa-cb9ae751:object:6", + "name": "building corner", + "description": "The corner of a building on the right side, with an orange or red awning and some lit signs. Source dataset: BDD100K. Scene context: Nighttime driving scene at an intersection with a stop sign and a large black SUV passing on the right." + }, + { + "candidate_index": 11, + "source_offset": 25812, + "image_id": "CrowdHuman:data/data_21/282555,93df2000dd2b5468.jpg:object:5", + "name": "gymnastics floor", + "description": "A light beige, flat, smooth mat covering the competition area. Source dataset: CrowdHuman. Scene context: Five rhythmic gymnasts in matching maroon and pink leotards pose on the floor, holding ribbons and balls, with spectators and a colorful background in the distance." + }, + { + "candidate_index": 12, + "source_offset": 82290, + "image_id": "CrowdHuman:data/data_50/273278,febe100057ca94db.jpg:object:5", + "name": "street sign", + "description": "A green street sign with white text visible on the left side. Source dataset: CrowdHuman. Scene context: Several pedestrians are crossing a street at a crosswalk on a sunny day." + }, + { + "candidate_index": 13, + "source_offset": 125175, + "image_id": "CrowdHuman:data/data_72/282555,6c6850003beacb74.jpg:object:1", + "name": "paved ground", + "description": "The ground surface made of light-colored, irregularly shaped flat stones or concrete pieces. Source dataset: CrowdHuman. Scene context: A large group of young adults is posing for a group photo in an outdoor paved area, with a massive, intricate steel stadium structure in the background." + } + ], + "rng_seed": 1782556367, + "created_at": 1782223460.3233705 +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/compose_prompt.txt b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b2c7b18b9dad4e2de1e061f3635f381b48b264f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/compose_prompt.txt @@ -0,0 +1,151 @@ +Render the following JSON scene specification as a photorealistic 1280x720 image using a true 16:9 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "An urban city street at early evening, viewed from the interior of a driving car.", + "activity": "Vehicles navigate the bustling street while a pedestrian waits at a marked crosswalk under city lights.", + "composition": "First-person dashcam perspective with the dashboard anchoring the bottom edge. The street leads deep into the frame toward a distant skyline, flanked by tall brick buildings, balconies, and illuminated signs. Cars are distributed in moving and parked lanes, with overhead wires and street poles adding vertical structure.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "woman_in_dress", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_23/284193,226b7000e784dddb.jpg:person:7", + "source_name": "bridesmaid", + "description": "A woman with dark hair wearing a dark knee-length dress.", + "role_in_scene": "Waiting on the sidewalk edge near the crosswalk as a pedestrian." + } + ], + "objects": [ + { + "name": "dashboard", + "source_index": 0, + "source_image_id": "BDD100K:c5e32cf6-89b8fb88:object:10", + "source_name": "dashboard", + "description": "The dashboard of the camera vehicle.", + "role_in_scene": "Visible along the very bottom of the frame, establishing the point of view from inside a car." + }, + { + "name": "overhead_wires", + "source_index": 2, + "source_image_id": "BDD100K:bc7caf3c-da14eed9:object:11", + "source_name": "overhead wires", + "description": "Power and communication lines stretching across the sky.", + "role_in_scene": "Crisscrossing the sky above the urban street." + }, + { + "name": "multi_story_building_left", + "source_index": 12, + "source_image_id": "BDD100K:b9df54a4-91295fbc:object:10", + "source_name": "building on left", + "description": "Multi-story brick buildings with numerous windows and fire escapes.", + "role_in_scene": "Forming the architectural boundary on the left side of the street." + }, + { + "name": "street_light_pole", + "source_index": 13, + "source_image_id": "BDD100K:c41585dc-6fe06ca1:object:5", + "source_name": "street light pole", + "description": "A tall, curved metal street light pole.", + "role_in_scene": "Positioned on the sidewalk, arching over the roadway to illuminate the street." + }, + { + "name": "illuminated_store_sign", + "source_index": 14, + "source_image_id": "CrowdHuman:data/data_47/273278,13690500030bdbb93.jpg:object:1", + "source_name": "illuminated store sign", + "description": "Vertical, brightly lit glowing signs in various colors.", + "role_in_scene": "Attached to the building facades, adding colorful ambient light to the streetscape." + }, + { + "name": "white_sedan", + "source_index": 16, + "source_image_id": "BDD100K:c754ce77-a105a975:object:3", + "source_name": "white sedan", + "description": "A white passenger car.", + "role_in_scene": "Driving in the adjacent lane ahead of the camera vehicle." + }, + { + "name": "dark_moving_car", + "source_index": 18, + "source_image_id": "BDD100K:bbfcd002-f8531a65:object:1", + "source_name": "dark car", + "description": "A dark-colored sedan.", + "role_in_scene": "Driving in the opposite direction on the left side of the street." + }, + { + "name": "distant_skyline", + "source_index": 19, + "source_image_id": "BDD100K:b61f19ba-2f34ba9f:object:17", + "source_name": "skyline", + "description": "Silhouettes of tall buildings visible in the distance.", + "role_in_scene": "Appearing far down the vanishing point of the road against the evening sky." + }, + { + "name": "dark_parked_car", + "source_index": 20, + "source_image_id": "BDD100K:baee6fb9-f28ac93d:object:12", + "source_name": "dark car 2", + "description": "A dark-colored car parked alongside the curb.", + "role_in_scene": "Parked on the right side of the street near the buildings." + }, + { + "name": "red_brick_building", + "source_index": 22, + "source_image_id": "BDD100K:c2186a76-5444a563:object:5", + "source_name": "brick building", + "description": "A tall, multi-story red brick building featuring arched windows.", + "role_in_scene": "Anchoring the urban architecture on the right side of the street." + }, + { + "name": "metal_sidewalk_pole", + "source_index": 23, + "source_image_id": "BDD100K:c411687d-73471431:object:14", + "source_name": "pole", + "description": "A thin, straight metal pole standing upright.", + "role_in_scene": "Located on the sidewalk near the crosswalk area." + }, + { + "name": "crosswalk_lines", + "source_index": 24, + "source_image_id": "BDD100K:ba8b1e05-8ec0219a:object:5", + "source_name": "crosswalk markings", + "description": "White painted lines on the road surface indicating a pedestrian crosswalk.", + "role_in_scene": "Painted across the street asphalt directly ahead of the camera vehicle." + }, + { + "name": "building_balcony", + "source_index": 25, + "source_image_id": "CrowdHuman:data/data_55/283991,169b7000c139b5de.jpg:object:6", + "source_name": "balcony", + "description": "A dark, wrought-iron balcony.", + "role_in_scene": "Protruding from one of the upper floor windows of the right-side brick building." + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_building_balcony.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_building_balcony.png new file mode 100644 index 0000000000000000000000000000000000000000..406cba65dfe20a0376770b5d86d0bf6b65d7cb49 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_building_balcony.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_crosswalk_lines.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_crosswalk_lines.png new file mode 100644 index 0000000000000000000000000000000000000000..a4fd169b56207eb3bbc30da2eed2fb15677203ee Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_crosswalk_lines.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_dark_moving_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_dark_moving_car.png new file mode 100644 index 0000000000000000000000000000000000000000..0db2f3a19df8876903f599496f5fdd89bd8e18ee Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_dark_moving_car.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_dark_parked_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_dark_parked_car.png new file mode 100644 index 0000000000000000000000000000000000000000..a8364e6bf07e7d60bb0eaad03c17911d437b67d4 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_dark_parked_car.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_dashboard.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..7c6ba627efbbea1fa5783b7518beb28331310558 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_dashboard.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc557396a82f8d3d92589aef30f7a5faf07d48dc0ee707b9dc4ed9b74d693d5 +size 237720 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_distant_skyline.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_distant_skyline.png new file mode 100644 index 0000000000000000000000000000000000000000..188c745fac5be069621493e5e507582af281867f Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_distant_skyline.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_metal_sidewalk_pole.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_metal_sidewalk_pole.png new file mode 100644 index 0000000000000000000000000000000000000000..c0fa33701aa65fe296fa6e6c1ddd99d9a88c6ac5 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_metal_sidewalk_pole.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_multi_story_building_left.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_multi_story_building_left.png new file mode 100644 index 0000000000000000000000000000000000000000..14fb288a55c3372037697c1862a9675dfc449e44 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_multi_story_building_left.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1438551d523fc9a637de4d781fc3763e0fcaafab66f3ccde14e146a996e9b644 +size 566500 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_overhead_wires.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_overhead_wires.png new file mode 100644 index 0000000000000000000000000000000000000000..210468db4db2eabd65a8327a549c05126a83ba84 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_overhead_wires.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6c6f6223cd40e43e01cdc5485f9beefd837c8d9f3ffeedb95ae367d58375569 +size 545159 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_red_brick_building.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_red_brick_building.png new file mode 100644 index 0000000000000000000000000000000000000000..ff11d76de2b24bff8126b21150d06bd671b12b05 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_red_brick_building.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b985235dc290d6caec805052dc85bb0c897575127914c9451637f6302c8610e8 +size 510577 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_street_light_pole.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_street_light_pole.png new file mode 100644 index 0000000000000000000000000000000000000000..3e7b38fb2cec2b2cb4b3211aedfa49fe20dd44fd --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_street_light_pole.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f315d9f61687fdfd7ea72e7ca71b6e8092551227029e4f013a2f65a7b2059ef3 +size 218896 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_white_sedan.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_white_sedan.png new file mode 100644 index 0000000000000000000000000000000000000000..cf918be1387da1a445c76d73df0c6f7430c9b6a7 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_white_sedan.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_woman_in_dress.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_woman_in_dress.png new file mode 100644 index 0000000000000000000000000000000000000000..2d23f8d46bf5a8b98b2eb87a09ab07ccdd03f9e9 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_woman_in_dress.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/main_image.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..c37ff2d67dd0a6c1b9f00fd20af5a6b0ab5a364e --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fe027bf893aafd5d7c4468dd36ea40fb8c884bd9ba63fefcd2692f5c663bb2c +size 1554263 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/plan.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..f1adf2aad2657d5b01d076dbd99d510e11e7cf2b --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/plan.json @@ -0,0 +1,316 @@ +{ + "sample_id": "sample_000007", + "target_total": 14, + "target_people": 1, + "target_objects": 13, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "An urban city street at early evening, viewed from the interior of a driving car.", + "activity": "Vehicles navigate the bustling street while a pedestrian waits at a marked crosswalk under city lights.", + "composition": "First-person dashcam perspective with the dashboard anchoring the bottom edge. The street leads deep into the frame toward a distant skyline, flanked by tall brick buildings, balconies, and illuminated signs. Cars are distributed in moving and parked lanes, with overhead wires and street poles adding vertical structure.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "woman_in_dress", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_23/284193,226b7000e784dddb.jpg:person:7", + "source_name": "bridesmaid", + "description": "A woman with dark hair wearing a dark knee-length dress.", + "role_in_scene": "Waiting on the sidewalk edge near the crosswalk as a pedestrian." + } + ], + "objects": [ + { + "name": "dashboard", + "source_index": 0, + "source_image_id": "BDD100K:c5e32cf6-89b8fb88:object:10", + "source_name": "dashboard", + "description": "The dashboard of the camera vehicle.", + "role_in_scene": "Visible along the very bottom of the frame, establishing the point of view from inside a car." + }, + { + "name": "overhead_wires", + "source_index": 2, + "source_image_id": "BDD100K:bc7caf3c-da14eed9:object:11", + "source_name": "overhead wires", + "description": "Power and communication lines stretching across the sky.", + "role_in_scene": "Crisscrossing the sky above the urban street." + }, + { + "name": "multi_story_building_left", + "source_index": 12, + "source_image_id": "BDD100K:b9df54a4-91295fbc:object:10", + "source_name": "building on left", + "description": "Multi-story brick buildings with numerous windows and fire escapes.", + "role_in_scene": "Forming the architectural boundary on the left side of the street." + }, + { + "name": "street_light_pole", + "source_index": 13, + "source_image_id": "BDD100K:c41585dc-6fe06ca1:object:5", + "source_name": "street light pole", + "description": "A tall, curved metal street light pole.", + "role_in_scene": "Positioned on the sidewalk, arching over the roadway to illuminate the street." + }, + { + "name": "illuminated_store_sign", + "source_index": 14, + "source_image_id": "CrowdHuman:data/data_47/273278,13690500030bdbb93.jpg:object:1", + "source_name": "illuminated store sign", + "description": "Vertical, brightly lit glowing signs in various colors.", + "role_in_scene": "Attached to the building facades, adding colorful ambient light to the streetscape." + }, + { + "name": "white_sedan", + "source_index": 16, + "source_image_id": "BDD100K:c754ce77-a105a975:object:3", + "source_name": "white sedan", + "description": "A white passenger car.", + "role_in_scene": "Driving in the adjacent lane ahead of the camera vehicle." + }, + { + "name": "dark_moving_car", + "source_index": 18, + "source_image_id": "BDD100K:bbfcd002-f8531a65:object:1", + "source_name": "dark car", + "description": "A dark-colored sedan.", + "role_in_scene": "Driving in the opposite direction on the left side of the street." + }, + { + "name": "distant_skyline", + "source_index": 19, + "source_image_id": "BDD100K:b61f19ba-2f34ba9f:object:17", + "source_name": "skyline", + "description": "Silhouettes of tall buildings visible in the distance.", + "role_in_scene": "Appearing far down the vanishing point of the road against the evening sky." + }, + { + "name": "dark_parked_car", + "source_index": 20, + "source_image_id": "BDD100K:baee6fb9-f28ac93d:object:12", + "source_name": "dark car 2", + "description": "A dark-colored car parked alongside the curb.", + "role_in_scene": "Parked on the right side of the street near the buildings." + }, + { + "name": "red_brick_building", + "source_index": 22, + "source_image_id": "BDD100K:c2186a76-5444a563:object:5", + "source_name": "brick building", + "description": "A tall, multi-story red brick building featuring arched windows.", + "role_in_scene": "Anchoring the urban architecture on the right side of the street." + }, + { + "name": "metal_sidewalk_pole", + "source_index": 23, + "source_image_id": "BDD100K:c411687d-73471431:object:14", + "source_name": "pole", + "description": "A thin, straight metal pole standing upright.", + "role_in_scene": "Located on the sidewalk near the crosswalk area." + }, + { + "name": "crosswalk_lines", + "source_index": 24, + "source_image_id": "BDD100K:ba8b1e05-8ec0219a:object:5", + "source_name": "crosswalk markings", + "description": "White painted lines on the road surface indicating a pedestrian crosswalk.", + "role_in_scene": "Painted across the street asphalt directly ahead of the camera vehicle." + }, + { + "name": "building_balcony", + "source_index": 25, + "source_image_id": "CrowdHuman:data/data_55/283991,169b7000c139b5de.jpg:object:6", + "source_name": "balcony", + "description": "A dark, wrought-iron balcony.", + "role_in_scene": "Protruding from one of the upper floor windows of the right-side brick building." + } + ] + }, + "expected_subjects": [ + { + "name": "woman_in_dress", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_23/284193,226b7000e784dddb.jpg:person:7", + "source_name": "bridesmaid", + "source_description": "A woman with dark hair wearing a dark knee-length dress, walking along the path. Source dataset: CrowdHuman. Scene context: A bride in a white gown and her bridesmaids in dark dresses are walking along a paved path next to a building with stairs, surrounded by trees and a white fence in a sunlit outdoor setting.", + "sub_caption": "bridesmaid: A woman with dark hair wearing a dark knee-length dress.. Scene role: Waiting on the sidewalk edge near the crosswalk as a pedestrian.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c5e32cf6-89b8fb88:object:10", + "source_name": "dashboard", + "source_description": "The dashboard of the camera vehicle, visible at the bottom of the frame. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching a tunnel or underpass, with buildings on the left and a retaining wall on the right.", + "sub_caption": "dashboard: The dashboard of the camera vehicle.. Scene role: Visible along the very bottom of the frame, establishing the point of view from inside a car.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "overhead_wires", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bc7caf3c-da14eed9:object:11", + "source_name": "overhead wires", + "source_description": "Power and communication lines stretching across the sky above the street. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential street lined with parked cars and houses.", + "sub_caption": "overhead wires: Power and communication lines stretching across the sky.. Scene role: Crisscrossing the sky above the urban street.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "multi_story_building_left", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b9df54a4-91295fbc:object:10", + "source_name": "building on left", + "source_description": "Multi-story brick buildings with numerous windows and fire escapes on the left side. Source dataset: BDD100K. Scene context: A narrow city street lined with parked cars on both sides, with residential and commercial buildings featuring fire escapes and awnings, illuminated by sunlight filtering through mature trees.", + "sub_caption": "building on left: Multi-story brick buildings with numerous windows and fire escapes.. Scene role: Forming the architectural boundary on the left side of the street.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "street_light_pole", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c41585dc-6fe06ca1:object:5", + "source_name": "street light pole", + "source_description": "A tall, curved metal street light pole on the right side of the road, supporting the overhead sign. Source dataset: BDD100K. Scene context: A view from a car driving on a multi-lane road bordered by trees, with other vehicles and road signs visible.", + "sub_caption": "street light pole: A tall, curved metal street light pole.. Scene role: Positioned on the sidewalk, arching over the roadway to illuminate the street.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "illuminated_store_sign", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_47/273278,13690500030bdbb93.jpg:object:1", + "source_name": "illuminated store sign", + "source_description": "Vertical, brightly lit signs in various colors like red, blue, purple, and white, with Korean and English text attached to building facades. Source dataset: CrowdHuman. Scene context: A crowded, brightly lit shopping street at night filled with pedestrians and lined with numerous colorful illuminated store signs and street stalls.", + "sub_caption": "illuminated store sign: Vertical, brightly lit glowing signs in various colors.. Scene role: Attached to the building facades, adding colorful ambient light to the streetscape.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "white_sedan", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c754ce77-a105a975:object:3", + "source_name": "white sedan", + "source_description": "A white passenger car partially visible in the right lane next to the gold SUV. Source dataset: BDD100K. Scene context: View from inside a car driving in city traffic on a sunny day with multiple vehicles and urban infrastructure visible.", + "sub_caption": "white sedan: A white passenger car.. Scene role: Driving in the adjacent lane ahead of the camera vehicle.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "dark_moving_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bbfcd002-f8531a65:object:1", + "source_name": "dark car", + "source_description": "A dark-colored sedan visible on the left side of the street, partially obscured by rain. Source dataset: BDD100K. Scene context: View from inside a vehicle through a heavily rain-covered windshield, looking at city traffic and buildings.", + "sub_caption": "dark car: A dark-colored sedan.. Scene role: Driving in the opposite direction on the left side of the street.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "distant_skyline", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b61f19ba-2f34ba9f:object:17", + "source_name": "skyline", + "source_description": "Silhouettes of buildings visible in the distance against the twilight sky. Source dataset: BDD100K. Scene context: A street scene at dusk with cars stopped at a red traffic light, snow on the ground, and various commercial buildings alongside the road.", + "sub_caption": "skyline: Silhouettes of tall buildings visible in the distance.. Scene role: Appearing far down the vanishing point of the road against the evening sky.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "dark_parked_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:baee6fb9-f28ac93d:object:12", + "source_name": "dark car 2", + "source_description": "A dark-colored car parked on the right side of the street, ahead of the other dark car. Source dataset: BDD100K. Scene context: A dashcam view driving down a city street lined with parked cars on both sides and multi-story brick apartment buildings under a partly cloudy sky.", + "sub_caption": "dark car 2: A dark-colored car parked alongside the curb.. Scene role: Parked on the right side of the street near the buildings.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "red_brick_building", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c2186a76-5444a563:object:5", + "source_name": "brick building", + "source_description": "A tall, multi-story red brick building on the left side of the street, featuring arched windows and a storefront. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a city street lined with parked cars and multi-story brick buildings.", + "sub_caption": "brick building: A tall, multi-story red brick building featuring arched windows.. Scene role: Anchoring the urban architecture on the right side of the street.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "metal_sidewalk_pole", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c411687d-73471431:object:14", + "source_name": "pole", + "source_description": "A thin, straight metal pole standing upright on the sidewalk near the park area on the left. Source dataset: BDD100K. Scene context: A dashcam view looking down a slightly sloped residential city street with cars parked on both sides, trees bordering a park area to the left, and a tall building to the right, under a cloudy, overcast sky.", + "sub_caption": "pole: A thin, straight metal pole standing upright.. Scene role: Located on the sidewalk near the crosswalk area.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "crosswalk_lines", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:ba8b1e05-8ec0219a:object:5", + "source_name": "crosswalk markings", + "source_description": "White painted lines on the road surface indicating a pedestrian crosswalk. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with traffic lights and a crosswalk.", + "sub_caption": "crosswalk markings: White painted lines on the road surface indicating a pedestrian crosswalk.. Scene role: Painted across the street asphalt directly ahead of the camera vehicle.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "building_balcony", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_55/283991,169b7000c139b5de.jpg:object:6", + "source_name": "balcony", + "source_description": "A dark, wrought-iron balcony on a building. Source dataset: CrowdHuman. Scene context: People walk down a narrow, sunlit street lined with tall buildings.", + "sub_caption": "balcony: A dark, wrought-iron balcony.. Scene role: Protruding from one of the upper floor windows of the right-side brick building.", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000007/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/vocab_task.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..31df47a73c555ce81591af1c2d82a5ab77db0311 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/vocab_task.json @@ -0,0 +1,210 @@ +{ + "task_id": "sample_000007", + "sample_id": "sample_000007", + "sample_index": 7, + "target_total": 14, + "target_people": 1, + "target_objects": 13, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 43568, + "image_id": "CrowdHuman:data/data_23/284193,226b7000e784dddb.jpg:person:7", + "name": "bridesmaid", + "description": "A woman with dark hair wearing a dark knee-length dress, walking along the path. Source dataset: CrowdHuman. Scene context: A bride in a white gown and her bridesmaids in dark dresses are walking along a paved path next to a building with stairs, surrounded by trees and a white fence in a sunlit outdoor setting." + }, + { + "candidate_index": 1, + "source_offset": 109525, + "image_id": "CrowdHuman:data/data_47/273275,11e9620009f0fe3f7.jpg:person:5", + "name": "hiker in light shirt", + "description": "A hiker partially visible wearing a light-colored long-sleeved shirt. Source dataset: CrowdHuman. Scene context: A group of hikers navigates a rocky, forested area with yellow wildflowers." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 226933, + "image_id": "BDD100K:c5e32cf6-89b8fb88:object:10", + "name": "dashboard", + "description": "The dashboard of the camera vehicle, visible at the bottom of the frame. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching a tunnel or underpass, with buildings on the left and a retaining wall on the right." + }, + { + "candidate_index": 1, + "source_offset": 236099, + "image_id": "BDD100K:c807d32d-e5383e74:object:6", + "name": "airport terminal building", + "description": "Large modern building on the right with a curved, overhanging roof and tall dark glass windows. Source dataset: BDD100K. Scene context: View from a vehicle driving slowly past an airport terminal with a line of metal bollards separating the roadway from the sidewalk." + }, + { + "candidate_index": 2, + "source_offset": 181248, + "image_id": "BDD100K:bc7caf3c-da14eed9:object:11", + "name": "overhead wires", + "description": "Power and communication lines stretching across the sky above the street. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential street lined with parked cars and houses." + }, + { + "candidate_index": 3, + "source_offset": 211638, + "image_id": "BDD100K:c2f92f94-43481d10:object:5", + "name": "building lights", + "description": "Illuminated signs and windows from buildings lining the right side of the street. Source dataset: BDD100K. Scene context: A dark, empty street at night viewed from inside a car, illuminated sparsely by streetlights and vehicle headlights." + }, + { + "candidate_index": 4, + "source_offset": 121991, + "image_id": "CrowdHuman:data/data_7/282555,1205160005cee3251.jpg:object:1", + "name": "stone gallery", + "description": "A long stone building extending along the left side, characterized by a dark, intricately carved exterior and a continuous wall with multiple large windows containing densely packed, turned stone pillars. Source dataset: CrowdHuman. Scene context: Tourists explore a large ancient stone temple complex featuring a prominent tiered tower and long columned galleries under a partly cloudy sky." + }, + { + "candidate_index": 5, + "source_offset": 131395, + "image_id": "CrowdHuman:data/data_9/273275,7ed7b0000bbb63c7.jpg:object:2", + "name": "sunglasses", + "description": "Dark sunglasses worn by the woman in the gray shirt. Source dataset: CrowdHuman. Scene context: A group of nine people, including adults and children, pose for a photo outdoors in front of a large, old tree." + }, + { + "candidate_index": 6, + "source_offset": 19389, + "image_id": "CrowdHuman:data/data_19/273271,2d803000c90cec0a.jpg:object:3", + "name": "motorized wheelchair", + "description": "green motorized wheelchair with a black metal cage-like guard attached to the front. Source dataset: CrowdHuman. Scene context: Several individuals in specialized motorized wheelchairs are playing power soccer on an outdoor court." + }, + { + "candidate_index": 7, + "source_offset": 116248, + "image_id": "CrowdHuman:data/data_68/273278,d37b500038386e31.jpg:object:0", + "name": "bunch of balloons", + "description": "A bunch of heart-shaped balloons, some pink and some red, tied to a wooden utility pole. Source dataset: CrowdHuman. Scene context: A group of people standing in a circle, holding hands on a street in a residential area." + }, + { + "candidate_index": 8, + "source_offset": 51323, + "image_id": "CrowdHuman:data/data_33/273275,a22dc000cd21038b.jpg:object:29", + "name": "wooden post", + "description": "A wooden post supporting the string lights or part of a fence. Source dataset: CrowdHuman. Scene context: A large group of people gathered outdoors on a paved area, waving and posing for a picture, with a small pond, bridge, and picnic tables in the background." + }, + { + "candidate_index": 9, + "source_offset": 90451, + "image_id": "CrowdHuman:data/data_55/273275,1dfaa0001f3d5a76.jpg:object:11", + "name": "flag", + "description": "the French national flag flying atop a building Source dataset: CrowdHuman. Scene context: A view of Paris with an equestrian statue in the foreground and the Eiffel Tower in the distance under a cloudy sky with sun rays." + }, + { + "candidate_index": 10, + "source_offset": 42073, + "image_id": "CrowdHuman:data/data_3/282555,5e3b40007052b80f.jpg:object:33", + "name": "black garment", + "description": "A black garment hanging on a rack. Source dataset: CrowdHuman. Scene context: A group of people standing in a line inside a brightly lit retail clothing store, with a mother carrying a baby in the foreground." + }, + { + "candidate_index": 11, + "source_offset": 86280, + "image_id": "CrowdHuman:data/data_52/283081,13fff00018862889.jpg:object:3", + "name": "white garbage bag", + "description": "large white plastic bag on the bottom right corner Source dataset: CrowdHuman. Scene context: A group of fifteen people posing for a photo in front of a colorful graffiti wall." + }, + { + "candidate_index": 12, + "source_offset": 169799, + "image_id": "BDD100K:b9df54a4-91295fbc:object:10", + "name": "building on left", + "description": "Multi-story brick buildings with numerous windows and fire escapes on the left side. Source dataset: BDD100K. Scene context: A narrow city street lined with parked cars on both sides, with residential and commercial buildings featuring fire escapes and awnings, illuminated by sunlight filtering through mature trees." + }, + { + "candidate_index": 13, + "source_offset": 217845, + "image_id": "BDD100K:c41585dc-6fe06ca1:object:5", + "name": "street light pole", + "description": "A tall, curved metal street light pole on the right side of the road, supporting the overhead sign. Source dataset: BDD100K. Scene context: A view from a car driving on a multi-lane road bordered by trees, with other vehicles and road signs visible." + }, + { + "candidate_index": 14, + "source_offset": 74994, + "image_id": "CrowdHuman:data/data_47/273278,13690500030bdbb93.jpg:object:1", + "name": "illuminated store sign", + "description": "Vertical, brightly lit signs in various colors like red, blue, purple, and white, with Korean and English text attached to building facades. Source dataset: CrowdHuman. Scene context: A crowded, brightly lit shopping street at night filled with pedestrians and lined with numerous colorful illuminated store signs and street stalls." + }, + { + "candidate_index": 15, + "source_offset": 7349, + "image_id": "CrowdHuman:data/data_12/273275,76ecb000e38a985e.jpg:object:6", + "name": "painting 4", + "description": "A rectangular framed painting near the doorway on the red wall. Source dataset: CrowdHuman. Scene context: A large group of tourists visits an ornate museum gallery filled with classic paintings and large chandeliers." + }, + { + "candidate_index": 16, + "source_offset": 233025, + "image_id": "BDD100K:c754ce77-a105a975:object:3", + "name": "white sedan", + "description": "A white passenger car partially visible in the right lane next to the gold SUV. Source dataset: BDD100K. Scene context: View from inside a car driving in city traffic on a sunny day with multiple vehicles and urban infrastructure visible." + }, + { + "candidate_index": 17, + "source_offset": 23374, + "image_id": "CrowdHuman:data/data_20/273275,8c907000fceb0e02.jpg:object:0", + "name": "stone steps", + "description": "wide, light-colored stone steps where the group is seated Source dataset: CrowdHuman. Scene context: A group of people sitting on outdoor steps listening to a man with a long white beard dressed in a dark robe." + }, + { + "candidate_index": 18, + "source_offset": 178939, + "image_id": "BDD100K:bbfcd002-f8531a65:object:1", + "name": "dark car", + "description": "A dark-colored sedan visible on the left side of the street, partially obscured by rain. Source dataset: BDD100K. Scene context: View from inside a vehicle through a heavily rain-covered windshield, looking at city traffic and buildings." + }, + { + "candidate_index": 19, + "source_offset": 153727, + "image_id": "BDD100K:b61f19ba-2f34ba9f:object:17", + "name": "skyline", + "description": "Silhouettes of buildings visible in the distance against the twilight sky. Source dataset: BDD100K. Scene context: A street scene at dusk with cars stopped at a red traffic light, snow on the ground, and various commercial buildings alongside the road." + }, + { + "candidate_index": 20, + "source_offset": 174624, + "image_id": "BDD100K:baee6fb9-f28ac93d:object:12", + "name": "dark car 2", + "description": "A dark-colored car parked on the right side of the street, ahead of the other dark car. Source dataset: BDD100K. Scene context: A dashcam view driving down a city street lined with parked cars on both sides and multi-story brick apartment buildings under a partly cloudy sky." + }, + { + "candidate_index": 21, + "source_offset": 112956, + "image_id": "CrowdHuman:data/data_66/283647,8133000946f9f6f.jpg:object:5", + "name": "camera", + "description": "A camera held by the photographer in the dark coat. Source dataset: CrowdHuman. Scene context: A group of men, including military personnel and civilians in long coats, walking across an airfield with airplanes in the background." + }, + { + "candidate_index": 22, + "source_offset": 208304, + "image_id": "BDD100K:c2186a76-5444a563:object:5", + "name": "brick building", + "description": "A tall, multi-story red brick building on the left side of the street, featuring arched windows and a storefront. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a city street lined with parked cars and multi-story brick buildings." + }, + { + "candidate_index": 23, + "source_offset": 217110, + "image_id": "BDD100K:c411687d-73471431:object:14", + "name": "pole", + "description": "A thin, straight metal pole standing upright on the sidewalk near the park area on the left. Source dataset: BDD100K. Scene context: A dashcam view looking down a slightly sloped residential city street with cars parked on both sides, trees bordering a park area to the left, and a tall building to the right, under a cloudy, overcast sky." + }, + { + "candidate_index": 24, + "source_offset": 172239, + "image_id": "BDD100K:ba8b1e05-8ec0219a:object:5", + "name": "crosswalk markings", + "description": "White painted lines on the road surface indicating a pedestrian crosswalk. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with traffic lights and a crosswalk." + }, + { + "candidate_index": 25, + "source_offset": 91774, + "image_id": "CrowdHuman:data/data_55/283991,169b7000c139b5de.jpg:object:6", + "name": "balcony", + "description": "A dark, wrought-iron balcony on a building. Source dataset: CrowdHuman. Scene context: People walk down a narrow, sunlit street lined with tall buildings." + } + ], + "rng_seed": 1782661096, + "created_at": 1782223460.3448656 +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/bbox_overlay.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..5ddfe080a9a5f1b740bd9f840ed92edb116ddb64 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69f818c554a6240258b40230ff0328e85cf4a9bea44987bf3e216746d7059890 +size 1431526 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/compose_prompt.txt b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..becb2a7cd6ae9d2150eeb99fe22950cb52769a3c --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/compose_prompt.txt @@ -0,0 +1,119 @@ +Render the following JSON scene specification as a photorealistic 1280x720 image using a true 16:9 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "Busy city intersection seen from the driver's perspective inside a vehicle stopped at a crosswalk.", + "activity": "Pedestrians are crossing the street and waiting at the corners, while a person holding a sign walks along the sidewalk.", + "composition": "Dashcam perspective, wide 16:9 aspect ratio. A vehicle dashboard reflection is visible at the very bottom edge. A street lamp stands tall on the right sidewalk. Pedestrians are spatially distributed across the crosswalk midground and grouped on the sidewalks, conveying realistic urban depth and occlusion.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "passenger", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", + "source_name": "passenger", + "description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera.", + "role_in_scene": "Walking away on the sidewalk ahead." + }, + { + "name": "shopper_waiting", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", + "source_name": "shopper", + "description": "A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag.", + "role_in_scene": "Standing on the corner curb, waiting to cross the street." + }, + { + "name": "shopper_standing", + "source_index": 3, + "source_image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", + "source_name": "shopper", + "description": "A person standing, wearing a blue jacket and blue jeans.", + "role_in_scene": "Standing near the crosswalk edge amidst the crowd." + }, + { + "name": "protester", + "source_index": 5, + "source_image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", + "source_name": "protester holding sign in back", + "description": "A person walking in the background on the right, holding up a large white sign.", + "role_in_scene": "Walking further down the right sidewalk carrying a sign." + }, + { + "name": "pedestrian_crossing", + "source_index": 9, + "source_image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", + "source_name": "pedestrian", + "description": "A person in a white top and dark pants, walking towards the right.", + "role_in_scene": "Actively walking across the crosswalk in front of the vehicle." + }, + { + "name": "pedestrian_walking_away", + "source_index": 12, + "source_image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", + "source_name": "pedestrian", + "description": "A person wearing a white top and dark pants, walking away from the camera.", + "role_in_scene": "Walking away on the left side of the street." + }, + { + "name": "young_man", + "source_index": 13, + "source_image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", + "source_name": "young man", + "description": "Standing back row, wearing a dark blue hoodie.", + "role_in_scene": "Waiting in the crowd at the corner intersection." + }, + { + "name": "businessman", + "source_index": 15, + "source_image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", + "source_name": "adult in dark suit", + "description": "Standing in back rows, wearing dark suit and tie.", + "role_in_scene": "Standing on the sidewalk behind other pedestrians." + } + ], + "objects": [ + { + "name": "street_lamp", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", + "source_name": "street lamp", + "description": "A tall street lamp pole partially visible on the right side of the street.", + "role_in_scene": "Towering above the right sidewalk corner, serving as city infrastructure." + }, + { + "name": "dashboard_reflection", + "source_index": 2, + "source_image_id": "BDD100K:c24e7c72-f53291ea:object:8", + "source_name": "vehicle dashboard reflection", + "description": "A reflection on the windshield showing the interior dashboard and a document or object with large blue text.", + "role_in_scene": "Visible along the bottom edge of the frame, establishing the camera's perspective from inside a car." + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_businessman.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_businessman.png new file mode 100644 index 0000000000000000000000000000000000000000..d7978444a4b6f18902281ee9a1fe330cb66a82c3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_businessman.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_dashboard_reflection.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_dashboard_reflection.png new file mode 100644 index 0000000000000000000000000000000000000000..39196eca8b8c6c4b08ae1c201e96080faae70abc --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_dashboard_reflection.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d40097d4e867ca48849becb1f4441e9a296bb0ba61b53d6a3355b0ed215adfa8 +size 474993 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_passenger.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_passenger.png new file mode 100644 index 0000000000000000000000000000000000000000..0ae9650ade25da7e01b4f1bd4daa60dd179abd30 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_passenger.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54d65f618eab85ce758be5e5536b511e4239ca796756cd5ec3b92ce4bfa52e52 +size 108947 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_pedestrian_crossing.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_pedestrian_crossing.png new file mode 100644 index 0000000000000000000000000000000000000000..09d07fb0d4085adabac03297f64da32fa97662e9 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_pedestrian_crossing.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_pedestrian_walking_away.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_pedestrian_walking_away.png new file mode 100644 index 0000000000000000000000000000000000000000..03b8d203242202199e7d37ddd1dd4e2eb1fd4208 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_pedestrian_walking_away.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_protester.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_protester.png new file mode 100644 index 0000000000000000000000000000000000000000..f423e2218462972742a3428a56cc65db98e70055 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_protester.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_shopper_standing.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_shopper_standing.png new file mode 100644 index 0000000000000000000000000000000000000000..4eec1648da786d7ab6b50e386817b582692dc85a Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_shopper_standing.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_shopper_waiting.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_shopper_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..b00f65133de262115498e06e779e0cb3881ffa61 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_shopper_waiting.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_street_lamp.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_street_lamp.png new file mode 100644 index 0000000000000000000000000000000000000000..12977a6adfd6fca8af21ddbdcdb40134afe55d78 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_street_lamp.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_young_man.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_young_man.png new file mode 100644 index 0000000000000000000000000000000000000000..94201ae220edbbdb971dedd2d679dda8248d4c5b Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_young_man.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_businessman.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_businessman.png new file mode 100644 index 0000000000000000000000000000000000000000..5d16a843878dec15e99067cb52859194025b2146 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_businessman.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_dashboard_reflection.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_dashboard_reflection.png new file mode 100644 index 0000000000000000000000000000000000000000..eb519144a3470c975944ba0725852c51e2d9b7bd --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_dashboard_reflection.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:222640ecd862bc4a9e8f37ad1af8b5aba9a471e22c95887eee4140fed33066b5 +size 172634 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_passenger.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_passenger.png new file mode 100644 index 0000000000000000000000000000000000000000..e2b02379021e65165e5a8bf5df68a74d051ad106 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_passenger.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7a23458ca387325e8acae47a2102b5b3a4a6aada4080b1d04c19e75f6814883 +size 192826 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_pedestrian_crossing.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_pedestrian_crossing.png new file mode 100644 index 0000000000000000000000000000000000000000..45b8eeb35fe5bc25aebea88c56b172f61c1ce877 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_pedestrian_crossing.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e593fe1e506c2aa6bf4d13673378775dde644f927fc012a7f20cce4cae905097 +size 122281 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_pedestrian_walking_away.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_pedestrian_walking_away.png new file mode 100644 index 0000000000000000000000000000000000000000..918a906d4cf6ba4c7d1b82941916a936561b576f Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_pedestrian_walking_away.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_protester.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_protester.png new file mode 100644 index 0000000000000000000000000000000000000000..fb01edbea0348cacaa877bb3b1a125537a2d1762 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_protester.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_shopper_standing.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_shopper_standing.png new file mode 100644 index 0000000000000000000000000000000000000000..1349c892ee45ac7d22ad15e5ff85c4c5879219e9 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_shopper_standing.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_shopper_waiting.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_shopper_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..2e69c6ff42b8b95a9baca994bc43b833d9141728 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_shopper_waiting.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ca796e97e6a3c90693e7fde5ca179650d75e51032a7ab616607ab1456b4d052 +size 182771 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_street_lamp.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_street_lamp.png new file mode 100644 index 0000000000000000000000000000000000000000..c2ad3a218d6784d6a52e5f09e5c4357508297b57 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_street_lamp.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_young_man.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_young_man.png new file mode 100644 index 0000000000000000000000000000000000000000..a5d34dc5d9e2b677a2108c85d20d44d37d1484c0 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_young_man.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/detections.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..1606367ecc100927bd4c42f5b424e6cd9f0696ea --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/detections.json @@ -0,0 +1,192 @@ +[ + { + "name": "passenger", + "present": true, + "bbox": [ + 0.8938, + 0.1807, + 0.9951, + 0.7157 + ], + "confidence": "high", + "notes": "The man matching the description is clearly visible.", + "coarse_bbox": [ + 0.893, + 0.182, + 0.995, + 0.713 + ], + "refine_crop": "crops/detect_refine_passenger.png" + }, + { + "name": "shopper_waiting", + "present": true, + "bbox": [ + 0.632, + 0.1691, + 0.7153, + 0.6522 + ], + "confidence": 0.99, + "notes": "Woman in a black jacket and dark trousers, carrying a brown handbag, accurately matches the description.", + "coarse_bbox": [ + 0.631, + 0.168, + 0.715, + 0.655 + ], + "refine_crop": "crops/detect_refine_shopper_waiting.png" + }, + { + "name": "shopper_standing", + "present": true, + "bbox": [ + 0.5209, + 0.1793, + 0.5735, + 0.5325 + ], + "confidence": 0.98, + "notes": "A person standing, wearing a blue jacket and blue jeans, near the crosswalk.", + "coarse_bbox": [ + 0.52, + 0.177, + 0.573, + 0.534 + ], + "refine_crop": "crops/detect_refine_shopper_standing.png" + }, + { + "name": "protester", + "present": true, + "bbox": [ + 0.8193, + 0.1216, + 0.8875, + 0.4511 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the visible protester holding a sign.", + "coarse_bbox": [ + 0.818, + 0.122, + 0.887, + 0.435 + ], + "refine_crop": "crops/detect_refine_protester.png" + }, + { + "name": "pedestrian_crossing", + "present": true, + "bbox": [ + 0.2322, + 0.1993, + 0.3165, + 0.4965 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the pedestrian with white top and dark pants.", + "coarse_bbox": [ + 0.229, + 0.208, + 0.315, + 0.497 + ], + "refine_crop": "crops/detect_refine_pedestrian_crossing.png" + }, + { + "name": "pedestrian_walking_away", + "present": true, + "bbox": [ + 0.013, + 0.2139, + 0.0908, + 0.494 + ], + "confidence": 100, + "notes": "A person wearing a white top and dark pants, walking away from the camera.", + "coarse_bbox": [ + 0.014, + 0.21, + 0.091, + 0.495 + ], + "refine_crop": "crops/detect_refine_pedestrian_walking_away.png" + }, + { + "name": "young_man", + "present": true, + "bbox": [ + 0.5568, + 0.1246, + 0.6032, + 0.5033 + ], + "confidence": 0.95, + "notes": "young man in dark blue hoodie found in crop", + "coarse_bbox": [ + 0.557, + 0.126, + 0.603, + 0.495 + ], + "refine_crop": "crops/detect_refine_young_man.png" + }, + { + "name": "businessman", + "present": true, + "bbox": [ + 0.5976, + 0.1322, + 0.6413, + 0.4385 + ], + "confidence": 0.95, + "notes": "Businessman in a dark suit standing.", + "coarse_bbox": [ + 0.599, + 0.128, + 0.644, + 0.439 + ], + "refine_crop": "crops/detect_refine_businessman.png" + }, + { + "name": "street_lamp", + "present": true, + "bbox": [ + 0.014, + 0.03, + 0.109, + 0.254 + ], + "confidence": 0.8, + "notes": "refine failed; using coarse bbox", + "coarse_bbox": [ + 0.014, + 0.03, + 0.109, + 0.254 + ], + "refine_crop": "crops/detect_refine_street_lamp.png" + }, + { + "name": "dashboard_reflection", + "present": true, + "bbox": [ + 0.143, + 0.6854, + 0.461, + 0.8934 + ], + "confidence": 0.95, + "notes": "Tight bounding box around the reflection of the dashboard and document on the windshield.", + "coarse_bbox": [ + 0.0, + 0.685, + 1.0, + 0.998 + ], + "refine_crop": "crops/detect_refine_dashboard_reflection.png" + } +] diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/main_image.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..cba39db780689089ebb5bc9b7389290476d3b2a0 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c09e207d874dfd84d325c12067e0ccc9e007ea31558eea1e65728fecca5a3a5 +size 1533645 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/plan.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..77dbbf3423f50bac9db26bbc606326ab13664a06 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/plan.json @@ -0,0 +1,240 @@ +{ + "sample_id": "sample_000008", + "target_total": 10, + "target_people": 8, + "target_objects": 2, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "Busy city intersection seen from the driver's perspective inside a vehicle stopped at a crosswalk.", + "activity": "Pedestrians are crossing the street and waiting at the corners, while a person holding a sign walks along the sidewalk.", + "composition": "Dashcam perspective, wide 16:9 aspect ratio. A vehicle dashboard reflection is visible at the very bottom edge. A street lamp stands tall on the right sidewalk. Pedestrians are spatially distributed across the crosswalk midground and grouped on the sidewalks, conveying realistic urban depth and occlusion.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "passenger", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", + "source_name": "passenger", + "description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera.", + "role_in_scene": "Walking away on the sidewalk ahead." + }, + { + "name": "shopper_waiting", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", + "source_name": "shopper", + "description": "A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag.", + "role_in_scene": "Standing on the corner curb, waiting to cross the street." + }, + { + "name": "shopper_standing", + "source_index": 3, + "source_image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", + "source_name": "shopper", + "description": "A person standing, wearing a blue jacket and blue jeans.", + "role_in_scene": "Standing near the crosswalk edge amidst the crowd." + }, + { + "name": "protester", + "source_index": 5, + "source_image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", + "source_name": "protester holding sign in back", + "description": "A person walking in the background on the right, holding up a large white sign.", + "role_in_scene": "Walking further down the right sidewalk carrying a sign." + }, + { + "name": "pedestrian_crossing", + "source_index": 9, + "source_image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", + "source_name": "pedestrian", + "description": "A person in a white top and dark pants, walking towards the right.", + "role_in_scene": "Actively walking across the crosswalk in front of the vehicle." + }, + { + "name": "pedestrian_walking_away", + "source_index": 12, + "source_image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", + "source_name": "pedestrian", + "description": "A person wearing a white top and dark pants, walking away from the camera.", + "role_in_scene": "Walking away on the left side of the street." + }, + { + "name": "young_man", + "source_index": 13, + "source_image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", + "source_name": "young man", + "description": "Standing back row, wearing a dark blue hoodie.", + "role_in_scene": "Waiting in the crowd at the corner intersection." + }, + { + "name": "businessman", + "source_index": 15, + "source_image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", + "source_name": "adult in dark suit", + "description": "Standing in back rows, wearing dark suit and tie.", + "role_in_scene": "Standing on the sidewalk behind other pedestrians." + } + ], + "objects": [ + { + "name": "street_lamp", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", + "source_name": "street lamp", + "description": "A tall street lamp pole partially visible on the right side of the street.", + "role_in_scene": "Towering above the right sidewalk corner, serving as city infrastructure." + }, + { + "name": "dashboard_reflection", + "source_index": 2, + "source_image_id": "BDD100K:c24e7c72-f53291ea:object:8", + "source_name": "vehicle dashboard reflection", + "description": "A reflection on the windshield showing the interior dashboard and a document or object with large blue text.", + "role_in_scene": "Visible along the bottom edge of the frame, establishing the camera's perspective from inside a car." + } + ] + }, + "expected_subjects": [ + { + "name": "passenger", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", + "source_name": "passenger", + "source_description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people waiting at a train platform next to a stationary train.", + "sub_caption": "passenger: A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera.. Scene role: Walking away on the sidewalk ahead.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "shopper_waiting", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", + "source_name": "shopper", + "source_description": "A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag. Source dataset: CrowdHuman. Scene context: A bustling cobblestone street in a town lined with brick buildings, decorated with colorful bunting flags, and populated with pedestrians walking, shopping, and dining outdoors.", + "sub_caption": "shopper: A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag.. Scene role: Standing on the corner curb, waiting to cross the street.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "shopper_standing", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing, wearing a blue jacket and blue jeans. Source dataset: CrowdHuman. Scene context: A bustling street market with many pedestrians browsing stalls and goods displayed on the ground.", + "sub_caption": "shopper: A person standing, wearing a blue jacket and blue jeans.. Scene role: Standing near the crosswalk edge amidst the crowd.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "protester", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", + "source_name": "protester holding sign in back", + "source_description": "A person walking in the background on the right, holding up a large white sign. Source dataset: CrowdHuman. Scene context: A large crowd of people, possibly students, are walking down a city street, some holding signs, suggesting a protest or march.", + "sub_caption": "protester holding sign in back: A person walking in the background on the right, holding up a large white sign.. Scene role: Walking further down the right sidewalk carrying a sign.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_crossing", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", + "source_name": "pedestrian", + "source_description": "A person in a white top and dark pants, walking towards the right. Source dataset: CrowdHuman. Scene context: A bustling city street scene with a mix of old brick buildings and modern glass structures, connected by an elevated glass walkway, with a large crowd of pedestrians moving along the wide sidewalk.", + "sub_caption": "pedestrian: A person in a white top and dark pants, walking towards the right.. Scene role: Actively walking across the crosswalk in front of the vehicle.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_walking_away", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", + "source_name": "pedestrian", + "source_description": "A person wearing a white top and dark pants, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people walking on a city street in front of a grand, multi-story commercial building with signs for Virgin Megastore and Monoprix.", + "sub_caption": "pedestrian: A person wearing a white top and dark pants, walking away from the camera.. Scene role: Walking away on the left side of the street.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "young_man", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", + "source_name": "young man", + "source_description": "Standing back row, wearing a dark blue hoodie. Source dataset: CrowdHuman. Scene context: A group of young people, mostly wearing college apparel, are posing for a photograph on bleachers in front of a red building.", + "sub_caption": "young man: Standing back row, wearing a dark blue hoodie.. Scene role: Waiting in the crowd at the corner intersection.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "businessman", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", + "source_name": "adult in dark suit", + "source_description": "Standing in back rows, wearing dark suit and tie. Source dataset: CrowdHuman. Scene context: A large group of people gathered for a celebration, many wearing bright traditional Chinese costumes, holding props like fans and a lion dance head, posing for a group photo behind a banner.", + "sub_caption": "adult in dark suit: Standing in back rows, wearing dark suit and tie.. Scene role: Standing on the sidewalk behind other pedestrians.", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "street_lamp", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", + "source_name": "street lamp", + "source_description": "A tall street lamp pole partially visible on the right side of the street. Source dataset: CrowdHuman. Scene context: A crowded city street at night, illuminated by numerous bright, colorful neon signs and shop displays.", + "sub_caption": "street lamp: A tall street lamp pole partially visible on the right side of the street.. Scene role: Towering above the right sidewalk corner, serving as city infrastructure.", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "dashboard_reflection", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c24e7c72-f53291ea:object:8", + "source_name": "vehicle dashboard reflection", + "source_description": "A reflection on the windshield showing the interior dashboard and a document or object with large blue text. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a city street on an overcast day, passing a large brick building on the right.", + "sub_caption": "vehicle dashboard reflection: A reflection on the windshield showing the interior dashboard and a document or object with large blue text.. Scene role: Visible along the bottom edge of the frame, establishing the camera's perspective from inside a car.", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000008/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references.json new file mode 100644 index 0000000000000000000000000000000000000000..ad954948d33a8ac0a540bd8b837f499d4645fad0 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references.json @@ -0,0 +1,325 @@ +{ + "references": [ + { + "name": "passenger", + "ref_image": "references/ref_passenger.png", + "raw_ref_image": "references/raw_ref_passenger_attempt_01.png", + "diversify_input": "crops/diversify_input_passenger.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_passenger_attempt_01.png", + "output": "references/ref_passenger.png", + "mask": "references/sam_mask_passenger.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 31.0, + 682.0, + 985.0 + ], + "mask_score": 3.454991, + "mask_area_ratio": 0.146239, + "elapsed_seconds": 8.0907 + }, + "reference_verify": "references/reference_verify_passenger.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "shopper_waiting", + "ref_image": "references/ref_shopper_waiting.png", + "raw_ref_image": "references/raw_ref_shopper_waiting_attempt_01.png", + "diversify_input": "crops/diversify_input_shopper_waiting.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_shopper_waiting_attempt_01.png", + "output": "references/ref_shopper_waiting.png", + "mask": "references/sam_mask_shopper_waiting.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 365.0, + 69.0, + 720.0, + 1006.0 + ], + "mask_score": 3.169183, + "mask_area_ratio": 0.111197, + "elapsed_seconds": 8.0622 + }, + "reference_verify": "references/reference_verify_shopper_waiting.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "shopper_standing", + "ref_image": "references/ref_shopper_standing.png", + "raw_ref_image": "references/raw_ref_shopper_standing_attempt_01.png", + "diversify_input": "crops/diversify_input_shopper_standing.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_shopper_standing_attempt_01.png", + "output": "references/ref_shopper_standing.png", + "mask": "references/sam_mask_shopper_standing.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 326.0, + 43.0, + 698.0, + 998.0 + ], + "mask_score": 3.440171, + "mask_area_ratio": 0.161293, + "elapsed_seconds": 8.0811 + }, + "reference_verify": "references/reference_verify_shopper_standing.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "protester", + "ref_image": "references/ref_protester.png", + "raw_ref_image": "references/raw_ref_protester_attempt_01.png", + "diversify_input": "crops/diversify_input_protester.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_protester_attempt_01.png", + "output": "references/ref_protester.png", + "mask": "references/sam_mask_protester.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 337.0, + 19.0, + 694.0, + 1013.0 + ], + "mask_score": 3.465365, + "mask_area_ratio": 0.161731, + "elapsed_seconds": 8.3408 + }, + "reference_verify": "references/reference_verify_protester.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pedestrian_crossing", + "ref_image": "references/ref_pedestrian_crossing.png", + "raw_ref_image": "references/raw_ref_pedestrian_crossing_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_crossing.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_crossing_attempt_01.png", + "output": "references/ref_pedestrian_crossing.png", + "mask": "references/sam_mask_pedestrian_crossing.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 355.0, + 20.0, + 674.0, + 1012.0 + ], + "mask_score": 3.482863, + "mask_area_ratio": 0.15384, + "elapsed_seconds": 8.0791 + }, + "reference_verify": "references/reference_verify_pedestrian_crossing.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pedestrian_walking_away", + "ref_image": "references/ref_pedestrian_walking_away.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_walking_away.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_walking_away_attempt_01.png", + "output": "references/ref_pedestrian_walking_away.png", + "mask": "references/sam_mask_pedestrian_walking_away.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 350.0, + 50.0, + 671.0, + 987.0 + ], + "mask_score": 3.476273, + "mask_area_ratio": 0.142721, + "elapsed_seconds": 8.2428 + }, + "reference_verify": "references/reference_verify_pedestrian_walking_away.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "young_man", + "ref_image": "references/ref_young_man.png", + "raw_ref_image": "references/raw_ref_young_man_attempt_01.png", + "diversify_input": "crops/diversify_input_young_man.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_young_man_attempt_01.png", + "output": "references/ref_young_man.png", + "mask": "references/sam_mask_young_man.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 352.0, + 65.0, + 671.0, + 928.0 + ], + "mask_score": 3.483394, + "mask_area_ratio": 0.132506, + "elapsed_seconds": 8.2271 + }, + "reference_verify": "references/reference_verify_young_man.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "businessman", + "ref_image": "references/ref_businessman.png", + "raw_ref_image": "references/raw_ref_businessman_attempt_01.png", + "diversify_input": "crops/diversify_input_businessman.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_businessman_attempt_01.png", + "output": "references/ref_businessman.png", + "mask": "references/sam_mask_businessman.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 336.0, + 19.0, + 690.0, + 1013.0 + ], + "mask_score": 2.970801, + "mask_area_ratio": 0.135565, + "elapsed_seconds": 8.2448 + }, + "reference_verify": "references/reference_verify_businessman.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "street_lamp", + "ref_image": "references/ref_street_lamp.png", + "raw_ref_image": "references/raw_ref_street_lamp_attempt_01.png", + "diversify_input": "crops/diversify_input_street_lamp.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_street_lamp_attempt_01.png", + "output": "references/ref_street_lamp.png", + "mask": "references/sam_mask_street_lamp.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 257.0, + 5.0, + 840.0, + 1019.0 + ], + "mask_score": 3.134794, + "mask_area_ratio": 0.049316, + "elapsed_seconds": 8.2643 + }, + "reference_verify": "references/reference_verify_street_lamp.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "dashboard_reflection", + "ref_image": "references/ref_dashboard_reflection.png", + "raw_ref_image": "references/raw_ref_dashboard_reflection_attempt_01.png", + "diversify_input": "crops/diversify_input_dashboard_reflection.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_dashboard_reflection_attempt_01.png", + "output": "references/ref_dashboard_reflection.png", + "mask": "references/sam_mask_dashboard_reflection.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 336.0, + 1023.0, + 1023.0 + ], + "mask_score": 1.211741, + "mask_area_ratio": 0.687541, + "elapsed_seconds": 9.9949 + }, + "reference_verify": "references/reference_verify_dashboard_reflection.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_businessman.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_businessman.png new file mode 100644 index 0000000000000000000000000000000000000000..1bc4b1d6e2ed6622aaf431eed290a71928c163a7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_businessman.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7963f78f161f481df5ddb1311811f701c6b8685a39d4e45fa6cc8a983738f2f8 +size 288583 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_dashboard_reflection.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_dashboard_reflection.png new file mode 100644 index 0000000000000000000000000000000000000000..c7aff09bb014319b9fd61c7ec01761c61d20e7e4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_dashboard_reflection.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f0dd25e527fb34cd611e0931496d8fcd6fa4f0bcc6416140caac4975c75a7b1 +size 975719 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_passenger.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_passenger.png new file mode 100644 index 0000000000000000000000000000000000000000..08462701111b9afe3d3df55adb94e73c483611db --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_passenger.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fceaf243e28a8434c0c7c6585fa437410ab095b00077d83c26c848f426d6321 +size 273946 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_pedestrian_crossing.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_pedestrian_crossing.png new file mode 100644 index 0000000000000000000000000000000000000000..c2dfb0396861277f5849da8d2b7a61719929c600 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_pedestrian_crossing.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e7e1a6efca6e2d2c24cf679477d3f6397ef1ced0c25a736ac73869c1e74d11 +size 258890 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_pedestrian_walking_away.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_pedestrian_walking_away.png new file mode 100644 index 0000000000000000000000000000000000000000..b1442fb351cc67141ab648ef5da59b6c089e9cd3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_pedestrian_walking_away.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e779d45ed2f181452ce2f417156c1233fe93a5a24a22a018f4b3cf07a06a1eb4 +size 250961 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_protester.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_protester.png new file mode 100644 index 0000000000000000000000000000000000000000..903cf50498902915f1e250be7ca0be6c6cd615fb --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_protester.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b50d5b60887117ae3fb570da2fe2500c6b28a2e48f2a423f1b4b6e08059c72b7 +size 309153 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_shopper_standing.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_shopper_standing.png new file mode 100644 index 0000000000000000000000000000000000000000..0e0eb2927aeab2322038a4f4d1a4766324e9ab89 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_shopper_standing.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a252854cc519dd386338e30a6cf8bb57c6440f1bd96d364ec380faf293c3fc0 +size 324953 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_shopper_waiting.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_shopper_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..debe4b84fa455b0c946aacc3a068cc02fcde3bf7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_shopper_waiting.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83183ff9310ce357eba6965265657a097f93ee175a6997dc0f96faa9cd9cbd9e +size 224413 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_street_lamp.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_street_lamp.png new file mode 100644 index 0000000000000000000000000000000000000000..2e3dd71ab240abab914b5b53e40a85a86ac2fe3f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_street_lamp.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af061c37221b7a1f033dead754abcaa645b2779ed0a3bcc37ddb601591d1289e +size 103033 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_young_man.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_young_man.png new file mode 100644 index 0000000000000000000000000000000000000000..d6d028f1dc5df937fb852018d14b61c32f434094 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_young_man.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93450d11f3a9511ce9c1c368811b0b9dc2c9990142437e9fb3d9eef257130e6f +size 242505 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_businessman.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_businessman.json new file mode 100644 index 0000000000000000000000000000000000000000..32301f680f8ccd4c51ce53e981b424759492bc1f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_businessman.json @@ -0,0 +1,46 @@ +{ + "name": "businessman", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_businessman_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_businessman_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_businessman_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_businessman_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_businessman_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_businessman_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 336.0, + 19.0, + 690.0, + 1013.0 + ], + "mask_score": 2.970801, + "mask_area_ratio": 0.135565, + "elapsed_seconds": 8.2448 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image meets all hard requirements. It is a full-body reference image of a businessman with no parts cropped out, enough white margin, and a single main subject." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_dashboard_reflection.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_dashboard_reflection.json new file mode 100644 index 0000000000000000000000000000000000000000..e803277b429bb20f26e27c5d2d2d73c058e64673 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_dashboard_reflection.json @@ -0,0 +1,46 @@ +{ + "name": "dashboard_reflection", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_dashboard_reflection_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_dashboard_reflection_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_dashboard_reflection_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_dashboard_reflection_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_dashboard_reflection_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_dashboard_reflection_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 336.0, + 1023.0, + 1023.0 + ], + "mask_score": 1.211741, + "mask_area_ratio": 0.687541, + "elapsed_seconds": 9.9949 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image successfully isolates the dashboard and its reflection on the windshield against a white background. As a surface/environmental feature, the cropping at the edges is perfectly acceptable." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_passenger.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_passenger.json new file mode 100644 index 0000000000000000000000000000000000000000..d2c056ae9a933cd87ab815d80eaf54dd92a144d3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_passenger.json @@ -0,0 +1,46 @@ +{ + "name": "passenger", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_passenger_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_passenger_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_passenger_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_passenger_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_passenger_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_passenger_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 31.0, + 682.0, + 985.0 + ], + "mask_score": 3.454991, + "mask_area_ratio": 0.146239, + "elapsed_seconds": 8.0907 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Full body visible with white background. Meets all hard requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_pedestrian_crossing.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_pedestrian_crossing.json new file mode 100644 index 0000000000000000000000000000000000000000..b3690e072afc69aff8f64d2df2d0a86f2eff5cc9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_pedestrian_crossing.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_crossing", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_crossing_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_crossing_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_crossing_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_crossing_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_pedestrian_crossing_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_pedestrian_crossing_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 355.0, + 20.0, + 674.0, + 1012.0 + ], + "mask_score": 3.482863, + "mask_area_ratio": 0.15384, + "elapsed_seconds": 8.0791 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The reference image shows the complete full body of a single person on a white background with no edge cropping. The subject is wearing a white top and dark pants as required, and there is ample margin around the body." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_pedestrian_walking_away.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_pedestrian_walking_away.json new file mode 100644 index 0000000000000000000000000000000000000000..9c2874607c0b1be70b1df561831dd2eefc15a673 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_pedestrian_walking_away.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_walking_away", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_walking_away_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_walking_away_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_walking_away_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_pedestrian_walking_away_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_pedestrian_walking_away_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 350.0, + 50.0, + 671.0, + 987.0 + ], + "mask_score": 3.476273, + "mask_area_ratio": 0.142721, + "elapsed_seconds": 8.2428 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Full body isolated person with adequate margins." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_protester.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_protester.json new file mode 100644 index 0000000000000000000000000000000000000000..d29d4dfc86a270352171b12e330aaa82250f2038 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_protester.json @@ -0,0 +1,46 @@ +{ + "name": "protester", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_protester_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_protester_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_protester_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_protester_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_protester_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_protester_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 337.0, + 19.0, + 694.0, + 1013.0 + ], + "mask_score": 3.465365, + "mask_area_ratio": 0.161731, + "elapsed_seconds": 8.3408 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The person is fully visible from head to toe with sufficient margin on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_shopper_standing.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_shopper_standing.json new file mode 100644 index 0000000000000000000000000000000000000000..3623622351af71eb8c8a1a30a892914a116dca2e --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_shopper_standing.json @@ -0,0 +1,46 @@ +{ + "name": "shopper_standing", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_shopper_standing_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_shopper_standing_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_shopper_standing_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_shopper_standing_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_shopper_standing_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_shopper_standing_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 326.0, + 43.0, + 698.0, + 998.0 + ], + "mask_score": 3.440171, + "mask_area_ratio": 0.161293, + "elapsed_seconds": 8.0811 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a person standing, wearing a blue jacket and blue jeans, with a clean white background and no cropping." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_shopper_waiting.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_shopper_waiting.json new file mode 100644 index 0000000000000000000000000000000000000000..712a5692b819ace4b1b514ca09f608ce121bdd43 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_shopper_waiting.json @@ -0,0 +1,46 @@ +{ + "name": "shopper_waiting", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_shopper_waiting_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_shopper_waiting_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_shopper_waiting_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_shopper_waiting_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_shopper_waiting_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_shopper_waiting_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 365.0, + 69.0, + 720.0, + 1006.0 + ], + "mask_score": 3.169183, + "mask_area_ratio": 0.111197, + "elapsed_seconds": 8.0622 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The person is fully visible with no edge cropping. There are some visual artifacts on the handbag where the background removal or generation process left white splotches, but the person's body is complete." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_street_lamp.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_street_lamp.json new file mode 100644 index 0000000000000000000000000000000000000000..d9f7a2c5d0bc352198aef93da31dfb9de31cf9de --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_street_lamp.json @@ -0,0 +1,46 @@ +{ + "name": "street_lamp", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_lamp_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_lamp_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_lamp_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_street_lamp_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_street_lamp_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_street_lamp_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 257.0, + 5.0, + 840.0, + 1019.0 + ], + "mask_score": 3.134794, + "mask_area_ratio": 0.049316, + "elapsed_seconds": 8.2643 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete traffic light pole, which fits the general infrastructure description of 'street lamp pole'. It is fully visible on a white background without truncation." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_young_man.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_young_man.json new file mode 100644 index 0000000000000000000000000000000000000000..654e900a2a45bf47d01bfe2ce5d4c5847ffc8c06 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/reference_verify_young_man.json @@ -0,0 +1,46 @@ +{ + "name": "young_man", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_young_man_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_young_man_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_young_man_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_young_man_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_ref_young_man_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/candidate_sam_mask_young_man_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 352.0, + 65.0, + 671.0, + 928.0 + ], + "mask_score": 3.483394, + "mask_area_ratio": 0.132506, + "elapsed_seconds": 8.2271 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body shot of a single young man standing, wearing a dark blue hoodie, with a white background and no cropping." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_businessman.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_businessman.png new file mode 100644 index 0000000000000000000000000000000000000000..1052a33e946b8800cf3a67832730acc210c02657 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_businessman.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_dashboard_reflection.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_dashboard_reflection.png new file mode 100644 index 0000000000000000000000000000000000000000..0af909fa673e12569ddfb5d17d928521419a24b6 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_dashboard_reflection.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_passenger.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_passenger.png new file mode 100644 index 0000000000000000000000000000000000000000..40f73e5b53a42c660bd4ebf06076ad68c617fae6 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_passenger.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_pedestrian_crossing.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_pedestrian_crossing.png new file mode 100644 index 0000000000000000000000000000000000000000..93584e0fd74723bb6438a235bf332ca6483470f8 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_pedestrian_crossing.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_pedestrian_walking_away.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_pedestrian_walking_away.png new file mode 100644 index 0000000000000000000000000000000000000000..ddb7661ae1fd9f914657dc49970c32f86a15d679 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_pedestrian_walking_away.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_protester.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_protester.png new file mode 100644 index 0000000000000000000000000000000000000000..6e53928c13063f7dff13c0129a8c1e64d3f77ee3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_protester.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_shopper_standing.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_shopper_standing.png new file mode 100644 index 0000000000000000000000000000000000000000..699a5f5ed756df787bf5a298f6d5f1c678b22576 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_shopper_standing.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_shopper_waiting.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_shopper_waiting.png new file mode 100644 index 0000000000000000000000000000000000000000..38e1eb360a07a38371b93d2bd54a4855246eb2e2 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_shopper_waiting.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_street_lamp.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_street_lamp.png new file mode 100644 index 0000000000000000000000000000000000000000..6c84d3fc83e4e9b424b6047bdcc986e8ef24e50d Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_street_lamp.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_young_man.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_young_man.png new file mode 100644 index 0000000000000000000000000000000000000000..a42bf1375d1533977a02b4de82fd56120b32eef6 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/sam_mask_young_man.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/row.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/row.json new file mode 100644 index 0000000000000000000000000000000000000000..3a428eff762cc54b0584b1907cbfeeecb00f3a7d --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/row.json @@ -0,0 +1,486 @@ +{ + "sample_id": "sample_000008", + "target_total": 10, + "target_people": 8, + "target_objects": 2, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 10, + "n_detected": 10, + "n_subjects": 10, + "subjects": [ + { + "name": "passenger", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", + "source_name": "passenger", + "source_description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people waiting at a train platform next to a stationary train.", + "sub_caption": "passenger: A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera.. Scene role: Walking away on the sidewalk ahead.", + "measured_bbox": [ + 0.8938, + 0.1807, + 0.9951, + 0.7157 + ], + "detection_confidence": "high", + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_passenger.png", + "raw_ref_image": "references/raw_ref_passenger_attempt_01.png", + "reference_verify": "references/reference_verify_passenger.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_passenger_attempt_01.png", + "output": "references/ref_passenger.png", + "mask": "references/sam_mask_passenger.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 31.0, + 682.0, + 985.0 + ], + "mask_score": 3.454991, + "mask_area_ratio": 0.146239, + "elapsed_seconds": 8.0907 + } + }, + { + "name": "shopper_waiting", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", + "source_name": "shopper", + "source_description": "A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag. Source dataset: CrowdHuman. Scene context: A bustling cobblestone street in a town lined with brick buildings, decorated with colorful bunting flags, and populated with pedestrians walking, shopping, and dining outdoors.", + "sub_caption": "shopper: A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag.. Scene role: Standing on the corner curb, waiting to cross the street.", + "measured_bbox": [ + 0.632, + 0.1691, + 0.7153, + 0.6522 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_shopper_waiting.png", + "raw_ref_image": "references/raw_ref_shopper_waiting_attempt_01.png", + "reference_verify": "references/reference_verify_shopper_waiting.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_shopper_waiting_attempt_01.png", + "output": "references/ref_shopper_waiting.png", + "mask": "references/sam_mask_shopper_waiting.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 365.0, + 69.0, + 720.0, + 1006.0 + ], + "mask_score": 3.169183, + "mask_area_ratio": 0.111197, + "elapsed_seconds": 8.0622 + } + }, + { + "name": "shopper_standing", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", + "source_name": "shopper", + "source_description": "A person standing, wearing a blue jacket and blue jeans. Source dataset: CrowdHuman. Scene context: A bustling street market with many pedestrians browsing stalls and goods displayed on the ground.", + "sub_caption": "shopper: A person standing, wearing a blue jacket and blue jeans.. Scene role: Standing near the crosswalk edge amidst the crowd.", + "measured_bbox": [ + 0.5209, + 0.1793, + 0.5735, + 0.5325 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_shopper_standing.png", + "raw_ref_image": "references/raw_ref_shopper_standing_attempt_01.png", + "reference_verify": "references/reference_verify_shopper_standing.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_shopper_standing_attempt_01.png", + "output": "references/ref_shopper_standing.png", + "mask": "references/sam_mask_shopper_standing.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 326.0, + 43.0, + 698.0, + 998.0 + ], + "mask_score": 3.440171, + "mask_area_ratio": 0.161293, + "elapsed_seconds": 8.0811 + } + }, + { + "name": "protester", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", + "source_name": "protester holding sign in back", + "source_description": "A person walking in the background on the right, holding up a large white sign. Source dataset: CrowdHuman. Scene context: A large crowd of people, possibly students, are walking down a city street, some holding signs, suggesting a protest or march.", + "sub_caption": "protester holding sign in back: A person walking in the background on the right, holding up a large white sign.. Scene role: Walking further down the right sidewalk carrying a sign.", + "measured_bbox": [ + 0.8193, + 0.1216, + 0.8875, + 0.4511 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_protester.png", + "raw_ref_image": "references/raw_ref_protester_attempt_01.png", + "reference_verify": "references/reference_verify_protester.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_protester_attempt_01.png", + "output": "references/ref_protester.png", + "mask": "references/sam_mask_protester.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 337.0, + 19.0, + 694.0, + 1013.0 + ], + "mask_score": 3.465365, + "mask_area_ratio": 0.161731, + "elapsed_seconds": 8.3408 + } + }, + { + "name": "pedestrian_crossing", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", + "source_name": "pedestrian", + "source_description": "A person in a white top and dark pants, walking towards the right. Source dataset: CrowdHuman. Scene context: A bustling city street scene with a mix of old brick buildings and modern glass structures, connected by an elevated glass walkway, with a large crowd of pedestrians moving along the wide sidewalk.", + "sub_caption": "pedestrian: A person in a white top and dark pants, walking towards the right.. Scene role: Actively walking across the crosswalk in front of the vehicle.", + "measured_bbox": [ + 0.2322, + 0.1993, + 0.3165, + 0.4965 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_crossing.png", + "raw_ref_image": "references/raw_ref_pedestrian_crossing_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_crossing.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_crossing_attempt_01.png", + "output": "references/ref_pedestrian_crossing.png", + "mask": "references/sam_mask_pedestrian_crossing.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 355.0, + 20.0, + 674.0, + 1012.0 + ], + "mask_score": 3.482863, + "mask_area_ratio": 0.15384, + "elapsed_seconds": 8.0791 + } + }, + { + "name": "pedestrian_walking_away", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", + "source_name": "pedestrian", + "source_description": "A person wearing a white top and dark pants, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people walking on a city street in front of a grand, multi-story commercial building with signs for Virgin Megastore and Monoprix.", + "sub_caption": "pedestrian: A person wearing a white top and dark pants, walking away from the camera.. Scene role: Walking away on the left side of the street.", + "measured_bbox": [ + 0.013, + 0.2139, + 0.0908, + 0.494 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_walking_away.png", + "raw_ref_image": "references/raw_ref_pedestrian_walking_away_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_walking_away.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_pedestrian_walking_away_attempt_01.png", + "output": "references/ref_pedestrian_walking_away.png", + "mask": "references/sam_mask_pedestrian_walking_away.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 350.0, + 50.0, + 671.0, + 987.0 + ], + "mask_score": 3.476273, + "mask_area_ratio": 0.142721, + "elapsed_seconds": 8.2428 + } + }, + { + "name": "young_man", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", + "source_name": "young man", + "source_description": "Standing back row, wearing a dark blue hoodie. Source dataset: CrowdHuman. Scene context: A group of young people, mostly wearing college apparel, are posing for a photograph on bleachers in front of a red building.", + "sub_caption": "young man: Standing back row, wearing a dark blue hoodie.. Scene role: Waiting in the crowd at the corner intersection.", + "measured_bbox": [ + 0.5568, + 0.1246, + 0.6032, + 0.5033 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_young_man.png", + "raw_ref_image": "references/raw_ref_young_man_attempt_01.png", + "reference_verify": "references/reference_verify_young_man.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_young_man_attempt_01.png", + "output": "references/ref_young_man.png", + "mask": "references/sam_mask_young_man.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 352.0, + 65.0, + 671.0, + 928.0 + ], + "mask_score": 3.483394, + "mask_area_ratio": 0.132506, + "elapsed_seconds": 8.2271 + } + }, + { + "name": "businessman", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", + "source_name": "adult in dark suit", + "source_description": "Standing in back rows, wearing dark suit and tie. Source dataset: CrowdHuman. Scene context: A large group of people gathered for a celebration, many wearing bright traditional Chinese costumes, holding props like fans and a lion dance head, posing for a group photo behind a banner.", + "sub_caption": "adult in dark suit: Standing in back rows, wearing dark suit and tie.. Scene role: Standing on the sidewalk behind other pedestrians.", + "measured_bbox": [ + 0.5976, + 0.1322, + 0.6413, + 0.4385 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_businessman.png", + "raw_ref_image": "references/raw_ref_businessman_attempt_01.png", + "reference_verify": "references/reference_verify_businessman.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_businessman_attempt_01.png", + "output": "references/ref_businessman.png", + "mask": "references/sam_mask_businessman.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 336.0, + 19.0, + 690.0, + 1013.0 + ], + "mask_score": 2.970801, + "mask_area_ratio": 0.135565, + "elapsed_seconds": 8.2448 + } + }, + { + "name": "street_lamp", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", + "source_name": "street lamp", + "source_description": "A tall street lamp pole partially visible on the right side of the street. Source dataset: CrowdHuman. Scene context: A crowded city street at night, illuminated by numerous bright, colorful neon signs and shop displays.", + "sub_caption": "street lamp: A tall street lamp pole partially visible on the right side of the street.. Scene role: Towering above the right sidewalk corner, serving as city infrastructure.", + "measured_bbox": [ + 0.014, + 0.03, + 0.109, + 0.254 + ], + "detection_confidence": 0.8, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_lamp.png", + "raw_ref_image": "references/raw_ref_street_lamp_attempt_01.png", + "reference_verify": "references/reference_verify_street_lamp.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_street_lamp_attempt_01.png", + "output": "references/ref_street_lamp.png", + "mask": "references/sam_mask_street_lamp.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 257.0, + 5.0, + 840.0, + 1019.0 + ], + "mask_score": 3.134794, + "mask_area_ratio": 0.049316, + "elapsed_seconds": 8.2643 + } + }, + { + "name": "dashboard_reflection", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c24e7c72-f53291ea:object:8", + "source_name": "vehicle dashboard reflection", + "source_description": "A reflection on the windshield showing the interior dashboard and a document or object with large blue text. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a city street on an overcast day, passing a large brick building on the right.", + "sub_caption": "vehicle dashboard reflection: A reflection on the windshield showing the interior dashboard and a document or object with large blue text.. Scene role: Visible along the bottom edge of the frame, establishing the camera's perspective from inside a car.", + "measured_bbox": [ + 0.143, + 0.6854, + 0.461, + 0.8934 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_dashboard_reflection.png", + "raw_ref_image": "references/raw_ref_dashboard_reflection_attempt_01.png", + "reference_verify": "references/reference_verify_dashboard_reflection.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000008/references/raw_ref_dashboard_reflection_attempt_01.png", + "output": "references/ref_dashboard_reflection.png", + "mask": "references/sam_mask_dashboard_reflection.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 336.0, + 1023.0, + 1023.0 + ], + "mask_score": 1.211741, + "mask_area_ratio": 0.687541, + "elapsed_seconds": 9.9949 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/vocab_task.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..119d818ef50fa7ea05bfb9e04a522b99215e9cf5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/vocab_task.json @@ -0,0 +1,154 @@ +{ + "task_id": "sample_000008", + "sample_id": "sample_000008", + "sample_index": 8, + "target_total": 10, + "target_people": 8, + "target_objects": 2, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 78660, + "image_id": "CrowdHuman:data/data_36/273278,a753000396c56bd.jpg:person:6", + "name": "passenger", + "description": "A man wearing glasses, a black jacket, and a light-colored shirt underneath, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people waiting at a train platform next to a stationary train." + }, + { + "candidate_index": 1, + "source_offset": 20565, + "image_id": "CrowdHuman:data/data_16/273275,5f6b50004676029f.jpg:person:6", + "name": "woman looking up", + "description": "A woman sitting on the grass, wearing a light beige jacket or shirt, looking up and smiling towards the right. Source dataset: CrowdHuman. Scene context: A group of young adults is relaxing and interacting on a grassy slope next to a modern building, some using laptops and others playing with a football." + }, + { + "candidate_index": 2, + "source_offset": 90506, + "image_id": "CrowdHuman:data/data_4/284193,a17b00013852c31.jpg:person:2", + "name": "shopper", + "description": "A woman standing on the right, looking towards the street, wearing a black jacket, dark trousers, and carrying a brown handbag. Source dataset: CrowdHuman. Scene context: A bustling cobblestone street in a town lined with brick buildings, decorated with colorful bunting flags, and populated with pedestrians walking, shopping, and dining outdoors." + }, + { + "candidate_index": 3, + "source_offset": 144829, + "image_id": "CrowdHuman:data/data_59/283554,3417e000ebf80bca.jpg:person:13", + "name": "shopper", + "description": "A person standing, wearing a blue jacket and blue jeans. Source dataset: CrowdHuman. Scene context: A bustling street market with many pedestrians browsing stalls and goods displayed on the ground." + }, + { + "candidate_index": 4, + "source_offset": 143218, + "image_id": "CrowdHuman:data/data_59/273271,2ced70001bb62011.jpg:person:28", + "name": "child in white cardigan", + "description": "Kneeling, wearing a white cardigan over a red top. Source dataset: CrowdHuman. Scene context: A large group of children and a few adults are posed for a group photo in what appears to be a school gym or hall, some holding props or instruments, in front of a decorated banner." + }, + { + "candidate_index": 5, + "source_offset": 39864, + "image_id": "CrowdHuman:data/data_22/273278,c58890002730d04e.jpg:person:8", + "name": "protester holding sign in back", + "description": "A person walking in the background on the right, holding up a large white sign. Source dataset: CrowdHuman. Scene context: A large crowd of people, possibly students, are walking down a city street, some holding signs, suggesting a protest or march." + }, + { + "candidate_index": 6, + "source_offset": 97495, + "image_id": "CrowdHuman:data/data_40/282555,2c9d000ea25c638.jpg:person:3", + "name": "person sitting", + "description": "person sitting on the stairs, wearing a dark top and light pants Source dataset: CrowdHuman. Scene context: A large crowd of people sits and stands on the wide stone steps in front of a grand, classical building, viewed through the arching jets of a fountain in the foreground." + }, + { + "candidate_index": 7, + "source_offset": 82251, + "image_id": "CrowdHuman:data/data_37/283991,1970600072fc59c6.jpg:person:4", + "name": "cafe patron", + "description": "Seated at a table in the foreground, facing right, wearing a light long-sleeved shirt and appearing to interact with someone or something off-camera. Source dataset: CrowdHuman. Scene context: A black and white view of a lively outdoor pedestrian area lined with trees, outdoor cafes with large umbrellas, and numerous people walking and sitting." + }, + { + "candidate_index": 8, + "source_offset": 88856, + "image_id": "CrowdHuman:data/data_4/273278,1f14b00090e94205.jpg:person:1", + "name": "person on screen", + "description": "Partially visible on the right digital display, a person with long brown hair, seen from behind. Source dataset: CrowdHuman. Scene context: A man stands in front of a digital display showing a service change announcement for the Long Island Rail Road." + }, + { + "candidate_index": 9, + "source_offset": 2586, + "image_id": "CrowdHuman:data/data_1/273275,1361a000ad290eee.jpg:person:11", + "name": "pedestrian", + "description": "A person in a white top and dark pants, walking towards the right. Source dataset: CrowdHuman. Scene context: A bustling city street scene with a mix of old brick buildings and modern glass structures, connected by an elevated glass walkway, with a large crowd of pedestrians moving along the wide sidewalk." + }, + { + "candidate_index": 10, + "source_offset": 159889, + "image_id": "CrowdHuman:data/data_64/283081,dccc00022c623e8.jpg:person:5", + "name": "man in gray jacket", + "description": "A man seated in the lower right foreground with an afro, wearing a gray velvety jacket over a dark shirt and blue jeans. Source dataset: CrowdHuman. Scene context: A group portrait of six men posing together against a plain white background, dressed in varied retro-style clothing including hats, suits, and jackets." + }, + { + "candidate_index": 11, + "source_offset": 9906, + "image_id": "CrowdHuman:data/data_12/273275,106895000eb7b7132.jpg:person:5", + "name": "person sitting on right", + "description": "Sitting on the right side of the table, wearing a panda hoodie with hood up, using a laptop. Source dataset: CrowdHuman. Scene context: A group of people wearing panda-themed hoodies are sitting around a circular table in an office setting, working on laptops." + }, + { + "candidate_index": 12, + "source_offset": 165119, + "image_id": "CrowdHuman:data/data_66/273275,194f6000c5fdbe78.jpg:person:12", + "name": "pedestrian", + "description": "A person wearing a white top and dark pants, walking away from the camera. Source dataset: CrowdHuman. Scene context: A large crowd of people walking on a city street in front of a grand, multi-story commercial building with signs for Virgin Megastore and Monoprix." + }, + { + "candidate_index": 13, + "source_offset": 41919, + "image_id": "CrowdHuman:data/data_22/282555,1b65800070e437ee.jpg:person:8", + "name": "young man", + "description": "Standing back row, wearing a dark blue hoodie. Source dataset: CrowdHuman. Scene context: A group of young people, mostly wearing college apparel, are posing for a photograph on bleachers in front of a red building." + }, + { + "candidate_index": 14, + "source_offset": 8187, + "image_id": "CrowdHuman:data/data_11/273275,f16410005b938064.jpg:person:27", + "name": "child in dark jacket", + "description": "A young child crouching in the front, wearing a dark jacket. Source dataset: CrowdHuman. Scene context: A large group of people posing for a group photo outdoors with ancient ruins and a large mountain in the background." + }, + { + "candidate_index": 15, + "source_offset": 138906, + "image_id": "CrowdHuman:data/data_56/282555,cd31e000ef97ef85.jpg:person:23", + "name": "adult in dark suit", + "description": "Standing in back rows, wearing dark suit and tie. Source dataset: CrowdHuman. Scene context: A large group of people gathered for a celebration, many wearing bright traditional Chinese costumes, holding props like fans and a lion dance head, posing for a group photo behind a banner." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 4011, + "image_id": "CrowdHuman:data/data_10/273278,10841e0001c957a7c.jpg:object:6", + "name": "street lamp", + "description": "A tall street lamp pole partially visible on the right side of the street. Source dataset: CrowdHuman. Scene context: A crowded city street at night, illuminated by numerous bright, colorful neon signs and shop displays." + }, + { + "candidate_index": 1, + "source_offset": 3235, + "image_id": "CrowdHuman:data/data_1/283992,13ecf00046b443c7.jpg:object:3", + "name": "camera", + "description": "black digital camera held by the woman in the center Source dataset: CrowdHuman. Scene context: A crowd of tourists gathers around and interacts with a bronze statue of a mermaid sitting on a large rock." + }, + { + "candidate_index": 2, + "source_offset": 209406, + "image_id": "BDD100K:c24e7c72-f53291ea:object:8", + "name": "vehicle dashboard reflection", + "description": "A reflection on the windshield showing the interior dashboard and a document or object with large blue text. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a city street on an overcast day, passing a large brick building on the right." + }, + { + "candidate_index": 3, + "source_offset": 109024, + "image_id": "CrowdHuman:data/data_64/273278,edaf2000081eef33.jpg:object:9", + "name": "shoulder bag", + "description": "A light-colored shoulder bag carried by the woman in the teal jacket. Source dataset: CrowdHuman. Scene context: A group of people walks on a grassy area in front of a large, two-story house with a porch." + } + ], + "rng_seed": 1782765825, + "created_at": 1782223460.3670564 +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/bbox_overlay.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..bc7842fc3155ced13354b0f64af8ca861f9792b2 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a4115f82ae1274c5fdb8da31fea372437f20b8f0779c9d0c149bcd09d92a269 +size 1767610 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/compose_prompt.txt b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..73ffe867339fb231c58852b1749202e16fbee087 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/compose_prompt.txt @@ -0,0 +1,79 @@ +Render the following JSON scene specification as a photorealistic 1248x832 image using a true 3:2 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "Urban city street approaching a concrete overpass during early evening.", + "activity": "A pedestrian walks along the side of the road as seen from the perspective of an approaching vehicle.", + "composition": "Eye-level view from the middle of the road. A solid yellow lane line leads from the bottom foreground toward the center. The pedestrian is positioned on the sidewalk to the right. An overpass spans horizontally across the upper midground, with glowing street lights attached or nearby. A yellow multi-story building is visible in the right background behind the sidewalk.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_in_suit", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", + "source_name": "pedestrian in suit", + "description": "A person walking, wearing a dark suit.", + "role_in_scene": "walking along the right side of the street on the sidewalk" + } + ], + "objects": [ + { + "name": "yellow_building", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_32/273275,7595c000fa55d9d7.jpg:object:5", + "source_name": "yellow building", + "description": "A multi-story building with a yellow ochre facade and numerous shuttered windows.", + "role_in_scene": "providing a backdrop on the right side of the street scene" + }, + { + "name": "yellow_lane_line", + "source_index": 1, + "source_image_id": "BDD100K:c9195e43-b2fdd978:object:2", + "source_name": "yellow lane line", + "description": "A solid yellow line painted on the road surface indicating the edge of the lane.", + "role_in_scene": "extending along the road surface towards the distance" + }, + { + "name": "overpass", + "source_index": 3, + "source_image_id": "BDD100K:bcd32109-8b0b1eef:object:6", + "source_name": "overpass", + "description": "A concrete bridge structure spanning across the street ahead.", + "role_in_scene": "arching over the road in the midground" + }, + { + "name": "street_light", + "source_index": 7, + "source_image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", + "source_name": "street light", + "description": "Bright, glowing street lights illuminating the road.", + "role_in_scene": "overhead fixtures providing illumination over the roadway and sidewalk" + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_overpass.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_overpass.png new file mode 100644 index 0000000000000000000000000000000000000000..e24f40a38e00c0969615119605d6d531cb0dc3d5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_overpass.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63d6ffb97161108cf96c20912aa7f7b083e3191c352aaf61e95fe73ae09891d5 +size 530903 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_pedestrian_in_suit.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_pedestrian_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..89f44e8d45b5c6f59a8e2bbf7158506d83b3a513 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_pedestrian_in_suit.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_street_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..2d209e8ab1d5b7265e4380302b23ef2da5395ddb Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_yellow_building.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_yellow_building.png new file mode 100644 index 0000000000000000000000000000000000000000..b51b3d0a41278788f4a987125b2c88d0b871d5ee --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_yellow_building.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75bb567c5afc500668c2d6b58b1c0bee00d06a953676ff6dcef33c8e5c963456 +size 455251 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_yellow_lane_line.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_yellow_lane_line.png new file mode 100644 index 0000000000000000000000000000000000000000..42e39fa9e5d69ed259c5bdbe15aaf3c2d40562fc --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_yellow_lane_line.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:367fd97999b21206f474a0f0111c820b76488514ffa90ecb90c96c8718c2573b +size 117398 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_overpass.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_overpass.png new file mode 100644 index 0000000000000000000000000000000000000000..e576be34c635e1c0e140056f810f86c474d79fb0 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_overpass.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ff08ee6b089e889b88fefa7d25283365d32266cca1442e6a8ce12866618240e +size 611549 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_pedestrian_in_suit.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_pedestrian_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..2bb3644928c0fb607705a9e1015741efc813f024 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_pedestrian_in_suit.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_street_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..17ef428b55d5c53ecf17e457d30310ba207e67e7 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_yellow_building.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_yellow_building.png new file mode 100644 index 0000000000000000000000000000000000000000..7746744bb556b43f6404d397468f78d2e947ce4b --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_yellow_building.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e611a69a3d7b0f8c285bdcaee1bc3b3a2100ed4bebd68206db14e5bc6e7c7a6f +size 545920 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_yellow_lane_line.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_yellow_lane_line.png new file mode 100644 index 0000000000000000000000000000000000000000..b81170e4045c3f9023fe20edc9b9c189d9a182f8 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_yellow_lane_line.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eec22adcc256c8929348ba918aadd0463504f54c34d348af5d4a82e31ff98659 +size 145839 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/detections.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..a18cad50a241b4297f760435474b4e299a8bedea --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/detections.json @@ -0,0 +1,97 @@ +[ + { + "name": "pedestrian_in_suit", + "present": true, + "bbox": [ + 0.7498, + 0.407, + 0.8062, + 0.6382 + ], + "confidence": 0.99, + "notes": "A person walking, wearing a dark suit.", + "coarse_bbox": [ + 0.748, + 0.408, + 0.806, + 0.638 + ], + "refine_crop": "crops/detect_refine_pedestrian_in_suit.png" + }, + { + "name": "yellow_building", + "present": true, + "bbox": [ + 0.6651, + 0.0, + 0.9968, + 0.6296 + ], + "confidence": 0.99, + "notes": "The yellow building with shuttered windows occupies the majority of the image, from the top down to its base on the right side of the image.", + "coarse_bbox": [ + 0.675, + 0.0, + 0.999, + 0.623 + ], + "refine_crop": "crops/detect_refine_yellow_building.png" + }, + { + "name": "yellow_lane_line", + "present": true, + "bbox": [ + 0.4487, + 0.5308, + 0.5367, + 1.0 + ], + "confidence": 0.98, + "notes": "The yellow lane line is clearly visible running from the bottom to the top center.", + "coarse_bbox": [ + 0.447, + 0.538, + 0.535, + 1.0 + ], + "refine_crop": "crops/detect_refine_yellow_lane_line.png" + }, + { + "name": "overpass", + "present": true, + "bbox": [ + 0.0031, + 0.1925, + 0.6919, + 0.5364 + ], + "confidence": 0.95, + "notes": "The overpass is a prominent concrete bridge structure spanning across the street.", + "coarse_bbox": [ + 0.002, + 0.188, + 0.694, + 0.553 + ], + "refine_crop": "crops/detect_refine_overpass.png" + }, + { + "name": "street_light", + "present": true, + "bbox": [ + 0.5545, + 0.0, + 0.5804, + 0.0625 + ], + "confidence": 0.95, + "notes": "A bright, glowing street light is clearly visible.", + "coarse_bbox": [ + 0.553, + 0.002, + 0.581, + 0.058 + ], + "refine_crop": "crops/detect_refine_street_light.png" + } +] diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/main_image.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..fc21a0f93804026d11bfaafa0e2ae94f124686cf --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac3bef73934a6198de5f97692085d2ea3ab917e3138559c7498875b3faa4b67f +size 1831206 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/plan.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..e98be4d31eb69188fdebdf601125a05b4953e92e --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/plan.json @@ -0,0 +1,145 @@ +{ + "sample_id": "sample_000009", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1248, + 832 + ], + "aspect_ratio": "3:2", + "style": "photorealistic" + }, + "scene": { + "setting": "Urban city street approaching a concrete overpass during early evening.", + "activity": "A pedestrian walks along the side of the road as seen from the perspective of an approaching vehicle.", + "composition": "Eye-level view from the middle of the road. A solid yellow lane line leads from the bottom foreground toward the center. The pedestrian is positioned on the sidewalk to the right. An overpass spans horizontally across the upper midground, with glowing street lights attached or nearby. A yellow multi-story building is visible in the right background behind the sidewalk.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 3:2 composition", + "final canvas size 1248x832", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_in_suit", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", + "source_name": "pedestrian in suit", + "description": "A person walking, wearing a dark suit.", + "role_in_scene": "walking along the right side of the street on the sidewalk" + } + ], + "objects": [ + { + "name": "yellow_building", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_32/273275,7595c000fa55d9d7.jpg:object:5", + "source_name": "yellow building", + "description": "A multi-story building with a yellow ochre facade and numerous shuttered windows.", + "role_in_scene": "providing a backdrop on the right side of the street scene" + }, + { + "name": "yellow_lane_line", + "source_index": 1, + "source_image_id": "BDD100K:c9195e43-b2fdd978:object:2", + "source_name": "yellow lane line", + "description": "A solid yellow line painted on the road surface indicating the edge of the lane.", + "role_in_scene": "extending along the road surface towards the distance" + }, + { + "name": "overpass", + "source_index": 3, + "source_image_id": "BDD100K:bcd32109-8b0b1eef:object:6", + "source_name": "overpass", + "description": "A concrete bridge structure spanning across the street ahead.", + "role_in_scene": "arching over the road in the midground" + }, + { + "name": "street_light", + "source_index": 7, + "source_image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", + "source_name": "street light", + "description": "Bright, glowing street lights illuminating the road.", + "role_in_scene": "overhead fixtures providing illumination over the roadway and sidewalk" + } + ] + }, + "expected_subjects": [ + { + "name": "pedestrian_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", + "source_name": "pedestrian in suit", + "source_description": "A person walking, wearing a dark suit. Source dataset: CrowdHuman. Scene context: A crowd of tourists gathers outside a grand palace, with some sitting on a stone balustrade and others walking along a wide paved path.", + "sub_caption": "pedestrian in suit: A person walking, wearing a dark suit.. Scene role: walking along the right side of the street on the sidewalk", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "yellow_building", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_32/273275,7595c000fa55d9d7.jpg:object:5", + "source_name": "yellow building", + "source_description": "A multi-story building with a yellow ochre facade and numerous shuttered windows visible in the background on the far left. Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered around a monumental, ornate stone fountain with large statues and cascading water, situated in a city square surrounded by buildings.", + "sub_caption": "yellow building: A multi-story building with a yellow ochre facade and numerous shuttered windows.. Scene role: providing a backdrop on the right side of the street scene", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "yellow_lane_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c9195e43-b2fdd978:object:2", + "source_name": "yellow lane line", + "source_description": "A solid yellow line painted on the road surface indicating the edge of the lane. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a highway, with a white car ahead in the next lane and lush green bushes along the right side barrier.", + "sub_caption": "yellow lane line: A solid yellow line painted on the road surface indicating the edge of the lane.. Scene role: extending along the road surface towards the distance", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "overpass", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd32109-8b0b1eef:object:6", + "source_name": "overpass", + "source_description": "A concrete bridge structure spanning across the highway ahead, casting a shadow over the road. Source dataset: BDD100K. Scene context: View from a moving vehicle on a multi-lane highway, approaching an underpass with several cars ahead.", + "sub_caption": "overpass: A concrete bridge structure spanning across the street ahead.. Scene role: arching over the road in the midground", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", + "source_name": "street light", + "source_description": "Bright, glowing street lights illuminating the road. Source dataset: BDD100K. Scene context: Nighttime driving scene on a city street with multiple vehicles, overhead structures, and traffic signals.", + "sub_caption": "street light: Bright, glowing street lights illuminating the road.. Scene role: overhead fixtures providing illumination over the roadway and sidewalk", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000009/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references.json new file mode 100644 index 0000000000000000000000000000000000000000..1ca55e1b89516411592ab0e395adf3cf98672b28 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references.json @@ -0,0 +1,165 @@ +{ + "references": [ + { + "name": "pedestrian_in_suit", + "ref_image": "references/ref_pedestrian_in_suit.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_suit_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_in_suit.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_pedestrian_in_suit_attempt_01.png", + "output": "references/ref_pedestrian_in_suit.png", + "mask": "references/sam_mask_pedestrian_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 347.0, + 40.0, + 695.0, + 1018.0 + ], + "mask_score": 3.473173, + "mask_area_ratio": 0.152202, + "elapsed_seconds": 9.5571 + }, + "reference_verify": "references/reference_verify_pedestrian_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "yellow_building", + "ref_image": "references/ref_yellow_building.png", + "raw_ref_image": "references/raw_ref_yellow_building_attempt_01.png", + "diversify_input": "crops/diversify_input_yellow_building.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_building_attempt_01.png", + "output": "references/ref_yellow_building.png", + "mask": "references/sam_mask_yellow_building.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 4.0, + 0.0, + 1023.0, + 995.0 + ], + "mask_score": 2.131685, + "mask_area_ratio": 0.742735, + "elapsed_seconds": 8.5184 + }, + "reference_verify": "references/reference_verify_yellow_building.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "yellow_lane_line", + "ref_image": "references/ref_yellow_lane_line.png", + "raw_ref_image": "references/raw_ref_yellow_lane_line_attempt_01.png", + "diversify_input": "crops/diversify_input_yellow_lane_line.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_lane_line_attempt_01.png", + "output": "references/ref_yellow_lane_line.png", + "mask": "references/sam_mask_yellow_lane_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 12.0, + 55.0, + 1018.0, + 969.0 + ], + "mask_score": 0.925602, + "mask_area_ratio": 0.952688, + "elapsed_seconds": 8.3819 + }, + "reference_verify": "references/reference_verify_yellow_lane_line.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "overpass", + "ref_image": "references/ref_overpass.png", + "raw_ref_image": "references/raw_ref_overpass_attempt_01.png", + "diversify_input": "crops/diversify_input_overpass.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_overpass_attempt_01.png", + "output": "references/ref_overpass.png", + "mask": "references/sam_mask_overpass.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 28.0, + 284.0, + 965.0, + 771.0 + ], + "mask_score": 3.406555, + "mask_area_ratio": 0.166775, + "elapsed_seconds": 8.3597 + }, + "reference_verify": "references/reference_verify_overpass.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "street_light", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "diversify_input": "crops/diversify_input_street_light.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 205.0, + 0.0, + 824.0, + 857.0 + ], + "mask_score": 3.391373, + "mask_area_ratio": 0.189186, + "elapsed_seconds": 8.2179 + }, + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_overpass.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_overpass.png new file mode 100644 index 0000000000000000000000000000000000000000..11efffa8f8b7bf14cd9c1f602c9f34478d25b0a8 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_overpass.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4036d078ab1cf45b7ea1497a80067e4d5282ac30863a7380c4616a3c08435617 +size 326857 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_pedestrian_in_suit.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_pedestrian_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..c17f80de60d216c6293c37b01d437a46d495b1a5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_pedestrian_in_suit.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b5aac081acb73be78f9380b884102f35875ac7fdccd188011643b74c3cd87f +size 269476 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_street_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..88daa1f245785fbedeff7165e65b4ee2c03ddd07 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_street_light.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f24d0a8d752992d0518e7557b3a22a96212e8d38d4d1b02da4486fa45004b3c +size 309554 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_yellow_building.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_yellow_building.png new file mode 100644 index 0000000000000000000000000000000000000000..cb8eaf0e1c7e6f2936e3f342abdf4f772027298d --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_yellow_building.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33af74062f0f6570c48e02e28198de6c0595f184c726170a6b00a67c4a9ab8d4 +size 1342837 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_yellow_lane_line.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_yellow_lane_line.png new file mode 100644 index 0000000000000000000000000000000000000000..a0b110838065f6b6bdbf2f3d1659ba42125a4bf6 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_yellow_lane_line.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d08f292cbf02a67184edccf050a937557fb863761f24aa9f4b890081a9b2815 +size 775704 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_overpass.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_overpass.json new file mode 100644 index 0000000000000000000000000000000000000000..d8af9cd5028310d8939c10cf59e121c25857202a --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_overpass.json @@ -0,0 +1,46 @@ +{ + "name": "overpass", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_overpass_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_overpass_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_overpass_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_overpass_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_ref_overpass_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_sam_mask_overpass_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 28.0, + 284.0, + 965.0, + 771.0 + ], + "mask_score": 3.406555, + "mask_area_ratio": 0.166775, + "elapsed_seconds": 8.3597 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image clearly shows an isolated overpass/concrete bridge structure on a white background. It satisfies all hard requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_pedestrian_in_suit.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_pedestrian_in_suit.json new file mode 100644 index 0000000000000000000000000000000000000000..7c7c50f913a07499d12a5ee615df88c10132a8c6 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_pedestrian_in_suit.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_in_suit", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_in_suit_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_in_suit_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_in_suit_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_pedestrian_in_suit_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_ref_pedestrian_in_suit_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_sam_mask_pedestrian_in_suit_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 347.0, + 40.0, + 695.0, + 1018.0 + ], + "mask_score": 3.473173, + "mask_area_ratio": 0.152202, + "elapsed_seconds": 9.5571 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full-body view of a person in a suit on a white background with ample margins, meeting all hard requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_street_light.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_street_light.json new file mode 100644 index 0000000000000000000000000000000000000000..527dc987f46270f51ca779a6d98063b073e94baf --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_street_light.json @@ -0,0 +1,46 @@ +{ + "name": "street_light", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_street_light_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_street_light_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_street_light_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_ref_street_light_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_sam_mask_street_light_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 205.0, + 0.0, + 824.0, + 857.0 + ], + "mask_score": 3.391373, + "mask_area_ratio": 0.189186, + "elapsed_seconds": 8.2179 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The main luminaire of the street light is clearly visible and isolated against a white background. The supporting arm/pipe is cropped at the top, but the subject remains highly recognizable and serves as a good reference for the lamp fixture." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_yellow_building.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_yellow_building.json new file mode 100644 index 0000000000000000000000000000000000000000..98c5e6667fa5d8b3bb0c969d5a5aefd79d10e046 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_yellow_building.json @@ -0,0 +1,46 @@ +{ + "name": "yellow_building", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_yellow_building_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_yellow_building_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_yellow_building_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_building_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_ref_yellow_building_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_sam_mask_yellow_building_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 4.0, + 0.0, + 1023.0, + 995.0 + ], + "mask_score": 2.131685, + "mask_area_ratio": 0.742735, + "elapsed_seconds": 8.5184 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image is a cropped view of the building facade, which is acceptable for a large architectural subject. It clearly shows the yellow color and window details." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_yellow_lane_line.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_yellow_lane_line.json new file mode 100644 index 0000000000000000000000000000000000000000..1cfa39fd7c94f24c5e845ddd375e090882de8248 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/reference_verify_yellow_lane_line.json @@ -0,0 +1,46 @@ +{ + "name": "yellow_lane_line", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_yellow_lane_line_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_yellow_lane_line_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_yellow_lane_line_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_lane_line_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_ref_yellow_lane_line_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/candidate_sam_mask_yellow_lane_line_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 12.0, + 55.0, + 1018.0, + 969.0 + ], + "mask_score": 0.925602, + "mask_area_ratio": 0.952688, + "elapsed_seconds": 8.3819 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image clearly shows a representation of yellow lane lines against a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_overpass.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_overpass.png new file mode 100644 index 0000000000000000000000000000000000000000..2676931a96540e3cb1e3fbb2b726f68f68cbfdd8 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_overpass.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_pedestrian_in_suit.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_pedestrian_in_suit.png new file mode 100644 index 0000000000000000000000000000000000000000..6f7867283ee165d4459d3cdfbde0b747b26d8fff Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_pedestrian_in_suit.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_street_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..6a59616993d65046da195c0c424afea6dfe7368c Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_yellow_building.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_yellow_building.png new file mode 100644 index 0000000000000000000000000000000000000000..fddcdc27559e63695e38d0f2be77dd82ee63be96 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_yellow_building.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_yellow_lane_line.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_yellow_lane_line.png new file mode 100644 index 0000000000000000000000000000000000000000..4c9da8478e5d405d295094493b84cb667d51d829 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/sam_mask_yellow_lane_line.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/row.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/row.json new file mode 100644 index 0000000000000000000000000000000000000000..594f74bd375aa36d1d5928b67b75d41f6748d18f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/row.json @@ -0,0 +1,256 @@ +{ + "sample_id": "sample_000009", + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "canvas_size": [ + 1248, + 832 + ], + "canvas_aspect_ratio": "3:2", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 5, + "n_detected": 5, + "n_subjects": 5, + "subjects": [ + { + "name": "pedestrian_in_suit", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", + "source_name": "pedestrian in suit", + "source_description": "A person walking, wearing a dark suit. Source dataset: CrowdHuman. Scene context: A crowd of tourists gathers outside a grand palace, with some sitting on a stone balustrade and others walking along a wide paved path.", + "sub_caption": "pedestrian in suit: A person walking, wearing a dark suit.. Scene role: walking along the right side of the street on the sidewalk", + "measured_bbox": [ + 0.7498, + 0.407, + 0.8062, + 0.6382 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_in_suit.png", + "raw_ref_image": "references/raw_ref_pedestrian_in_suit_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_in_suit.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_pedestrian_in_suit_attempt_01.png", + "output": "references/ref_pedestrian_in_suit.png", + "mask": "references/sam_mask_pedestrian_in_suit.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 347.0, + 40.0, + 695.0, + 1018.0 + ], + "mask_score": 3.473173, + "mask_area_ratio": 0.152202, + "elapsed_seconds": 9.5571 + } + }, + { + "name": "yellow_building", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_32/273275,7595c000fa55d9d7.jpg:object:5", + "source_name": "yellow building", + "source_description": "A multi-story building with a yellow ochre facade and numerous shuttered windows visible in the background on the far left. Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered around a monumental, ornate stone fountain with large statues and cascading water, situated in a city square surrounded by buildings.", + "sub_caption": "yellow building: A multi-story building with a yellow ochre facade and numerous shuttered windows.. Scene role: providing a backdrop on the right side of the street scene", + "measured_bbox": [ + 0.6651, + 0.0, + 0.9968, + 0.6296 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_yellow_building.png", + "raw_ref_image": "references/raw_ref_yellow_building_attempt_01.png", + "reference_verify": "references/reference_verify_yellow_building.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_building_attempt_01.png", + "output": "references/ref_yellow_building.png", + "mask": "references/sam_mask_yellow_building.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 4.0, + 0.0, + 1023.0, + 995.0 + ], + "mask_score": 2.131685, + "mask_area_ratio": 0.742735, + "elapsed_seconds": 8.5184 + } + }, + { + "name": "yellow_lane_line", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c9195e43-b2fdd978:object:2", + "source_name": "yellow lane line", + "source_description": "A solid yellow line painted on the road surface indicating the edge of the lane. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a highway, with a white car ahead in the next lane and lush green bushes along the right side barrier.", + "sub_caption": "yellow lane line: A solid yellow line painted on the road surface indicating the edge of the lane.. Scene role: extending along the road surface towards the distance", + "measured_bbox": [ + 0.4487, + 0.5308, + 0.5367, + 1.0 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_yellow_lane_line.png", + "raw_ref_image": "references/raw_ref_yellow_lane_line_attempt_01.png", + "reference_verify": "references/reference_verify_yellow_lane_line.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_yellow_lane_line_attempt_01.png", + "output": "references/ref_yellow_lane_line.png", + "mask": "references/sam_mask_yellow_lane_line.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 12.0, + 55.0, + 1018.0, + 969.0 + ], + "mask_score": 0.925602, + "mask_area_ratio": 0.952688, + "elapsed_seconds": 8.3819 + } + }, + { + "name": "overpass", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bcd32109-8b0b1eef:object:6", + "source_name": "overpass", + "source_description": "A concrete bridge structure spanning across the highway ahead, casting a shadow over the road. Source dataset: BDD100K. Scene context: View from a moving vehicle on a multi-lane highway, approaching an underpass with several cars ahead.", + "sub_caption": "overpass: A concrete bridge structure spanning across the street ahead.. Scene role: arching over the road in the midground", + "measured_bbox": [ + 0.0031, + 0.1925, + 0.6919, + 0.5364 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_overpass.png", + "raw_ref_image": "references/raw_ref_overpass_attempt_01.png", + "reference_verify": "references/reference_verify_overpass.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_overpass_attempt_01.png", + "output": "references/ref_overpass.png", + "mask": "references/sam_mask_overpass.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 28.0, + 284.0, + 965.0, + 771.0 + ], + "mask_score": 3.406555, + "mask_area_ratio": 0.166775, + "elapsed_seconds": 8.3597 + } + }, + { + "name": "street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", + "source_name": "street light", + "source_description": "Bright, glowing street lights illuminating the road. Source dataset: BDD100K. Scene context: Nighttime driving scene on a city street with multiple vehicles, overhead structures, and traffic signals.", + "sub_caption": "street light: Bright, glowing street lights illuminating the road.. Scene role: overhead fixtures providing illumination over the roadway and sidewalk", + "measured_bbox": [ + 0.5545, + 0.0, + 0.5804, + 0.0625 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_street_light.png", + "raw_ref_image": "references/raw_ref_street_light_attempt_01.png", + "reference_verify": "references/reference_verify_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000009/references/raw_ref_street_light_attempt_01.png", + "output": "references/ref_street_light.png", + "mask": "references/sam_mask_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 205.0, + 0.0, + 824.0, + 857.0 + ], + "mask_score": 3.391373, + "mask_area_ratio": 0.189186, + "elapsed_seconds": 8.2179 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/vocab_task.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..0efe4ca0a811ffa0a6dad2b90afce042b2d7f992 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/vocab_task.json @@ -0,0 +1,84 @@ +{ + "task_id": "sample_000009", + "sample_id": "sample_000009", + "sample_index": 9, + "target_total": 5, + "target_people": 1, + "target_objects": 4, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 8921, + "image_id": "CrowdHuman:data/data_11/282555,d1509000eb848483.jpg:person:21", + "name": "pedestrian in suit", + "description": "A person walking, wearing a dark suit. Source dataset: CrowdHuman. Scene context: A crowd of tourists gathers outside a grand palace, with some sitting on a stone balustrade and others walking along a wide paved path." + }, + { + "candidate_index": 1, + "source_offset": 24385, + "image_id": "CrowdHuman:data/data_17/282555,b049a000a718515c.jpg:person:9", + "name": "spectator", + "description": "person in the background crowd on the upper level or further back, wearing a blue jacket. Source dataset: CrowdHuman. Scene context: A large crowd gathered in an indoor mall watching a staged hockey-like game played on the floor." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 49557, + "image_id": "CrowdHuman:data/data_32/273275,7595c000fa55d9d7.jpg:object:5", + "name": "yellow building", + "description": "A multi-story building with a yellow ochre facade and numerous shuttered windows visible in the background on the far left. Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered around a monumental, ornate stone fountain with large statues and cascading water, situated in a city square surrounded by buildings." + }, + { + "candidate_index": 1, + "source_offset": 240928, + "image_id": "BDD100K:c9195e43-b2fdd978:object:2", + "name": "yellow lane line", + "description": "A solid yellow line painted on the road surface indicating the edge of the lane. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a highway, with a white car ahead in the next lane and lush green bushes along the right side barrier." + }, + { + "candidate_index": 2, + "source_offset": 198760, + "image_id": "BDD100K:bfbe2ad2-ec5dea9d:object:9", + "name": "windshield reflection", + "description": "A blurry, bright green rectangular reflection appearing on the vehicle's windshield on the left side. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving on a wet city street behind a dark SUV, with a construction zone marked by barricades on the right side." + }, + { + "candidate_index": 3, + "source_offset": 183256, + "image_id": "BDD100K:bcd32109-8b0b1eef:object:6", + "name": "overpass", + "description": "A concrete bridge structure spanning across the highway ahead, casting a shadow over the road. Source dataset: BDD100K. Scene context: View from a moving vehicle on a multi-lane highway, approaching an underpass with several cars ahead." + }, + { + "candidate_index": 4, + "source_offset": 44513, + "image_id": "CrowdHuman:data/data_31/273275,83e380001691cba1.jpg:object:2", + "name": "wooden cabinet", + "description": "A tall, dark wood cabinet or hutch located behind the people on the left side. Source dataset: CrowdHuman. Scene context: A group of people, including priests and nuns, poses for a photograph behind a long dining table set for a meal in a wood-paneled room." + }, + { + "candidate_index": 5, + "source_offset": 61366, + "image_id": "CrowdHuman:data/data_4/283554,13b0000ade9cfd6.jpg:object:1", + "name": "sunglasses", + "description": "Dark sunglasses worn by the woman in the grey sweater. Source dataset: CrowdHuman. Scene context: Three women walk across a paved urban street, with one carrying a motorcycle helmet." + }, + { + "candidate_index": 6, + "source_offset": 18898, + "image_id": "CrowdHuman:data/data_18/282555,101ffe000a8c8717f.jpg:object:11", + "name": "metal railing", + "description": "A metal railing visible on the far right edge of the scene. Source dataset: CrowdHuman. Scene context: A group of people, possibly a tour group, is walking outdoors on a paved area near a stone building, with one person holding a green flag and another speaking into a microphone." + }, + { + "candidate_index": 7, + "source_offset": 185323, + "image_id": "BDD100K:bd5bb58b-24a6f5e7:object:2", + "name": "street light", + "description": "Bright, glowing street lights illuminating the road. Source dataset: BDD100K. Scene context: Nighttime driving scene on a city street with multiple vehicles, overhead structures, and traffic signals." + } + ], + "rng_seed": 1782870554, + "created_at": 1782223460.3891625 +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/bbox_overlay.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..af1f95ab455e268dfe91edfcb43b495192ee55f6 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e129420eb8f2b4e6d3630d4e617812e9da6c37732d4669e7ffa231b8c64dcbf6 +size 1569256 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/compose_prompt.txt b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..814d71dee303e75d1b4c61e42b27b0009403d90e --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/compose_prompt.txt @@ -0,0 +1,111 @@ +Render the following JSON scene specification as a photorealistic 1280x720 image using a true 16:9 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "A sunny city street lined with parked vehicles and a busy sidewalk", + "activity": "Pedestrians are walking along the sidewalk and waiting near the curb while traffic moves down the road", + "composition": "Wide-angle street view from the perspective of an approaching vehicle, with a parked black SUV anchoring the right foreground, pedestrians distributed along the right sidewalk, and distant pedestrians providing depth down the receding street", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_black_jacket", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", + "source_name": "pedestrian", + "description": "Person walking away, wearing a black jacket and dark pants.", + "role_in_scene": "walking away on the sidewalk to the right" + }, + { + "name": "pedestrian_backpack", + "source_index": 3, + "source_image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", + "source_name": "pedestrian", + "description": "A man wearing a blue t-shirt and a backpack.", + "role_in_scene": "standing near the curb looking toward the road" + }, + { + "name": "pedestrian_red_jacket", + "source_index": 4, + "source_image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", + "source_name": "pedestrian standing", + "description": "A person standing, wearing a bright red jacket.", + "role_in_scene": "waiting at a crosswalk edge" + }, + { + "name": "pedestrian_striped_shirt", + "source_index": 5, + "source_image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", + "source_name": "pedestrian", + "description": "Person wearing a striped shirt and dark pants.", + "role_in_scene": "strolling along the sidewalk" + }, + { + "name": "man_pink_shirt", + "source_index": 9, + "source_image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", + "source_name": "man talking to young man", + "description": "Man wearing a pink shirt and dark shorts.", + "role_in_scene": "conversing near a storefront on the sidewalk" + }, + { + "name": "pedestrian_light_jacket", + "source_index": 11, + "source_image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", + "source_name": "pedestrian", + "description": "Individual wearing a light-colored jacket.", + "role_in_scene": "approaching the street intersection" + }, + { + "name": "pedestrian_light_blue_shirt", + "source_index": 12, + "source_image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", + "source_name": "pedestrian", + "description": "A person in a light blue shirt walking away from the camera.", + "role_in_scene": "walking further down the sidewalk in the mid-ground" + }, + { + "name": "distant_pedestrian", + "source_index": 15, + "source_image_id": "CrowdHuman:data/data_21/283991,2145f000467f0a28.jpg:person:14", + "source_name": "distant pedestrian", + "description": "A person walking in the background.", + "role_in_scene": "walking in the far background down the street" + } + ], + "objects": [ + { + "name": "black_suv", + "source_index": 1, + "source_image_id": "BDD100K:b8fe1054-42625c45:object:1", + "source_name": "black suv", + "description": "A black SUV parked ahead on the right.", + "role_in_scene": "parked at the curb on the right side of the street" + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_black_suv.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_black_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..60de5b97dcc801f97e14163be8361837c45f46cf --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_black_suv.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6b2f2ab6812f514c864b27d1510f8dadc9e593919e1f370f0eeb90e6306223c +size 204988 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_distant_pedestrian.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_distant_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..6bcb2e7dee2570bf8e54e6618fa9bda93658c231 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_distant_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_man_pink_shirt.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_man_pink_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..cced0a5b253ab6893c626e3adfb35dc32b65d38a Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_man_pink_shirt.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_backpack.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_backpack.png new file mode 100644 index 0000000000000000000000000000000000000000..73f4a05a3dd16924d869ddc961158c73d1b56564 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_backpack.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_black_jacket.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_black_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..c5e1fdbceff4c13853195f47534be69daea46281 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_black_jacket.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_light_blue_shirt.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_light_blue_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..c4fbd6661b9ef0c693ffe29fd4163cec06da6ad7 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_light_blue_shirt.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_light_jacket.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_light_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..6bcb2e7dee2570bf8e54e6618fa9bda93658c231 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_light_jacket.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_red_jacket.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_red_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..54e4f1bb32befd15cfd53926f9e72e8f6935bd4c Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_red_jacket.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_striped_shirt.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_striped_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..37b7996dfa3b4ab9ed5575c9def111f0d671a7e5 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_pedestrian_striped_shirt.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_black_suv.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_black_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..7c9936506ce9a853588ae650ab47074e0851decf --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_black_suv.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bfb8a4d32aba323772450139345dfe695a716f9424a9652ceb5c1d4abe204c2 +size 268886 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_distant_pedestrian.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_distant_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..2dc657786837b3174f155ff10bf8a04ea2fdb405 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_distant_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_man_pink_shirt.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_man_pink_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..8a0977343b9d2ffc87510fbcc955771df0c74b1d Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_man_pink_shirt.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_backpack.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_backpack.png new file mode 100644 index 0000000000000000000000000000000000000000..2c371e728844587038b52891747db1f1a51fbf98 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_backpack.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_black_jacket.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_black_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..fbbfceeeb3f1af10e55dca0c4aa29b572e6a9c2c --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_black_jacket.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a441cabc10c37a897be571d410b3ef50938feab033725700367967eefd74dad5 +size 130500 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_light_blue_shirt.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_light_blue_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..919dbfe08ded68090b0e6d18961a2cfd9ce101f8 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_light_blue_shirt.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_light_jacket.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_light_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..8fb7ff1c83550ff213a29a3207193edc32cce6ce Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_light_jacket.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_red_jacket.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_red_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..35ce11e4b735002a6182dfffcfb3e530a68c9837 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_red_jacket.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_striped_shirt.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_striped_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..5121f31fa6a6399dc3c2d4884ac49c9c0b9790e4 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_striped_shirt.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/detections.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..a00a07a8fb7010cf981232d1e4f34ee6feb1a470 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/detections.json @@ -0,0 +1,173 @@ +[ + { + "name": "pedestrian_black_jacket", + "present": true, + "bbox": [ + 0.8947, + 0.408, + 0.9768, + 0.8 + ], + "confidence": 0.98, + "notes": "Tight bounding box around the main person walking away, wearing a black jacket and dark pants.", + "coarse_bbox": [ + 0.894, + 0.408, + 0.975, + 0.772 + ], + "refine_crop": "crops/detect_refine_pedestrian_black_jacket.png" + }, + { + "name": "pedestrian_backpack", + "present": true, + "bbox": [ + 0.6953, + 0.4394, + 0.7156, + 0.5151 + ], + "confidence": 0.95, + "notes": "Backpack worn by the pedestrian.", + "coarse_bbox": [ + 0.671, + 0.391, + 0.712, + 0.654 + ], + "refine_crop": "crops/detect_refine_pedestrian_backpack.png" + }, + { + "name": "pedestrian_red_jacket", + "present": true, + "bbox": [ + 0.4504, + 0.4033, + 0.474, + 0.5253 + ], + "confidence": 0.98, + "notes": "A person standing, wearing a bright red jacket.", + "coarse_bbox": [ + 0.449, + 0.399, + 0.475, + 0.524 + ], + "refine_crop": "crops/detect_refine_pedestrian_red_jacket.png" + }, + { + "name": "pedestrian_striped_shirt", + "present": true, + "bbox": [ + 0.7269, + 0.3947, + 0.7711, + 0.5853 + ], + "confidence": 100, + "notes": "Person wearing a striped shirt and dark pants", + "coarse_bbox": [ + 0.724, + 0.395, + 0.771, + 0.579 + ], + "refine_crop": "crops/detect_refine_pedestrian_striped_shirt.png" + }, + { + "name": "man_pink_shirt", + "present": true, + "bbox": [ + 0.8332, + 0.3734, + 0.8735, + 0.5918 + ], + "confidence": 0.99, + "notes": "Found the man in a pink shirt and dark shorts.", + "coarse_bbox": [ + 0.833, + 0.375, + 0.873, + 0.589 + ], + "refine_crop": "crops/detect_refine_man_pink_shirt.png" + }, + { + "name": "pedestrian_light_jacket", + "present": true, + "bbox": [ + 0.6065, + 0.3907, + 0.6375, + 0.4907 + ], + "confidence": "high", + "notes": "Pedestrian wearing a light-colored jacket is clearly visible from behind.", + "coarse_bbox": [ + 0.603, + 0.391, + 0.636, + 0.486 + ], + "refine_crop": "crops/detect_refine_pedestrian_light_jacket.png" + }, + { + "name": "pedestrian_light_blue_shirt", + "present": true, + "bbox": [ + 0.9459, + 0.3895, + 0.9964, + 0.6538 + ], + "confidence": 100, + "notes": "A person in a light blue shirt walking away from the camera.", + "coarse_bbox": [ + 0.946, + 0.388, + 0.996, + 0.63 + ], + "refine_crop": "crops/detect_refine_pedestrian_light_blue_shirt.png" + }, + { + "name": "distant_pedestrian", + "present": true, + "bbox": [ + 0.6066, + 0.3904, + 0.6375, + 0.489 + ], + "confidence": 1.0, + "notes": "A person walking in the background.", + "coarse_bbox": [ + 0.603, + 0.391, + 0.636, + 0.486 + ], + "refine_crop": "crops/detect_refine_distant_pedestrian.png" + }, + { + "name": "black_suv", + "present": true, + "bbox": [ + 0.4391, + 0.47, + 0.6899, + 0.8264 + ], + "confidence": 0.99, + "notes": "The large black SUV dominating the center of the crop.", + "coarse_bbox": [ + 0.438, + 0.47, + 0.688, + 0.824 + ], + "refine_crop": "crops/detect_refine_black_suv.png" + } +] diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/main_image.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..e13c0bcedaf3f3a274371065ef05d6cf3d24d195 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e42aa92f26c516a1e78a02f588ec92ce856aba5c7adb1bcd5f194a430cdb20 +size 1642656 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/plan.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..e7f268d95447e7baf21f98e5e6726f0624a1f788 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/plan.json @@ -0,0 +1,221 @@ +{ + "sample_id": "sample_000010", + "target_total": 9, + "target_people": 8, + "target_objects": 1, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "A sunny city street lined with parked vehicles and a busy sidewalk", + "activity": "Pedestrians are walking along the sidewalk and waiting near the curb while traffic moves down the road", + "composition": "Wide-angle street view from the perspective of an approaching vehicle, with a parked black SUV anchoring the right foreground, pedestrians distributed along the right sidewalk, and distant pedestrians providing depth down the receding street", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "pedestrian_black_jacket", + "source_index": 2, + "source_image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", + "source_name": "pedestrian", + "description": "Person walking away, wearing a black jacket and dark pants.", + "role_in_scene": "walking away on the sidewalk to the right" + }, + { + "name": "pedestrian_backpack", + "source_index": 3, + "source_image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", + "source_name": "pedestrian", + "description": "A man wearing a blue t-shirt and a backpack.", + "role_in_scene": "standing near the curb looking toward the road" + }, + { + "name": "pedestrian_red_jacket", + "source_index": 4, + "source_image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", + "source_name": "pedestrian standing", + "description": "A person standing, wearing a bright red jacket.", + "role_in_scene": "waiting at a crosswalk edge" + }, + { + "name": "pedestrian_striped_shirt", + "source_index": 5, + "source_image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", + "source_name": "pedestrian", + "description": "Person wearing a striped shirt and dark pants.", + "role_in_scene": "strolling along the sidewalk" + }, + { + "name": "man_pink_shirt", + "source_index": 9, + "source_image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", + "source_name": "man talking to young man", + "description": "Man wearing a pink shirt and dark shorts.", + "role_in_scene": "conversing near a storefront on the sidewalk" + }, + { + "name": "pedestrian_light_jacket", + "source_index": 11, + "source_image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", + "source_name": "pedestrian", + "description": "Individual wearing a light-colored jacket.", + "role_in_scene": "approaching the street intersection" + }, + { + "name": "pedestrian_light_blue_shirt", + "source_index": 12, + "source_image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", + "source_name": "pedestrian", + "description": "A person in a light blue shirt walking away from the camera.", + "role_in_scene": "walking further down the sidewalk in the mid-ground" + }, + { + "name": "distant_pedestrian", + "source_index": 15, + "source_image_id": "CrowdHuman:data/data_21/283991,2145f000467f0a28.jpg:person:14", + "source_name": "distant pedestrian", + "description": "A person walking in the background.", + "role_in_scene": "walking in the far background down the street" + } + ], + "objects": [ + { + "name": "black_suv", + "source_index": 1, + "source_image_id": "BDD100K:b8fe1054-42625c45:object:1", + "source_name": "black suv", + "description": "A black SUV parked ahead on the right.", + "role_in_scene": "parked at the curb on the right side of the street" + } + ] + }, + "expected_subjects": [ + { + "name": "pedestrian_black_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", + "source_name": "pedestrian", + "source_description": "Person walking away, wearing a black jacket and dark pants. Source dataset: CrowdHuman. Scene context: People are walking on a street in front of a large, ornate cathedral.", + "sub_caption": "pedestrian: Person walking away, wearing a black jacket and dark pants.. Scene role: walking away on the sidewalk to the right", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_backpack", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", + "source_name": "pedestrian", + "source_description": "A man partially visible behind the man in the black polo shirt, wearing a blue t-shirt and a backpack. Source dataset: CrowdHuman. Scene context: A group of people standing on a paved street in front of stone buildings, seemingly listening to a tour guide.", + "sub_caption": "pedestrian: A man wearing a blue t-shirt and a backpack.. Scene role: standing near the curb looking toward the road", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_red_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", + "source_name": "pedestrian standing", + "source_description": "A person standing near a kiosk, wearing a bright red jacket. Source dataset: CrowdHuman. Scene context: A large outdoor public square with a fountain, surrounded by buildings and trees, with many pedestrians walking around and climbing a wide set of marble stairs in the foreground.", + "sub_caption": "pedestrian standing: A person standing, wearing a bright red jacket.. Scene role: waiting at a crosswalk edge", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_striped_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", + "source_name": "pedestrian", + "source_description": "Person in the background wearing a striped shirt and dark pants. Source dataset: CrowdHuman. Scene context: A busy city square with pedestrians, bicycles, and a yellow tram on a sunny day.", + "sub_caption": "pedestrian: Person wearing a striped shirt and dark pants.. Scene role: strolling along the sidewalk", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "man_pink_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", + "source_name": "man talking to young man", + "source_description": "Standing next to the young man with the backpack, wearing a pink shirt and dark shorts. Source dataset: CrowdHuman. Scene context: A bustling city street corner with people walking and standing around, surrounded by buildings with large digital billboards and store signs, with a few vehicles including a prominent red SUV passing through.", + "sub_caption": "man talking to young man: Man wearing a pink shirt and dark shorts.. Scene role: conversing near a storefront on the sidewalk", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_light_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", + "source_name": "pedestrian", + "source_description": "Individual in the crowd near the intersection, wearing a light-colored jacket. Source dataset: CrowdHuman. Scene context: A bustling city street at night, brightly illuminated by numerous large electronic billboards on tall buildings, with heavy vehicle and pedestrian traffic below.", + "sub_caption": "pedestrian: Individual wearing a light-colored jacket.. Scene role: approaching the street intersection", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "pedestrian_light_blue_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", + "source_name": "pedestrian", + "source_description": "A person in a light blue shirt walking away from the camera. Source dataset: CrowdHuman. Scene context: A wide, tree-lined avenue crowded with many pedestrians walking in both directions, adorned with red banners on the trees under a clear blue sky.", + "sub_caption": "pedestrian: A person in a light blue shirt walking away from the camera.. Scene role: walking further down the sidewalk in the mid-ground", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "distant_pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_21/283991,2145f000467f0a28.jpg:person:14", + "source_name": "distant pedestrian", + "source_description": "A person walking in the background. Source dataset: CrowdHuman. Scene context: People walk through an outdoor plaza area with modern architecture, an outdoor seating section with red chairs on the left, and planters with yellow and blue flowers on the right.", + "sub_caption": "distant pedestrian: A person walking in the background.. Scene role: walking in the far background down the street", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "black_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b8fe1054-42625c45:object:1", + "source_name": "black suv", + "source_description": "A black SUV parked ahead of the silver SUV on the right. Source dataset: BDD100K. Scene context: View from a vehicle driving down a cracked city street lined with parked cars, buildings, and trees on a sunny day.", + "sub_caption": "black suv: A black SUV parked ahead on the right.. Scene role: parked at the curb on the right side of the street", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000010/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references.json new file mode 100644 index 0000000000000000000000000000000000000000..b7e6eedc7b831a475ed43fe5c727469f4822446f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references.json @@ -0,0 +1,293 @@ +{ + "references": [ + { + "name": "pedestrian_black_jacket", + "ref_image": "references/ref_pedestrian_black_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_black_jacket_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_black_jacket.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_black_jacket_attempt_01.png", + "output": "references/ref_pedestrian_black_jacket.png", + "mask": "references/sam_mask_pedestrian_black_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 25.0, + 691.0, + 1014.0 + ], + "mask_score": 3.419812, + "mask_area_ratio": 0.160983, + "elapsed_seconds": 8.2172 + }, + "reference_verify": "references/reference_verify_pedestrian_black_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pedestrian_backpack", + "ref_image": "references/ref_pedestrian_backpack.png", + "raw_ref_image": "references/raw_ref_pedestrian_backpack_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_backpack.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_backpack_attempt_01.png", + "output": "references/ref_pedestrian_backpack.png", + "mask": "references/sam_mask_pedestrian_backpack.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 344.0, + 30.0, + 689.0, + 1018.0 + ], + "mask_score": 3.422455, + "mask_area_ratio": 0.157988, + "elapsed_seconds": 8.1451 + }, + "reference_verify": "references/reference_verify_pedestrian_backpack.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pedestrian_red_jacket", + "ref_image": "references/ref_pedestrian_red_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_red_jacket_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_red_jacket.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_red_jacket_attempt_01.png", + "output": "references/ref_pedestrian_red_jacket.png", + "mask": "references/sam_mask_pedestrian_red_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 346.0, + 92.0, + 677.0, + 984.0 + ], + "mask_score": 3.472322, + "mask_area_ratio": 0.129704, + "elapsed_seconds": 9.5973 + }, + "reference_verify": "references/reference_verify_pedestrian_red_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pedestrian_striped_shirt", + "ref_image": "references/ref_pedestrian_striped_shirt.png", + "raw_ref_image": "references/raw_ref_pedestrian_striped_shirt_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_striped_shirt.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_striped_shirt_attempt_01.png", + "output": "references/ref_pedestrian_striped_shirt.png", + "mask": "references/sam_mask_pedestrian_striped_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 338.0, + 11.0, + 687.0, + 1018.0 + ], + "mask_score": 3.206288, + "mask_area_ratio": 0.147885, + "elapsed_seconds": 8.1875 + }, + "reference_verify": "references/reference_verify_pedestrian_striped_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "man_pink_shirt", + "ref_image": "references/ref_man_pink_shirt.png", + "raw_ref_image": "references/raw_ref_man_pink_shirt_attempt_01.png", + "diversify_input": "crops/diversify_input_man_pink_shirt.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_pink_shirt_attempt_01.png", + "output": "references/ref_man_pink_shirt.png", + "mask": "references/sam_mask_man_pink_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 340.0, + 42.0, + 689.0, + 995.0 + ], + "mask_score": 3.442738, + "mask_area_ratio": 0.146916, + "elapsed_seconds": 8.1734 + }, + "reference_verify": "references/reference_verify_man_pink_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pedestrian_light_jacket", + "ref_image": "references/ref_pedestrian_light_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_light_jacket_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_light_jacket.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_light_jacket_attempt_01.png", + "output": "references/ref_pedestrian_light_jacket.png", + "mask": "references/sam_mask_pedestrian_light_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 28.0, + 681.0, + 1013.0 + ], + "mask_score": 3.460161, + "mask_area_ratio": 0.163844, + "elapsed_seconds": 9.6744 + }, + "reference_verify": "references/reference_verify_pedestrian_light_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "pedestrian_light_blue_shirt", + "ref_image": "references/ref_pedestrian_light_blue_shirt.png", + "raw_ref_image": "references/raw_ref_pedestrian_light_blue_shirt_attempt_01.png", + "diversify_input": "crops/diversify_input_pedestrian_light_blue_shirt.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_light_blue_shirt_attempt_01.png", + "output": "references/ref_pedestrian_light_blue_shirt.png", + "mask": "references/sam_mask_pedestrian_light_blue_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 357.0, + 33.0, + 693.0, + 1012.0 + ], + "mask_score": 3.452806, + "mask_area_ratio": 0.153078, + "elapsed_seconds": 9.6513 + }, + "reference_verify": "references/reference_verify_pedestrian_light_blue_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "distant_pedestrian", + "ref_image": "references/ref_distant_pedestrian.png", + "raw_ref_image": "references/raw_ref_distant_pedestrian_attempt_01.png", + "diversify_input": "crops/diversify_input_distant_pedestrian.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_distant_pedestrian_attempt_01.png", + "output": "references/ref_distant_pedestrian.png", + "mask": "references/sam_mask_distant_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 360.0, + 88.0, + 677.0, + 997.0 + ], + "mask_score": 3.489431, + "mask_area_ratio": 0.138401, + "elapsed_seconds": 8.1869 + }, + "reference_verify": "references/reference_verify_distant_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "black_suv", + "ref_image": "references/ref_black_suv.png", + "raw_ref_image": "references/raw_ref_black_suv_attempt_01.png", + "diversify_input": "crops/diversify_input_black_suv.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_black_suv_attempt_01.png", + "output": "references/ref_black_suv.png", + "mask": "references/sam_mask_black_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 190.0, + 1007.0, + 843.0 + ], + "mask_score": 3.120914, + "mask_area_ratio": 0.384048, + "elapsed_seconds": 9.8714 + }, + "reference_verify": "references/reference_verify_black_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_black_suv.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_black_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..8204d1ea0dd5ce4213962eb324d62e9f117d0e09 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_black_suv.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c483a5dd63a97b592ffc4b8f7d681ca9fbe6309f4794b7a09ef9dd09a9aa827 +size 652550 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_distant_pedestrian.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_distant_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..d76a911506e0d8e0722652f662afa1c842c3c955 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_distant_pedestrian.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebd0d8abd9f1d822e5c88662349a96c92c3a7d0d1994e823c24655c52cb96044 +size 244347 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_man_pink_shirt.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_man_pink_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..ac111f39c413e9c497f3cac58c67a347a32b3e9a --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_man_pink_shirt.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:319b0a0a1637a1054d431224b4130ff08fede9d0b476d7c631b86982110daf03 +size 291090 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_backpack.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_backpack.png new file mode 100644 index 0000000000000000000000000000000000000000..f02d125e97d46ea17683c32bee7c81c468e58b2d --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_backpack.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da66e3f0cb4b467886b79393cdec0cf7e334b98682b31d68a26c9cc3e1decd0f +size 317026 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_black_jacket.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_black_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..5521d2383fd927fd4e0992be497172498845abc4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_black_jacket.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03fb703825601acd140b6a74ce90241e3d0a557c7e6e1ab0ebc367aa03abd9da +size 300931 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_light_blue_shirt.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_light_blue_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..84dc17b31cb460f02d0209bcce8955b1633e77d7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_light_blue_shirt.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d471ea167f274809901dd39442e22f97bb22ec0d521a8d9924150bde8feedc9 +size 293215 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_light_jacket.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_light_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..c0627778918e3db8c8295760b11028c4ba062a4e --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_light_jacket.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e38df33d39ec6ecca4496588c83b04506bfdf7a228d12368d7ec34fcf4e9d97 +size 277257 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_red_jacket.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_red_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..1fe2938ebbb4db7f6614c8828f4db79b473cc74c --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_red_jacket.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7cc774b43e03a21705b6bc0b720d7c4f6faaa4f1d6344b551d505d330b7053d +size 263502 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_striped_shirt.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_striped_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..cc5066db36a80cdd855e0b902ce5314e7855db95 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_striped_shirt.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5fa5b40701140f75866cbc07f2b694afaec5fb1512b2d660e8235a892c2d186 +size 335965 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_black_suv.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_black_suv.json new file mode 100644 index 0000000000000000000000000000000000000000..e3161f2928f4ca103a860bc047185d4e45f42768 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_black_suv.json @@ -0,0 +1,46 @@ +{ + "name": "black_suv", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_black_suv_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_black_suv_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_black_suv_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_black_suv_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_black_suv_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_black_suv_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 190.0, + 1007.0, + 843.0 + ], + "mask_score": 3.120914, + "mask_area_ratio": 0.384048, + "elapsed_seconds": 9.8714 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete, isolated black SUV on a white background. All requirements are met." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_distant_pedestrian.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_distant_pedestrian.json new file mode 100644 index 0000000000000000000000000000000000000000..32b17fe40bf77eeb42b27827dddd843e19cd780d --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_distant_pedestrian.json @@ -0,0 +1,46 @@ +{ + "name": "distant_pedestrian", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_distant_pedestrian_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_distant_pedestrian_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_distant_pedestrian_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_distant_pedestrian_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_distant_pedestrian_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_distant_pedestrian_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 360.0, + 88.0, + 677.0, + 997.0 + ], + "mask_score": 3.489431, + "mask_area_ratio": 0.138401, + "elapsed_seconds": 8.1869 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Full body visible, well isolated on white background, no cropping." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_man_pink_shirt.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_man_pink_shirt.json new file mode 100644 index 0000000000000000000000000000000000000000..b44cb18dc380dbf0f725a8bcd3d52d5d140b8719 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_man_pink_shirt.json @@ -0,0 +1,46 @@ +{ + "name": "man_pink_shirt", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_man_pink_shirt_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_man_pink_shirt_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_man_pink_shirt_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_pink_shirt_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_man_pink_shirt_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_man_pink_shirt_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 340.0, + 42.0, + 689.0, + 995.0 + ], + "mask_score": 3.442738, + "mask_area_ratio": 0.146916, + "elapsed_seconds": 8.1734 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The man is fully visible from head to toe with a white background and ample margins. No cropping occurs." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_backpack.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_backpack.json new file mode 100644 index 0000000000000000000000000000000000000000..2ab25f56abb41c351a22ddf1f8e209d2a20fb940 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_backpack.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_backpack", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_backpack_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_backpack_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_backpack_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_backpack_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_pedestrian_backpack_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_pedestrian_backpack_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 344.0, + 30.0, + 689.0, + 1018.0 + ], + "mask_score": 3.422455, + "mask_area_ratio": 0.157988, + "elapsed_seconds": 8.1451 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a man with a backpack on a white background. No cropping or truncation is observed, and there is adequate margin around the subject." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_black_jacket.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_black_jacket.json new file mode 100644 index 0000000000000000000000000000000000000000..8fe05f330291cb9ff2561256f6d1a7c70d4ff172 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_black_jacket.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_black_jacket", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_black_jacket_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_black_jacket_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_black_jacket_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_black_jacket_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_pedestrian_black_jacket_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_pedestrian_black_jacket_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 25.0, + 691.0, + 1014.0 + ], + "mask_score": 3.419812, + "mask_area_ratio": 0.160983, + "elapsed_seconds": 8.2172 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a person on a white background, isolated and fully visible." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_light_blue_shirt.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_light_blue_shirt.json new file mode 100644 index 0000000000000000000000000000000000000000..4c1ab55baed20f81b0595a1aaed0af40cd655bd4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_light_blue_shirt.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_light_blue_shirt", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_light_blue_shirt_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_light_blue_shirt_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_light_blue_shirt_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_light_blue_shirt_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_pedestrian_light_blue_shirt_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_pedestrian_light_blue_shirt_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 357.0, + 33.0, + 693.0, + 1012.0 + ], + "mask_score": 3.452806, + "mask_area_ratio": 0.153078, + "elapsed_seconds": 9.6513 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Image shows a complete full-body view of a person with a white background, satisfying all requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_light_jacket.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_light_jacket.json new file mode 100644 index 0000000000000000000000000000000000000000..009a585e57b431a12cc0a7490e411973f92274b3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_light_jacket.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_light_jacket", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_light_jacket_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_light_jacket_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_light_jacket_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_light_jacket_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_pedestrian_light_jacket_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_pedestrian_light_jacket_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 28.0, + 681.0, + 1013.0 + ], + "mask_score": 3.460161, + "mask_area_ratio": 0.163844, + "elapsed_seconds": 9.6744 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "Full body of a single person wearing a light-colored jacket is clearly visible against a white background with no cropping." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_red_jacket.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_red_jacket.json new file mode 100644 index 0000000000000000000000000000000000000000..8069b72c9e952dbed4b209c1b85b59d1c8038541 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_red_jacket.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_red_jacket", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_red_jacket_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_red_jacket_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_red_jacket_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_red_jacket_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_pedestrian_red_jacket_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_pedestrian_red_jacket_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 346.0, + 92.0, + 677.0, + 984.0 + ], + "mask_score": 3.472322, + "mask_area_ratio": 0.129704, + "elapsed_seconds": 9.5973 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a single person wearing a red jacket, fully visible from head to toe with sufficient white margin around it." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_striped_shirt.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_striped_shirt.json new file mode 100644 index 0000000000000000000000000000000000000000..0f474b337f9c52efb8d96ce89c45ee0a6f1df209 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/reference_verify_pedestrian_striped_shirt.json @@ -0,0 +1,46 @@ +{ + "name": "pedestrian_striped_shirt", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_pedestrian_striped_shirt_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_pedestrian_striped_shirt_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_pedestrian_striped_shirt_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_striped_shirt_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_ref_pedestrian_striped_shirt_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/candidate_sam_mask_pedestrian_striped_shirt_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 338.0, + 11.0, + 687.0, + 1018.0 + ], + "mask_score": 3.206288, + "mask_area_ratio": 0.147885, + "elapsed_seconds": 8.1875 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The full body of the person is visible and not cropped. The subject is isolated on a white background with sufficient margin." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_black_suv.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_black_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..55ab66ea72d3b8ca300e82461638abf07e993629 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_black_suv.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_distant_pedestrian.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_distant_pedestrian.png new file mode 100644 index 0000000000000000000000000000000000000000..e586ecd67af91d956a23dbe682020beea8600099 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_distant_pedestrian.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_man_pink_shirt.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_man_pink_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..a2c44acb912ee5305056c37fa7ee12c8762355d3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_man_pink_shirt.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_backpack.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_backpack.png new file mode 100644 index 0000000000000000000000000000000000000000..21836cf48d619daddf9d0b8eb67bc05046fbeeb7 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_backpack.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_black_jacket.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_black_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..d291ba1a26d3c6b42b6c3a2fdfac696128428c7e Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_black_jacket.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_light_blue_shirt.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_light_blue_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..6d3d577e89bd8a4164a38d596cfdab14879baa3d Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_light_blue_shirt.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_light_jacket.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_light_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..b792cafd313b395bf13053cb498bbb67b6c8178d Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_light_jacket.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_red_jacket.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_red_jacket.png new file mode 100644 index 0000000000000000000000000000000000000000..8039b43e5623ac80b2a3b96cf193af45ec0fa959 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_red_jacket.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_striped_shirt.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_striped_shirt.png new file mode 100644 index 0000000000000000000000000000000000000000..081859d8a481ddd4fac30ffcc2d5c83d64c55d54 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/sam_mask_pedestrian_striped_shirt.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/row.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/row.json new file mode 100644 index 0000000000000000000000000000000000000000..60e2119788d8f051819af9114f044d43aca406e0 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/row.json @@ -0,0 +1,440 @@ +{ + "sample_id": "sample_000010", + "target_total": 9, + "target_people": 8, + "target_objects": 1, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 9, + "n_detected": 9, + "n_subjects": 9, + "subjects": [ + { + "name": "pedestrian_black_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", + "source_name": "pedestrian", + "source_description": "Person walking away, wearing a black jacket and dark pants. Source dataset: CrowdHuman. Scene context: People are walking on a street in front of a large, ornate cathedral.", + "sub_caption": "pedestrian: Person walking away, wearing a black jacket and dark pants.. Scene role: walking away on the sidewalk to the right", + "measured_bbox": [ + 0.8947, + 0.408, + 0.9768, + 0.8 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_black_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_black_jacket_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_black_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_black_jacket_attempt_01.png", + "output": "references/ref_pedestrian_black_jacket.png", + "mask": "references/sam_mask_pedestrian_black_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 332.0, + 25.0, + 691.0, + 1014.0 + ], + "mask_score": 3.419812, + "mask_area_ratio": 0.160983, + "elapsed_seconds": 8.2172 + } + }, + { + "name": "pedestrian_backpack", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", + "source_name": "pedestrian", + "source_description": "A man partially visible behind the man in the black polo shirt, wearing a blue t-shirt and a backpack. Source dataset: CrowdHuman. Scene context: A group of people standing on a paved street in front of stone buildings, seemingly listening to a tour guide.", + "sub_caption": "pedestrian: A man wearing a blue t-shirt and a backpack.. Scene role: standing near the curb looking toward the road", + "measured_bbox": [ + 0.6953, + 0.4394, + 0.7156, + 0.5151 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_backpack.png", + "raw_ref_image": "references/raw_ref_pedestrian_backpack_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_backpack.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_backpack_attempt_01.png", + "output": "references/ref_pedestrian_backpack.png", + "mask": "references/sam_mask_pedestrian_backpack.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 344.0, + 30.0, + 689.0, + 1018.0 + ], + "mask_score": 3.422455, + "mask_area_ratio": 0.157988, + "elapsed_seconds": 8.1451 + } + }, + { + "name": "pedestrian_red_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", + "source_name": "pedestrian standing", + "source_description": "A person standing near a kiosk, wearing a bright red jacket. Source dataset: CrowdHuman. Scene context: A large outdoor public square with a fountain, surrounded by buildings and trees, with many pedestrians walking around and climbing a wide set of marble stairs in the foreground.", + "sub_caption": "pedestrian standing: A person standing, wearing a bright red jacket.. Scene role: waiting at a crosswalk edge", + "measured_bbox": [ + 0.4504, + 0.4033, + 0.474, + 0.5253 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_red_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_red_jacket_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_red_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_red_jacket_attempt_01.png", + "output": "references/ref_pedestrian_red_jacket.png", + "mask": "references/sam_mask_pedestrian_red_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 346.0, + 92.0, + 677.0, + 984.0 + ], + "mask_score": 3.472322, + "mask_area_ratio": 0.129704, + "elapsed_seconds": 9.5973 + } + }, + { + "name": "pedestrian_striped_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", + "source_name": "pedestrian", + "source_description": "Person in the background wearing a striped shirt and dark pants. Source dataset: CrowdHuman. Scene context: A busy city square with pedestrians, bicycles, and a yellow tram on a sunny day.", + "sub_caption": "pedestrian: Person wearing a striped shirt and dark pants.. Scene role: strolling along the sidewalk", + "measured_bbox": [ + 0.7269, + 0.3947, + 0.7711, + 0.5853 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_striped_shirt.png", + "raw_ref_image": "references/raw_ref_pedestrian_striped_shirt_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_striped_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_striped_shirt_attempt_01.png", + "output": "references/ref_pedestrian_striped_shirt.png", + "mask": "references/sam_mask_pedestrian_striped_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 338.0, + 11.0, + 687.0, + 1018.0 + ], + "mask_score": 3.206288, + "mask_area_ratio": 0.147885, + "elapsed_seconds": 8.1875 + } + }, + { + "name": "man_pink_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", + "source_name": "man talking to young man", + "source_description": "Standing next to the young man with the backpack, wearing a pink shirt and dark shorts. Source dataset: CrowdHuman. Scene context: A bustling city street corner with people walking and standing around, surrounded by buildings with large digital billboards and store signs, with a few vehicles including a prominent red SUV passing through.", + "sub_caption": "man talking to young man: Man wearing a pink shirt and dark shorts.. Scene role: conversing near a storefront on the sidewalk", + "measured_bbox": [ + 0.8332, + 0.3734, + 0.8735, + 0.5918 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_man_pink_shirt.png", + "raw_ref_image": "references/raw_ref_man_pink_shirt_attempt_01.png", + "reference_verify": "references/reference_verify_man_pink_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_man_pink_shirt_attempt_01.png", + "output": "references/ref_man_pink_shirt.png", + "mask": "references/sam_mask_man_pink_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 340.0, + 42.0, + 689.0, + 995.0 + ], + "mask_score": 3.442738, + "mask_area_ratio": 0.146916, + "elapsed_seconds": 8.1734 + } + }, + { + "name": "pedestrian_light_jacket", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", + "source_name": "pedestrian", + "source_description": "Individual in the crowd near the intersection, wearing a light-colored jacket. Source dataset: CrowdHuman. Scene context: A bustling city street at night, brightly illuminated by numerous large electronic billboards on tall buildings, with heavy vehicle and pedestrian traffic below.", + "sub_caption": "pedestrian: Individual wearing a light-colored jacket.. Scene role: approaching the street intersection", + "measured_bbox": [ + 0.6065, + 0.3907, + 0.6375, + 0.4907 + ], + "detection_confidence": "high", + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_light_jacket.png", + "raw_ref_image": "references/raw_ref_pedestrian_light_jacket_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_light_jacket.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_light_jacket_attempt_01.png", + "output": "references/ref_pedestrian_light_jacket.png", + "mask": "references/sam_mask_pedestrian_light_jacket.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 28.0, + 681.0, + 1013.0 + ], + "mask_score": 3.460161, + "mask_area_ratio": 0.163844, + "elapsed_seconds": 9.6744 + } + }, + { + "name": "pedestrian_light_blue_shirt", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", + "source_name": "pedestrian", + "source_description": "A person in a light blue shirt walking away from the camera. Source dataset: CrowdHuman. Scene context: A wide, tree-lined avenue crowded with many pedestrians walking in both directions, adorned with red banners on the trees under a clear blue sky.", + "sub_caption": "pedestrian: A person in a light blue shirt walking away from the camera.. Scene role: walking further down the sidewalk in the mid-ground", + "measured_bbox": [ + 0.9459, + 0.3895, + 0.9964, + 0.6538 + ], + "detection_confidence": 100, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_pedestrian_light_blue_shirt.png", + "raw_ref_image": "references/raw_ref_pedestrian_light_blue_shirt_attempt_01.png", + "reference_verify": "references/reference_verify_pedestrian_light_blue_shirt.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_pedestrian_light_blue_shirt_attempt_01.png", + "output": "references/ref_pedestrian_light_blue_shirt.png", + "mask": "references/sam_mask_pedestrian_light_blue_shirt.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 357.0, + 33.0, + 693.0, + 1012.0 + ], + "mask_score": 3.452806, + "mask_area_ratio": 0.153078, + "elapsed_seconds": 9.6513 + } + }, + { + "name": "distant_pedestrian", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_21/283991,2145f000467f0a28.jpg:person:14", + "source_name": "distant pedestrian", + "source_description": "A person walking in the background. Source dataset: CrowdHuman. Scene context: People walk through an outdoor plaza area with modern architecture, an outdoor seating section with red chairs on the left, and planters with yellow and blue flowers on the right.", + "sub_caption": "distant pedestrian: A person walking in the background.. Scene role: walking in the far background down the street", + "measured_bbox": [ + 0.6066, + 0.3904, + 0.6375, + 0.489 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_distant_pedestrian.png", + "raw_ref_image": "references/raw_ref_distant_pedestrian_attempt_01.png", + "reference_verify": "references/reference_verify_distant_pedestrian.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_distant_pedestrian_attempt_01.png", + "output": "references/ref_distant_pedestrian.png", + "mask": "references/sam_mask_distant_pedestrian.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 360.0, + 88.0, + 677.0, + 997.0 + ], + "mask_score": 3.489431, + "mask_area_ratio": 0.138401, + "elapsed_seconds": 8.1869 + } + }, + { + "name": "black_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:b8fe1054-42625c45:object:1", + "source_name": "black suv", + "source_description": "A black SUV parked ahead of the silver SUV on the right. Source dataset: BDD100K. Scene context: View from a vehicle driving down a cracked city street lined with parked cars, buildings, and trees on a sunny day.", + "sub_caption": "black suv: A black SUV parked ahead on the right.. Scene role: parked at the curb on the right side of the street", + "measured_bbox": [ + 0.4391, + 0.47, + 0.6899, + 0.8264 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_black_suv.png", + "raw_ref_image": "references/raw_ref_black_suv_attempt_01.png", + "reference_verify": "references/reference_verify_black_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000010/references/raw_ref_black_suv_attempt_01.png", + "output": "references/ref_black_suv.png", + "mask": "references/sam_mask_black_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 16.0, + 190.0, + 1007.0, + 843.0 + ], + "mask_score": 3.120914, + "mask_area_ratio": 0.384048, + "elapsed_seconds": 9.8714 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/vocab_task.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..f89b1c388218c8a78bc2b85cde8616a2d5018b21 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/vocab_task.json @@ -0,0 +1,140 @@ +{ + "task_id": "sample_000010", + "sample_id": "sample_000010", + "sample_index": 10, + "target_total": 9, + "target_people": 8, + "target_objects": 1, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 73165, + "image_id": "CrowdHuman:data/data_34/273275,e38390009eb9d542.jpg:person:3", + "name": "adult man", + "description": "Standing in the back center, taller than the others. Wearing a dark blue t-shirt with 'Red Sox' in red lettering. Smiling and holding up two fingers on both hands. Source dataset: CrowdHuman. Scene context: A group of young people and an adult posing for a photo outdoors near a pond and greenery." + }, + { + "candidate_index": 1, + "source_offset": 141052, + "image_id": "CrowdHuman:data/data_58/273275,81578000b3bc0044.jpg:person:8", + "name": "athlete in maroon", + "description": "Standing near the middle back, wearing a maroon jersey with white and green accents. Source dataset: CrowdHuman. Scene context: A large group of female athletes in team uniforms poses together for a team photo on a grassy field." + }, + { + "candidate_index": 2, + "source_offset": 171457, + "image_id": "CrowdHuman:data/data_69/273275,3ad7000cc6b4598.jpg:person:7", + "name": "pedestrian", + "description": "Person walking away, wearing a black jacket and dark pants. Source dataset: CrowdHuman. Scene context: People are walking on a street in front of a large, ornate cathedral." + }, + { + "candidate_index": 3, + "source_offset": 193812, + "image_id": "CrowdHuman:data/data_9/283991,1e62f00058996b51.jpg:person:10", + "name": "pedestrian", + "description": "A man partially visible behind the man in the black polo shirt, wearing a blue t-shirt and a backpack. Source dataset: CrowdHuman. Scene context: A group of people standing on a paved street in front of stone buildings, seemingly listening to a tour guide." + }, + { + "candidate_index": 4, + "source_offset": 58750, + "image_id": "CrowdHuman:data/data_29/283991,4d5b0000ceec5a1.jpg:person:5", + "name": "pedestrian standing", + "description": "A person standing near a kiosk, wearing a bright red jacket. Source dataset: CrowdHuman. Scene context: A large outdoor public square with a fountain, surrounded by buildings and trees, with many pedestrians walking around and climbing a wide set of marble stairs in the foreground." + }, + { + "candidate_index": 5, + "source_offset": 23389, + "image_id": "CrowdHuman:data/data_15/273278,141d72000a04cd076.jpg:person:8", + "name": "pedestrian", + "description": "Person in the background wearing a striped shirt and dark pants. Source dataset: CrowdHuman. Scene context: A busy city square with pedestrians, bicycles, and a yellow tram on a sunny day." + }, + { + "candidate_index": 6, + "source_offset": 32556, + "image_id": "CrowdHuman:data/data_2/282555,955000086c7869b.jpg:person:5", + "name": "visitor", + "description": "A smaller figure, likely a child, wearing a red top and light-colored pants or shorts, standing next to a pedestal. Source dataset: CrowdHuman. Scene context: A large indoor sculpture gallery with classical and neoclassical statues displayed on pedestals, surrounded by ornate architecture and visitors walking and admiring the art." + }, + { + "candidate_index": 7, + "source_offset": 136805, + "image_id": "CrowdHuman:data/data_56/273278,d7bf10008b6d941a.jpg:person:14", + "name": "person being held", + "description": "A person whose legs are being held by the woman in the green jacket. They are wearing blue jeans and black and white sneakers. Source dataset: CrowdHuman. Scene context: A large group of people posing for a photo on and around the back of a red trolley bus on a city street." + }, + { + "candidate_index": 8, + "source_offset": 113079, + "image_id": "CrowdHuman:data/data_48/273278,dee9a00056a41b83.jpg:person:9", + "name": "medical professional", + "description": "Standing in the second row, wearing a white lab coat. Source dataset: CrowdHuman. Scene context: A large group of medical professionals is posing for a photograph on the outdoor steps of a brick building." + }, + { + "candidate_index": 9, + "source_offset": 95844, + "image_id": "CrowdHuman:data/data_41/283992,8df800092d3201e.jpg:person:1", + "name": "man talking to young man", + "description": "Standing next to the young man with the backpack, wearing a pink shirt and dark shorts. Source dataset: CrowdHuman. Scene context: A bustling city street corner with people walking and standing around, surrounded by buildings with large digital billboards and store signs, with a few vehicles including a prominent red SUV passing through." + }, + { + "candidate_index": 10, + "source_offset": 191693, + "image_id": "CrowdHuman:data/data_8/284193,476300039ef5826.jpg:person:3", + "name": "man", + "description": "Man wearing a grey sweater. Source dataset: CrowdHuman. Scene context: People are walking through an airport terminal with prominent overhead signage." + }, + { + "candidate_index": 11, + "source_offset": 20832, + "image_id": "CrowdHuman:data/data_16/273278,11c1c7000110dd323.jpg:person:1", + "name": "pedestrian", + "description": "Individual in the crowd near the intersection, wearing a light-colored jacket. Source dataset: CrowdHuman. Scene context: A bustling city street at night, brightly illuminated by numerous large electronic billboards on tall buildings, with heavy vehicle and pedestrian traffic below." + }, + { + "candidate_index": 12, + "source_offset": 88333, + "image_id": "CrowdHuman:data/data_4/273275,1dd0f000c9356aaa.jpg:person:18", + "name": "pedestrian", + "description": "A person in a light blue shirt walking away from the camera. Source dataset: CrowdHuman. Scene context: A wide, tree-lined avenue crowded with many pedestrians walking in both directions, adorned with red banners on the trees under a clear blue sky." + }, + { + "candidate_index": 13, + "source_offset": 78183, + "image_id": "CrowdHuman:data/data_35/273278,10d613000f22b872d.jpg:person:2", + "name": "woman", + "description": "Standing on the left side of the promenade, wearing a sleeveless pink and white patterned dress. Source dataset: CrowdHuman. Scene context: A bustling waterfront promenade with people walking, dining under awnings, and a clock tower in the background on a sunny day." + }, + { + "candidate_index": 14, + "source_offset": 147258, + "image_id": "CrowdHuman:data/data_6/283554,110da0008553b110.jpg:person:1", + "name": "male student in yellow", + "description": "A young man on the left side, wearing a bright yellow t-shirt and black shorts, holding onto a barre and practicing a pose. Source dataset: CrowdHuman. Scene context: A female instructor is leading a group of young men in what appears to be a ballet or dance exercise using barres in a studio." + }, + { + "candidate_index": 15, + "source_offset": 39909, + "image_id": "CrowdHuman:data/data_21/283991,2145f000467f0a28.jpg:person:14", + "name": "distant pedestrian", + "description": "A person walking in the background. Source dataset: CrowdHuman. Scene context: People walk through an outdoor plaza area with modern architecture, an outdoor seating section with red chairs on the left, and planters with yellow and blue flowers on the right." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 156087, + "image_id": "BDD100K:b6b616b3-979e75c3:object:6", + "name": "street sign", + "description": "A rectangular street sign is visible overhead on the right side of the street, partially obscured by the dirty windshield. Source dataset: BDD100K. Scene context: View from inside a vehicle driving on a city street, looking through a dirty or condensation-covered windshield at traffic and buildings ahead." + }, + { + "candidate_index": 1, + "source_offset": 166049, + "image_id": "BDD100K:b8fe1054-42625c45:object:1", + "name": "black suv", + "description": "A black SUV parked ahead of the silver SUV on the right. Source dataset: BDD100K. Scene context: View from a vehicle driving down a cracked city street lined with parked cars, buildings, and trees on a sunny day." + } + ], + "rng_seed": 1782975283, + "created_at": 1782223460.628907 +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/bbox_overlay.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/bbox_overlay.png new file mode 100644 index 0000000000000000000000000000000000000000..bfff386364621f7c8737bd34b5c9990558d97bdb --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/bbox_overlay.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39cfb723eb0fd10ce5b74f68d4231e596ade4b2002700f4c405d5e00dd5802b9 +size 1271345 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/compose_prompt.txt b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/compose_prompt.txt new file mode 100644 index 0000000000000000000000000000000000000000..92f9f742bbfb1a4d99e618e96c83c81817934f69 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/compose_prompt.txt @@ -0,0 +1,119 @@ +Render the following JSON scene specification as a photorealistic 1280x720 image using a true 16:9 canvas. The rendered image must be a coherent driving scenario with an outdoor road, street, highway, intersection, curbside, parking, or traffic environment; do not render a non-driving indoor scene or product-catalog scene. Every listed person and object must appear visibly in the image. Keep normal proportions and the requested aspect ratio. The foreground must contain only subjects explicitly listed in the JSON scene specification. Do not add any unlisted foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects. Background context may include non-localizable scenery only when it does not introduce a distinct foreground subject. No text, no labels, no logos, no watermarks. +JSON scene specification: +{ + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "A wet city street at night viewed from inside a moving car.", + "activity": "Driving down the street at night with an oncoming car passing by, while a woman walks on the sidewalk next to parked cars.", + "composition": "Dashcam perspective with the dashboard and windshield mount in the extreme foreground, framing the street ahead. The wet road reflects overhead streetlights. An oncoming white car is in the left lane, with parked vehicles on the right. The right sidewalk features vintage street lamps, a metal utility structure, and a pedestrian in profile.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "blonde_woman", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_8/273275,44ab30007bea86d2.jpg:person:5", + "source_name": "woman in foreground", + "description": "A woman with blonde hair, seen in profile.", + "role_in_scene": "walking along the right sidewalk under the street lamps" + } + ], + "objects": [ + { + "name": "metal_structure", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_54/273278,11abb000d12e48e5.jpg:object:5", + "source_name": "metal structure", + "description": "A tall, rectangular grey metal box or pillar.", + "role_in_scene": "situated on the edge of the sidewalk as a utility box" + }, + { + "name": "overhead_streetlights", + "source_index": 3, + "source_image_id": "BDD100K:bd1b8b79-829e787f:object:5", + "source_name": "streetlights", + "description": "Tall poles with bright lights illuminating the street from above, casting long reflections on the wet pavement.", + "role_in_scene": "providing overhead illumination for the wet road" + }, + { + "name": "silver_car", + "source_index": 4, + "source_image_id": "BDD100K:bba4ee51-3badc9f8:object:6", + "source_name": "silver car", + "description": "A silver car.", + "role_in_scene": "parked further down the street on the right curbside" + }, + { + "name": "white_suv", + "source_index": 7, + "source_image_id": "BDD100K:c5e32cf6-7d2e04b4:object:0", + "source_name": "white suv", + "description": "A white SUV showing its rear passenger side.", + "role_in_scene": "stopped or parked in the rightmost lane ahead" + }, + { + "name": "background_street_light", + "source_index": 10, + "source_image_id": "CrowdHuman:data/data_15/273278,8d2ae00027075d75.jpg:object:5", + "source_name": "street light", + "description": "Bright street lights.", + "role_in_scene": "illuminating the distant background area of the street" + }, + { + "name": "oncoming_white_car", + "source_index": 11, + "source_image_id": "BDD100K:be3159f3-13250ffe:object:2", + "source_name": "white car", + "description": "A white car driving toward the camera with its headlights visible.", + "role_in_scene": "active oncoming traffic in the left lane" + }, + { + "name": "windshield_mount", + "source_index": 14, + "source_image_id": "BDD100K:c5694077-e345e2a8:object:12", + "source_name": "windshield mount", + "description": "A black mount attached to the inside of the windshield.", + "role_in_scene": "partially obscuring the top view, framing the dashcam perspective" + }, + { + "name": "ego_car_dashboard", + "source_index": 15, + "source_image_id": "BDD100K:c187431f-7b6ad6d6:object:0", + "source_name": "ego car dashboard", + "description": "The dark, lower foreground showing part of the dashboard and hood of the vehicle, with red ambient reflections.", + "role_in_scene": "anchors the bottom of the frame, establishing the driver's perspective" + }, + { + "name": "vintage_street_lamp", + "source_index": 16, + "source_image_id": "CrowdHuman:data/data_29/273275,2032200056dda99e.jpg:object:0", + "source_name": "street lamp", + "description": "An ornate, black, vintage-style street lamp post.", + "role_in_scene": "providing decorative lighting on the right sidewalk next to the pedestrian" + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_background_street_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_background_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..83e932a421810c38ddf2413166a5305fd6cf642f Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_background_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_blonde_woman.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_blonde_woman.png new file mode 100644 index 0000000000000000000000000000000000000000..e7d2b858abbdb1e875a71c82beaf9718c6f7b2a3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_blonde_woman.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_ego_car_dashboard.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_ego_car_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..b63d443a93371c055dfd56554a989d20aaf76f9a --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_ego_car_dashboard.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f4b9ce58d0550eee0ed3f79fdce9da3326580ebc272276e422694b1e423304c +size 325031 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_metal_structure.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_metal_structure.png new file mode 100644 index 0000000000000000000000000000000000000000..4eecff40be071ef90962fda5d2f7750996c2f4b0 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_metal_structure.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_oncoming_white_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_oncoming_white_car.png new file mode 100644 index 0000000000000000000000000000000000000000..a986b4647f6b0bc76641658d51016a483dd9e000 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_oncoming_white_car.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_overhead_streetlights.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_overhead_streetlights.png new file mode 100644 index 0000000000000000000000000000000000000000..0d6843baefba1a7ec6bf8ab7da375560edee8566 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_overhead_streetlights.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_silver_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..8909707bf4d67d25d3cf5c72fc56e495e77795e9 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_silver_car.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_vintage_street_lamp.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_vintage_street_lamp.png new file mode 100644 index 0000000000000000000000000000000000000000..2cea7257415a4e49514fbf821a18fa43ee439502 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_vintage_street_lamp.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_white_suv.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_white_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..ce05bdde5049399db5a994dbb9fd6ffd4409a636 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_white_suv.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_windshield_mount.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_windshield_mount.png new file mode 100644 index 0000000000000000000000000000000000000000..f5de80de460c0f86797a9099ae8f921a46371cf5 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_windshield_mount.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_background_street_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_background_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..02412fc6e5d520831fdd19bdcd21f710bad0ba8a Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_background_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_blonde_woman.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_blonde_woman.png new file mode 100644 index 0000000000000000000000000000000000000000..2c6fb4c93d38727090f97df55f49a13099514c10 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_blonde_woman.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_ego_car_dashboard.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_ego_car_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..4c5ef7cd49fa901cb6d71c36523cf9a59e6301f1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_ego_car_dashboard.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56449011a53b14749b5a04d7f15c843bd8896d7433778d9bd04053228cb3dbb0 +size 401458 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_metal_structure.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_metal_structure.png new file mode 100644 index 0000000000000000000000000000000000000000..e6dbea2059f076a4617e55b7219a9bce4be9b7c7 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_metal_structure.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_oncoming_white_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_oncoming_white_car.png new file mode 100644 index 0000000000000000000000000000000000000000..8950b0df313c09e39bf9909bc680942d83a79017 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_oncoming_white_car.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_overhead_streetlights.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_overhead_streetlights.png new file mode 100644 index 0000000000000000000000000000000000000000..85de4718abc5b0db2a420a6350c7e7ca58872632 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_overhead_streetlights.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_silver_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..e0fb82caef0f95c5dc1e6adc0b058eeb49a186a3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_silver_car.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_vintage_street_lamp.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_vintage_street_lamp.png new file mode 100644 index 0000000000000000000000000000000000000000..df2afa6e6044bcb8eaf7473e333f84003d9f04c9 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_vintage_street_lamp.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_white_suv.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_white_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..0f159c10bc68e869c8c66d10bae5bb93b4f7daff Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_white_suv.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_windshield_mount.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_windshield_mount.png new file mode 100644 index 0000000000000000000000000000000000000000..9c8e3d31537626d7c6242cf1e6992abbd1dddbcf --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_windshield_mount.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f13aed5ec56315735d5211ce5319be8e945a235fef4ff907925db97c73be3d93 +size 109786 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/detections.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/detections.json new file mode 100644 index 0000000000000000000000000000000000000000..7ff1e83704fc04c493268797e487f80eafd60dcc --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/detections.json @@ -0,0 +1,192 @@ +[ + { + "name": "blonde_woman", + "present": true, + "bbox": [ + 0.7873, + 0.3886, + 0.8283, + 0.5843 + ], + "confidence": 0.95, + "notes": "The bounding box tightly encloses the visible woman with blonde hair, from the top of her head to the soles of her shoes.", + "coarse_bbox": [ + 0.786, + 0.391, + 0.826, + 0.57 + ], + "refine_crop": "crops/detect_refine_blonde_woman.png" + }, + { + "name": "metal_structure", + "present": true, + "bbox": [ + 0.8171, + 0.3117, + 0.944, + 0.5699 + ], + "confidence": 0.98, + "notes": "The tall, rectangular grey metal box/structure is prominently visible in the image.", + "coarse_bbox": [ + 0.818, + 0.318, + 0.942, + 0.574 + ], + "refine_crop": "crops/detect_refine_metal_structure.png" + }, + { + "name": "overhead_streetlights", + "present": true, + "bbox": [ + 0.5014, + 0.0563, + 0.5225, + 0.1017 + ], + "confidence": 0.9, + "notes": "Tight bounding box around the visible bright light source, consistent with a streetlight in this crop.", + "coarse_bbox": [ + 0.493, + 0.036, + 0.528, + 0.115 + ], + "refine_crop": "crops/detect_refine_overhead_streetlights.png" + }, + { + "name": "silver_car", + "present": true, + "bbox": [ + 0.5294, + 0.4494, + 0.6488, + 0.6006 + ], + "confidence": 1.0, + "notes": "silver car", + "coarse_bbox": [ + 0.532, + 0.447, + 0.649, + 0.603 + ], + "refine_crop": "crops/detect_refine_silver_car.png" + }, + { + "name": "white_suv", + "present": true, + "bbox": [ + 0.5082, + 0.4246, + 0.5826, + 0.5552 + ], + "confidence": "high", + "notes": "The white SUV is visible in the crop.", + "coarse_bbox": [ + 0.506, + 0.425, + 0.581, + 0.556 + ], + "refine_crop": "crops/detect_refine_white_suv.png" + }, + { + "name": "background_street_light", + "present": true, + "bbox": [ + 0.5004, + 0.0592, + 0.5237, + 0.0998 + ], + "confidence": 0.9, + "notes": "The crop shows a large bright area that appears to be the glare from a bright street light, which matches the description of illuminating the distant background area.", + "coarse_bbox": [ + 0.497, + 0.051, + 0.526, + 0.098 + ], + "refine_crop": "crops/detect_refine_background_street_light.png" + }, + { + "name": "oncoming_white_car", + "present": true, + "bbox": [ + 0.1529, + 0.4523, + 0.2891, + 0.5849 + ], + "confidence": 0.98, + "notes": "Refined the box to tightly enclose the visible parts of the white car driving toward the camera, including the headlights, front grill, body, and wheels.", + "coarse_bbox": [ + 0.148, + 0.447, + 0.293, + 0.591 + ], + "refine_crop": "crops/detect_refine_oncoming_white_car.png" + }, + { + "name": "windshield_mount", + "present": true, + "bbox": [ + 0.0, + 0.0, + 0.4767, + 0.1353 + ], + "confidence": "high", + "notes": "The large dark structure attached to the windshield spanning the top area of the crop, framing the dashcam view, matches the description of the windshield mount/rearview mirror assembly.", + "coarse_bbox": [ + 0.0, + 0.0, + 0.477, + 0.134 + ], + "refine_crop": "crops/detect_refine_windshield_mount.png" + }, + { + "name": "ego_car_dashboard", + "present": true, + "bbox": [ + 0.0, + 0.7306, + 1.0, + 1.0 + ], + "confidence": 1.0, + "notes": "The entire crop is the ego car dashboard.", + "coarse_bbox": [ + 0.0, + 0.754, + 0.991, + 0.991 + ], + "refine_crop": "crops/detect_refine_ego_car_dashboard.png" + }, + { + "name": "vintage_street_lamp", + "present": true, + "bbox": [ + 0.7202, + 0.0, + 0.7633, + 0.6024 + ], + "confidence": 0.99, + "notes": "Tight bounding box around the street lamp from its glowing top to its base on the ground.", + "coarse_bbox": [ + 0.723, + 0.012, + 0.762, + 0.599 + ], + "refine_crop": "crops/detect_refine_vintage_street_lamp.png" + } +] diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/main_image.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/main_image.png new file mode 100644 index 0000000000000000000000000000000000000000..68bc6ba95966cdd782caa4a6d7a54950a05f23a7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/main_image.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f16f92209117d77e9c1fb1fde93ff95ea075db18c6b6d73fbedb899da3afed6 +size 1362453 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/plan.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/plan.json new file mode 100644 index 0000000000000000000000000000000000000000..67a881caf8a09419b58d23d9c75a27e3556b3378 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/plan.json @@ -0,0 +1,240 @@ +{ + "sample_id": "sample_000011", + "target_total": 10, + "target_people": 1, + "target_objects": 9, + "compose_prompt": { + "format": "structured_json_prompt", + "canvas": { + "size": [ + 1280, + 720 + ], + "aspect_ratio": "16:9", + "style": "photorealistic" + }, + "scene": { + "setting": "A wet city street at night viewed from inside a moving car.", + "activity": "Driving down the street at night with an oncoming car passing by, while a woman walks on the sidewalk next to parked cars.", + "composition": "Dashcam perspective with the dashboard and windshield mount in the extreme foreground, framing the street ahead. The wet road reflects overhead streetlights. An oncoming white car is in the left lane, with parked vehicles on the right. The right sidewalk features vintage street lamps, a metal utility structure, and a pedestrian in profile.", + "constraints": [ + "no text", + "no labels", + "no watermarks", + "true 16:9 composition", + "final canvas size 1280x720", + "normal human and object proportions", + "no squeezed perspective", + "no anamorphic stretching", + "every listed person and object must be visibly present", + "the foreground may contain only the listed people and objects", + "no extra foreground people, animals, vehicles, props, tools, products, signs, readable text, logos, or other localizable subjects absent from this JSON", + "the entire image must be a coherent driving scenario", + "the scene must include an outdoor road, street, highway, intersection, curbside, parking, or traffic environment", + "integrate all listed subjects as road users, vehicles, roadside objects, traffic infrastructure, or driving-context objects", + "do not render an office, kitchen, studio, product catalog, indoor room, or unrelated non-driving scene" + ] + }, + "people": [ + { + "name": "blonde_woman", + "source_index": 1, + "source_image_id": "CrowdHuman:data/data_8/273275,44ab30007bea86d2.jpg:person:5", + "source_name": "woman in foreground", + "description": "A woman with blonde hair, seen in profile.", + "role_in_scene": "walking along the right sidewalk under the street lamps" + } + ], + "objects": [ + { + "name": "metal_structure", + "source_index": 0, + "source_image_id": "CrowdHuman:data/data_54/273278,11abb000d12e48e5.jpg:object:5", + "source_name": "metal structure", + "description": "A tall, rectangular grey metal box or pillar.", + "role_in_scene": "situated on the edge of the sidewalk as a utility box" + }, + { + "name": "overhead_streetlights", + "source_index": 3, + "source_image_id": "BDD100K:bd1b8b79-829e787f:object:5", + "source_name": "streetlights", + "description": "Tall poles with bright lights illuminating the street from above, casting long reflections on the wet pavement.", + "role_in_scene": "providing overhead illumination for the wet road" + }, + { + "name": "silver_car", + "source_index": 4, + "source_image_id": "BDD100K:bba4ee51-3badc9f8:object:6", + "source_name": "silver car", + "description": "A silver car.", + "role_in_scene": "parked further down the street on the right curbside" + }, + { + "name": "white_suv", + "source_index": 7, + "source_image_id": "BDD100K:c5e32cf6-7d2e04b4:object:0", + "source_name": "white suv", + "description": "A white SUV showing its rear passenger side.", + "role_in_scene": "stopped or parked in the rightmost lane ahead" + }, + { + "name": "background_street_light", + "source_index": 10, + "source_image_id": "CrowdHuman:data/data_15/273278,8d2ae00027075d75.jpg:object:5", + "source_name": "street light", + "description": "Bright street lights.", + "role_in_scene": "illuminating the distant background area of the street" + }, + { + "name": "oncoming_white_car", + "source_index": 11, + "source_image_id": "BDD100K:be3159f3-13250ffe:object:2", + "source_name": "white car", + "description": "A white car driving toward the camera with its headlights visible.", + "role_in_scene": "active oncoming traffic in the left lane" + }, + { + "name": "windshield_mount", + "source_index": 14, + "source_image_id": "BDD100K:c5694077-e345e2a8:object:12", + "source_name": "windshield mount", + "description": "A black mount attached to the inside of the windshield.", + "role_in_scene": "partially obscuring the top view, framing the dashcam perspective" + }, + { + "name": "ego_car_dashboard", + "source_index": 15, + "source_image_id": "BDD100K:c187431f-7b6ad6d6:object:0", + "source_name": "ego car dashboard", + "description": "The dark, lower foreground showing part of the dashboard and hood of the vehicle, with red ambient reflections.", + "role_in_scene": "anchors the bottom of the frame, establishing the driver's perspective" + }, + { + "name": "vintage_street_lamp", + "source_index": 16, + "source_image_id": "CrowdHuman:data/data_29/273275,2032200056dda99e.jpg:object:0", + "source_name": "street lamp", + "description": "An ornate, black, vintage-style street lamp post.", + "role_in_scene": "providing decorative lighting on the right sidewalk next to the pedestrian" + } + ] + }, + "expected_subjects": [ + { + "name": "blonde_woman", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_8/273275,44ab30007bea86d2.jpg:person:5", + "source_name": "woman in foreground", + "source_description": "A woman with blonde hair, seen in profile in the bottom center of the foreground. Source dataset: CrowdHuman. Scene context: A large crowd of people is gathered on a city street, many standing behind metal barricades, while some are taking photos.", + "sub_caption": "woman in foreground: A woman with blonde hair, seen in profile.. Scene role: walking along the right sidewalk under the street lamps", + "ref_style": "white_bg_full_body_front" + }, + { + "name": "metal_structure", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_54/273278,11abb000d12e48e5.jpg:object:5", + "source_name": "metal structure", + "source_description": "A tall, rectangular grey metal box or pillar on the right side, which one man is holding onto. Source dataset: CrowdHuman. Scene context: A crowded subway station with a metal barrier, where two people are climbing over the barrier while others stand by.", + "sub_caption": "metal structure: A tall, rectangular grey metal box or pillar.. Scene role: situated on the edge of the sidewalk as a utility box", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "overhead_streetlights", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bd1b8b79-829e787f:object:5", + "source_name": "streetlights", + "source_description": "Tall poles with bright lights illuminating the street from above, casting long reflections on the wet pavement. Source dataset: BDD100K. Scene context: Nighttime driving scene on a wet city street with streetlights reflecting on the road.", + "sub_caption": "streetlights: Tall poles with bright lights illuminating the street from above, casting long reflections on the wet pavement.. Scene role: providing overhead illumination for the wet road", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bba4ee51-3badc9f8:object:6", + "source_name": "silver car", + "source_description": "Silver car parked further down the street on the right. Source dataset: BDD100K. Scene context: View from inside a car driving down a residential street lined with parked cars and trees.", + "sub_caption": "silver car: A silver car.. Scene role: parked further down the street on the right curbside", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "white_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c5e32cf6-7d2e04b4:object:0", + "source_name": "white suv", + "source_description": "A white Honda CR-V parked or stopped in the rightmost lane, showing its rear passenger side. Source dataset: BDD100K. Scene context: A view from a car driving down a multi-lane city street with parked cars on the right, oncoming traffic on the left, and a highway overpass in the distance on a sunny day.", + "sub_caption": "white suv: A white SUV showing its rear passenger side.. Scene role: stopped or parked in the rightmost lane ahead", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "background_street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_15/273278,8d2ae00027075d75.jpg:object:5", + "source_name": "street light", + "source_description": "bright street lights illuminating the area in the background Source dataset: CrowdHuman. Scene context: A group of people wearing athletic clothing are posed for a group photo outdoors at night.", + "sub_caption": "street light: Bright street lights.. Scene role: illuminating the distant background area of the street", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "oncoming_white_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be3159f3-13250ffe:object:2", + "source_name": "white car", + "source_description": "A white car driving toward the camera in the oncoming lane, further down the road. Source dataset: BDD100K. Scene context: A daytime street view from a vehicle approaching an intersection with traffic lights and several other cars.", + "sub_caption": "white car: A white car driving toward the camera with its headlights visible.. Scene role: active oncoming traffic in the left lane", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "windshield_mount", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c5694077-e345e2a8:object:12", + "source_name": "windshield mount", + "source_description": "A black mount attached to the inside of the windshield, partially obscuring the view. Source dataset: BDD100K. Scene context: A view from inside a car driving down a busy city street flanked by tall buildings, with various vehicles including cars, a delivery truck, and a cyclist in the foreground.", + "sub_caption": "windshield mount: A black mount attached to the inside of the windshield.. Scene role: partially obscuring the top view, framing the dashcam perspective", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "ego_car_dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c187431f-7b6ad6d6:object:0", + "source_name": "ego car dashboard", + "source_description": "The dark, lower foreground showing part of the dashboard and hood of the vehicle recording the video, with red reflections from taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a multi-lane highway with moderate traffic, showing vehicles ahead and green overhead highway signs.", + "sub_caption": "ego car dashboard: The dark, lower foreground showing part of the dashboard and hood of the vehicle, with red ambient reflections.. Scene role: anchors the bottom of the frame, establishing the driver's perspective", + "ref_style": "white_bg_encyclopedia_photo" + }, + { + "name": "vintage_street_lamp", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_29/273275,2032200056dda99e.jpg:object:0", + "source_name": "street lamp", + "source_description": "Ornate, black, vintage-style street lamp post. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking along a sidewalk lined with shops and tall, leafy trees.", + "sub_caption": "street lamp: An ornate, black, vintage-style street lamp post.. Scene role: providing decorative lighting on the right sidewalk next to the pedestrian", + "ref_style": "white_bg_encyclopedia_photo" + } + ], + "vocab_task_path": "sample_000011/vocab_task.json", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references.json new file mode 100644 index 0000000000000000000000000000000000000000..678a6f044f8e7efad4f19424d0851918780c03e7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references.json @@ -0,0 +1,325 @@ +{ + "references": [ + { + "name": "blonde_woman", + "ref_image": "references/ref_blonde_woman.png", + "raw_ref_image": "references/raw_ref_blonde_woman_attempt_02.png", + "diversify_input": "crops/diversify_input_blonde_woman.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_blonde_woman_attempt_02.png", + "output": "references/ref_blonde_woman.png", + "mask": "references/sam_mask_blonde_woman.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 347.0, + 60.0, + 686.0, + 982.0 + ], + "mask_score": 3.476833, + "mask_area_ratio": 0.13921, + "elapsed_seconds": 10.126 + }, + "reference_verify": "references/reference_verify_blonde_woman.json", + "reference_verify_passed": true, + "reference_attempts": 2 + }, + { + "name": "metal_structure", + "ref_image": "references/ref_metal_structure.png", + "raw_ref_image": "references/raw_ref_metal_structure_attempt_01.png", + "diversify_input": "crops/diversify_input_metal_structure.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_metal_structure_attempt_01.png", + "output": "references/ref_metal_structure.png", + "mask": "references/sam_mask_metal_structure.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 91.0, + 32.0, + 932.0, + 1001.0 + ], + "mask_score": 3.477494, + "mask_area_ratio": 0.534141, + "elapsed_seconds": 8.3774 + }, + "reference_verify": "references/reference_verify_metal_structure.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "overhead_streetlights", + "ref_image": "references/ref_overhead_streetlights.png", + "raw_ref_image": "references/raw_ref_overhead_streetlights_attempt_03.png", + "diversify_input": "crops/diversify_input_overhead_streetlights.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_overhead_streetlights_attempt_03.png", + "output": "references/ref_overhead_streetlights.png", + "mask": "references/sam_mask_overhead_streetlights.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 18.0, + 185.0, + 995.0, + 821.0 + ], + "mask_score": 3.398942, + "mask_area_ratio": 0.279411, + "elapsed_seconds": 8.219 + }, + "reference_verify": "references/reference_verify_overhead_streetlights.json", + "reference_verify_passed": true, + "reference_attempts": 3 + }, + { + "name": "silver_car", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "diversify_input": "crops/diversify_input_silver_car.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 47.0, + 245.0, + 984.0, + 779.0 + ], + "mask_score": 3.398036, + "mask_area_ratio": 0.279834, + "elapsed_seconds": 8.4126 + }, + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "white_suv", + "ref_image": "references/ref_white_suv.png", + "raw_ref_image": "references/raw_ref_white_suv_attempt_01.png", + "diversify_input": "crops/diversify_input_white_suv.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_white_suv_attempt_01.png", + "output": "references/ref_white_suv.png", + "mask": "references/sam_mask_white_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 26.0, + 237.0, + 1002.0, + 800.0 + ], + "mask_score": 3.457781, + "mask_area_ratio": 0.315623, + "elapsed_seconds": 10.4584 + }, + "reference_verify": "references/reference_verify_white_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "background_street_light", + "ref_image": "references/ref_background_street_light.png", + "raw_ref_image": "references/raw_ref_background_street_light_attempt_01.png", + "diversify_input": "crops/diversify_input_background_street_light.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_background_street_light_attempt_01.png", + "output": "references/ref_background_street_light.png", + "mask": "references/sam_mask_background_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 152.0, + 95.0, + 875.0, + 938.0 + ], + "mask_score": 3.437329, + "mask_area_ratio": 0.065212, + "elapsed_seconds": 8.1426 + }, + "reference_verify": "references/reference_verify_background_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "oncoming_white_car", + "ref_image": "references/ref_oncoming_white_car.png", + "raw_ref_image": "references/raw_ref_oncoming_white_car_attempt_01.png", + "diversify_input": "crops/diversify_input_oncoming_white_car.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_oncoming_white_car_attempt_01.png", + "output": "references/ref_oncoming_white_car.png", + "mask": "references/sam_mask_oncoming_white_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 13.0, + 304.0, + 1011.0, + 819.0 + ], + "mask_score": 3.453796, + "mask_area_ratio": 0.299096, + "elapsed_seconds": 8.2507 + }, + "reference_verify": "references/reference_verify_oncoming_white_car.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "windshield_mount", + "ref_image": "references/ref_windshield_mount.png", + "raw_ref_image": "references/raw_ref_windshield_mount_attempt_01.png", + "diversify_input": "crops/diversify_input_windshield_mount.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_windshield_mount_attempt_01.png", + "output": "references/ref_windshield_mount.png", + "mask": "references/sam_mask_windshield_mount.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 88.0, + 131.0, + 967.0, + 895.0 + ], + "mask_score": 3.43205, + "mask_area_ratio": 0.236237, + "elapsed_seconds": 8.17 + }, + "reference_verify": "references/reference_verify_windshield_mount.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "ego_car_dashboard", + "ref_image": "references/ref_ego_car_dashboard.png", + "raw_ref_image": "references/raw_ref_ego_car_dashboard_attempt_01.png", + "diversify_input": "crops/diversify_input_ego_car_dashboard.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_ego_car_dashboard_attempt_01.png", + "output": "references/ref_ego_car_dashboard.png", + "mask": "references/sam_mask_ego_car_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 327.0, + 1023.0, + 788.0 + ], + "mask_score": 3.233951, + "mask_area_ratio": 0.206886, + "elapsed_seconds": 10.2609 + }, + "reference_verify": "references/reference_verify_ego_car_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 1 + }, + { + "name": "vintage_street_lamp", + "ref_image": "references/ref_vintage_street_lamp.png", + "raw_ref_image": "references/raw_ref_vintage_street_lamp_attempt_01.png", + "diversify_input": "crops/diversify_input_vintage_street_lamp.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_vintage_street_lamp_attempt_01.png", + "output": "references/ref_vintage_street_lamp.png", + "mask": "references/sam_mask_vintage_street_lamp.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 343.0, + 0.0, + 599.0, + 1023.0 + ], + "mask_score": 3.457917, + "mask_area_ratio": 0.047438, + "elapsed_seconds": 8.3114 + }, + "reference_verify": "references/reference_verify_vintage_street_lamp.json", + "reference_verify_passed": true, + "reference_attempts": 1 + } + ], + "reference_errors": {} +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_background_street_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_background_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..7c29be4cbb216f8e2a7151751d8b35c4320172d3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_background_street_light.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7785c313902f3e95b08696d3114451877800ac230279666d7ed3b0c5ea819f65 +size 138326 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_blonde_woman.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_blonde_woman.png new file mode 100644 index 0000000000000000000000000000000000000000..4064abd1eff3c1e5189653ed8fddb1587b39742d --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_blonde_woman.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b390f117f14a324e14ceab80818d26b00a49368d38d8527f26886fad7df6126 +size 276502 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_ego_car_dashboard.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_ego_car_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..d12de371123d9b497ad4a98f80451c8a5e1f4cce --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_ego_car_dashboard.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3757b2616843ab28ab38b2a47ef2b908cdb7df35d81d915dce4ecbfbbefc3fe +size 359107 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_metal_structure.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_metal_structure.png new file mode 100644 index 0000000000000000000000000000000000000000..afb521eeeec787e38a6df9f7588303216d4ee9a1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_metal_structure.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47e849ea2e179231910bcc315f8964252ccd4538df222394462bcc8a52e7de6f +size 879543 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_oncoming_white_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_oncoming_white_car.png new file mode 100644 index 0000000000000000000000000000000000000000..a88149bda7584b2a96ef074e9f33a2027fcd56a5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_oncoming_white_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b14dc916a748619c6a6535f978a79efb5c857433e8164a59bacb61915a5814f8 +size 524114 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_overhead_streetlights.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_overhead_streetlights.png new file mode 100644 index 0000000000000000000000000000000000000000..9966605f03cb9cedddf1046b331b8d6ea80a39f9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_overhead_streetlights.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38737077353b24559f1e32aeb416a3152106188781029c60a774918a19785f3c +size 518215 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_silver_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..893b256d5db5d51f2a331b91febb3fd041710643 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_silver_car.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d7255ea96e0c7006eb15b4f45f29da3d9e1cf71bd2d26599929bdcdacbde410 +size 519926 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_vintage_street_lamp.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_vintage_street_lamp.png new file mode 100644 index 0000000000000000000000000000000000000000..b42ef1414c0c1912c2c77b15a8357c5f3bde576c --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_vintage_street_lamp.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9672720b78ccad600c19da30a68945194dced02426615a22829db15d68c71b50 +size 103465 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_white_suv.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_white_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..af6e593a2d398c32eed78c525ded909b94258595 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_white_suv.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2355784f3085a203352f5649fee0d8ae0bd2e1986d440497e03854c3f3364b1 +size 522831 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_windshield_mount.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_windshield_mount.png new file mode 100644 index 0000000000000000000000000000000000000000..e9f2371d77ee8fc88ff781f23f0a09ae6bd19649 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_windshield_mount.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48a825bc49e8f0253649f7e9c3516870ef65c1599da2f518345cae14148af265 +size 457669 diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_background_street_light.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_background_street_light.json new file mode 100644 index 0000000000000000000000000000000000000000..b88c84f9d403ce06b7ffc0dc72f1affbc4594fa7 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_background_street_light.json @@ -0,0 +1,46 @@ +{ + "name": "background_street_light", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_background_street_light_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_background_street_light_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_background_street_light_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_background_street_light_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_ref_background_street_light_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_sam_mask_background_street_light_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 152.0, + 95.0, + 875.0, + 938.0 + ], + "mask_score": 3.437329, + "mask_area_ratio": 0.065212, + "elapsed_seconds": 8.1426 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The subject is a street light, visible, mostly complete (the pole is truncated at the bottom, which is acceptable for large environmental features), isolated on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_blonde_woman.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_blonde_woman.json new file mode 100644 index 0000000000000000000000000000000000000000..3226006207afd5d0962ccdee5e854d0b6e1f9bb9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_blonde_woman.json @@ -0,0 +1,87 @@ +{ + "name": "blonde_woman", + "passed": true, + "accepted_attempt": 2, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_blonde_woman_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_blonde_woman_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_blonde_woman_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_blonde_woman_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_ref_blonde_woman_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_sam_mask_blonde_woman_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 327.0, + 12.0, + 696.0, + 1017.0 + ], + "mask_score": 2.839221, + "mask_area_ratio": 0.137504, + "elapsed_seconds": 8.3884 + }, + "verify": { + "passed": false, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [ + "Severe visual artifacts obscure the face, neck, and upper chest, making the subject incomplete and unrecognizable." + ], + "notes": "Full body is within the frame with good margins, but large pixelated/white artifacts severely corrupt the face and chest area." + } + }, + { + "attempt": 2, + "raw_ref_image": "references/raw_ref_blonde_woman_attempt_02.png", + "candidate_ref_image": "references/candidate_ref_blonde_woman_attempt_02.png", + "candidate_sam_mask": "references/candidate_sam_mask_blonde_woman_attempt_02.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_blonde_woman_attempt_02.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_ref_blonde_woman_attempt_02.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_sam_mask_blonde_woman_attempt_02.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 347.0, + 60.0, + 686.0, + 982.0 + ], + "mask_score": 3.476833, + "mask_area_ratio": 0.13921, + "elapsed_seconds": 10.126 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a full body of a blonde woman on a white background with no cropping." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_ego_car_dashboard.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_ego_car_dashboard.json new file mode 100644 index 0000000000000000000000000000000000000000..9a7d14c11e0df0d89f9c77e377da2ee7cc7afc3e --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_ego_car_dashboard.json @@ -0,0 +1,46 @@ +{ + "name": "ego_car_dashboard", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_ego_car_dashboard_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_ego_car_dashboard_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_ego_car_dashboard_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_ego_car_dashboard_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_ref_ego_car_dashboard_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_sam_mask_ego_car_dashboard_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 327.0, + 1023.0, + 788.0 + ], + "mask_score": 3.233951, + "mask_area_ratio": 0.206886, + "elapsed_seconds": 10.2609 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The ego car dashboard is fully visible, isolated on a white background, and not cropped." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_metal_structure.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_metal_structure.json new file mode 100644 index 0000000000000000000000000000000000000000..62c81e586e7dd1885b7a5f060596493c0adb676e --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_metal_structure.json @@ -0,0 +1,46 @@ +{ + "name": "metal_structure", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_metal_structure_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_metal_structure_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_metal_structure_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_metal_structure_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_ref_metal_structure_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_sam_mask_metal_structure_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 91.0, + 32.0, + 932.0, + 1001.0 + ], + "mask_score": 3.477494, + "mask_area_ratio": 0.534141, + "elapsed_seconds": 8.3774 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete metal structure (utility box) on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_oncoming_white_car.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_oncoming_white_car.json new file mode 100644 index 0000000000000000000000000000000000000000..2357013d619706dff2255061d7f5c8d34b1ee743 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_oncoming_white_car.json @@ -0,0 +1,46 @@ +{ + "name": "oncoming_white_car", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_oncoming_white_car_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_oncoming_white_car_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_oncoming_white_car_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_oncoming_white_car_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_ref_oncoming_white_car_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_sam_mask_oncoming_white_car_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 13.0, + 304.0, + 1011.0, + 819.0 + ], + "mask_score": 3.453796, + "mask_area_ratio": 0.299096, + "elapsed_seconds": 8.2507 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a white car isolated on a white background. It meets all the requirements for a non-person subject reference image." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_overhead_streetlights.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_overhead_streetlights.json new file mode 100644 index 0000000000000000000000000000000000000000..0d88ac9f3e577a9fc40c34d8c01f23892674093f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_overhead_streetlights.json @@ -0,0 +1,129 @@ +{ + "name": "overhead_streetlights", + "passed": true, + "accepted_attempt": 3, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_overhead_streetlights_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_overhead_streetlights_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_overhead_streetlights_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_overhead_streetlights_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_ref_overhead_streetlights_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_sam_mask_overhead_streetlights_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 342.0, + 56.0, + 749.0, + 1023.0 + ], + "mask_score": -0.98249, + "mask_area_ratio": 0.818867, + "elapsed_seconds": 8.3203 + }, + "verify": { + "passed": false, + "subject_visible": false, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": false, + "white_background": true, + "failure_reasons": [ + "intended subject is absent or completely washed out" + ], + "notes": "The image is almost entirely white, with only a very faint outline of what might be a streetlight pole. The subject is not clearly visible or recognizable as an overhead streetlight." + } + }, + { + "attempt": 2, + "raw_ref_image": "references/raw_ref_overhead_streetlights_attempt_02.png", + "candidate_ref_image": "references/candidate_ref_overhead_streetlights_attempt_02.png", + "candidate_sam_mask": "references/candidate_sam_mask_overhead_streetlights_attempt_02.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_overhead_streetlights_attempt_02.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_ref_overhead_streetlights_attempt_02.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_sam_mask_overhead_streetlights_attempt_02.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 298.0, + 61.0, + 767.0, + 1023.0 + ], + "mask_score": 1.401426, + "mask_area_ratio": 0.008588, + "elapsed_seconds": 8.1461 + }, + "verify": { + "passed": false, + "subject_visible": true, + "complete_subject": false, + "cropped_or_truncated": true, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [ + "subject is highly fragmented and unrecognizable as a streetlight", + "missing main recognizable form (the light head)" + ], + "notes": "The image shows a fragmented vertical line that is likely the pole, but the actual light and upper arm are mostly missing or fragmented into meaningless dots, failing the requirement for a rigid discrete object." + } + }, + { + "attempt": 3, + "raw_ref_image": "references/raw_ref_overhead_streetlights_attempt_03.png", + "candidate_ref_image": "references/candidate_ref_overhead_streetlights_attempt_03.png", + "candidate_sam_mask": "references/candidate_sam_mask_overhead_streetlights_attempt_03.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_overhead_streetlights_attempt_03.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_ref_overhead_streetlights_attempt_03.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_sam_mask_overhead_streetlights_attempt_03.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 18.0, + 185.0, + 995.0, + 821.0 + ], + "mask_score": 3.398942, + "mask_area_ratio": 0.279411, + "elapsed_seconds": 8.219 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a single streetlight fixture isolated on a white background. It is complete and serves as a good reference for the object." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_silver_car.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_silver_car.json new file mode 100644 index 0000000000000000000000000000000000000000..55e9f092c7cdc13d3b22afb2b3afde0cc045df50 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_silver_car.json @@ -0,0 +1,46 @@ +{ + "name": "silver_car", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_silver_car_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_silver_car_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_silver_car_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_ref_silver_car_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_sam_mask_silver_car_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 47.0, + 245.0, + 984.0, + 779.0 + ], + "mask_score": 3.398036, + "mask_area_ratio": 0.279834, + "elapsed_seconds": 8.4126 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a silver car against a white background. It is complete, not cropped, and is the single main subject." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_vintage_street_lamp.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_vintage_street_lamp.json new file mode 100644 index 0000000000000000000000000000000000000000..10f094aa1d0be62c1a1fe41d0d1866d42043f665 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_vintage_street_lamp.json @@ -0,0 +1,46 @@ +{ + "name": "vintage_street_lamp", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_vintage_street_lamp_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_vintage_street_lamp_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_vintage_street_lamp_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_vintage_street_lamp_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_ref_vintage_street_lamp_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_sam_mask_vintage_street_lamp_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 343.0, + 0.0, + 599.0, + 1023.0 + ], + "mask_score": 3.457917, + "mask_area_ratio": 0.047438, + "elapsed_seconds": 8.3114 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete, isolated vintage street lamp on a white background, fully satisfying the requirements." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_white_suv.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_white_suv.json new file mode 100644 index 0000000000000000000000000000000000000000..528c2355a0bb5a539c544ae7b244e4de3a705ba5 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_white_suv.json @@ -0,0 +1,46 @@ +{ + "name": "white_suv", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_white_suv_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_white_suv_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_white_suv_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_white_suv_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_ref_white_suv_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_sam_mask_white_suv_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 26.0, + 237.0, + 1002.0, + 800.0 + ], + "mask_score": 3.457781, + "mask_area_ratio": 0.315623, + "elapsed_seconds": 10.4584 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a complete white SUV from the rear passenger side on a white background." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_windshield_mount.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_windshield_mount.json new file mode 100644 index 0000000000000000000000000000000000000000..b17111e9dd7c11c01e3ce56a8ace2358cf475bf1 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/reference_verify_windshield_mount.json @@ -0,0 +1,46 @@ +{ + "name": "windshield_mount", + "passed": true, + "accepted_attempt": 1, + "attempts": [ + { + "attempt": 1, + "raw_ref_image": "references/raw_ref_windshield_mount_attempt_01.png", + "candidate_ref_image": "references/candidate_ref_windshield_mount_attempt_01.png", + "candidate_sam_mask": "references/candidate_sam_mask_windshield_mount_attempt_01.png", + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_windshield_mount_attempt_01.png", + "output": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_ref_windshield_mount_attempt_01.png", + "mask": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/candidate_sam_mask_windshield_mount_attempt_01.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 88.0, + 131.0, + 967.0, + 895.0 + ], + "mask_score": 3.43205, + "mask_area_ratio": 0.236237, + "elapsed_seconds": 8.17 + }, + "verify": { + "passed": true, + "subject_visible": true, + "complete_subject": true, + "cropped_or_truncated": false, + "single_main_subject": true, + "white_background": true, + "failure_reasons": [], + "notes": "The image shows a single black windshield mount against a white background. It is complete, not cropped, and is clearly the main subject." + } + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_background_street_light.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_background_street_light.png new file mode 100644 index 0000000000000000000000000000000000000000..2de49a085d278181809e451cf8262fa48b0e0d87 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_background_street_light.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_blonde_woman.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_blonde_woman.png new file mode 100644 index 0000000000000000000000000000000000000000..2b68a108151726e7bf7edfe9ff856930a67867a6 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_blonde_woman.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_ego_car_dashboard.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_ego_car_dashboard.png new file mode 100644 index 0000000000000000000000000000000000000000..88b77c01355a779a0681e743c07b12a039fa33ff Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_ego_car_dashboard.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_metal_structure.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_metal_structure.png new file mode 100644 index 0000000000000000000000000000000000000000..c27edc000fbce22398fb492168af55789ec375a5 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_metal_structure.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_oncoming_white_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_oncoming_white_car.png new file mode 100644 index 0000000000000000000000000000000000000000..2909e0e158ba2d1b2b0675dc23387deb4155d7f5 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_oncoming_white_car.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_overhead_streetlights.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_overhead_streetlights.png new file mode 100644 index 0000000000000000000000000000000000000000..cdde5a61033f8631f116f26a07405feb17cb0108 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_overhead_streetlights.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_silver_car.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_silver_car.png new file mode 100644 index 0000000000000000000000000000000000000000..2d2842fa5cdbcc13b14389098e49aee0ecd2b3dc Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_silver_car.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_vintage_street_lamp.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_vintage_street_lamp.png new file mode 100644 index 0000000000000000000000000000000000000000..7d93f0caf282eedc9e9aa6e27783932c5d5fe222 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_vintage_street_lamp.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_white_suv.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_white_suv.png new file mode 100644 index 0000000000000000000000000000000000000000..180e5604a58517cdcfd3a48aeb689597896bc4c3 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_white_suv.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_windshield_mount.png b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_windshield_mount.png new file mode 100644 index 0000000000000000000000000000000000000000..0513292d061985415faa9643883393663f012f91 Binary files /dev/null and b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/sam_mask_windshield_mount.png differ diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/row.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/row.json new file mode 100644 index 0000000000000000000000000000000000000000..4006fd2caa2095600937e000701803693f3e6331 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/row.json @@ -0,0 +1,486 @@ +{ + "sample_id": "sample_000011", + "target_total": 10, + "target_people": 1, + "target_objects": 9, + "canvas_size": [ + 1280, + 720 + ], + "canvas_aspect_ratio": "16:9", + "main_image": "main_image.png", + "bbox_overlay": "bbox_overlay.png", + "plan": "plan.json", + "detections": "detections.json", + "vocab_task": "vocab_task.json", + "n_planned": 10, + "n_detected": 10, + "n_subjects": 10, + "subjects": [ + { + "name": "blonde_woman", + "is_person": true, + "subject_type": "person", + "source_set": "people_set", + "source_image_id": "CrowdHuman:data/data_8/273275,44ab30007bea86d2.jpg:person:5", + "source_name": "woman in foreground", + "source_description": "A woman with blonde hair, seen in profile in the bottom center of the foreground. Source dataset: CrowdHuman. Scene context: A large crowd of people is gathered on a city street, many standing behind metal barricades, while some are taking photos.", + "sub_caption": "woman in foreground: A woman with blonde hair, seen in profile.. Scene role: walking along the right sidewalk under the street lamps", + "measured_bbox": [ + 0.7873, + 0.3886, + 0.8283, + 0.5843 + ], + "detection_confidence": 0.95, + "ref_style": "white_bg_full_body_front", + "ref_image": "references/ref_blonde_woman.png", + "raw_ref_image": "references/raw_ref_blonde_woman_attempt_02.png", + "reference_verify": "references/reference_verify_blonde_woman.json", + "reference_verify_passed": true, + "reference_attempts": 2, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_blonde_woman_attempt_02.png", + "output": "references/ref_blonde_woman.png", + "mask": "references/sam_mask_blonde_woman.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 347.0, + 60.0, + 686.0, + 982.0 + ], + "mask_score": 3.476833, + "mask_area_ratio": 0.13921, + "elapsed_seconds": 10.126 + } + }, + { + "name": "metal_structure", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_54/273278,11abb000d12e48e5.jpg:object:5", + "source_name": "metal structure", + "source_description": "A tall, rectangular grey metal box or pillar on the right side, which one man is holding onto. Source dataset: CrowdHuman. Scene context: A crowded subway station with a metal barrier, where two people are climbing over the barrier while others stand by.", + "sub_caption": "metal structure: A tall, rectangular grey metal box or pillar.. Scene role: situated on the edge of the sidewalk as a utility box", + "measured_bbox": [ + 0.8171, + 0.3117, + 0.944, + 0.5699 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_metal_structure.png", + "raw_ref_image": "references/raw_ref_metal_structure_attempt_01.png", + "reference_verify": "references/reference_verify_metal_structure.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_metal_structure_attempt_01.png", + "output": "references/ref_metal_structure.png", + "mask": "references/sam_mask_metal_structure.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 91.0, + 32.0, + 932.0, + 1001.0 + ], + "mask_score": 3.477494, + "mask_area_ratio": 0.534141, + "elapsed_seconds": 8.3774 + } + }, + { + "name": "overhead_streetlights", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bd1b8b79-829e787f:object:5", + "source_name": "streetlights", + "source_description": "Tall poles with bright lights illuminating the street from above, casting long reflections on the wet pavement. Source dataset: BDD100K. Scene context: Nighttime driving scene on a wet city street with streetlights reflecting on the road.", + "sub_caption": "streetlights: Tall poles with bright lights illuminating the street from above, casting long reflections on the wet pavement.. Scene role: providing overhead illumination for the wet road", + "measured_bbox": [ + 0.5014, + 0.0563, + 0.5225, + 0.1017 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_overhead_streetlights.png", + "raw_ref_image": "references/raw_ref_overhead_streetlights_attempt_03.png", + "reference_verify": "references/reference_verify_overhead_streetlights.json", + "reference_verify_passed": true, + "reference_attempts": 3, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_overhead_streetlights_attempt_03.png", + "output": "references/ref_overhead_streetlights.png", + "mask": "references/sam_mask_overhead_streetlights.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 18.0, + 185.0, + 995.0, + 821.0 + ], + "mask_score": 3.398942, + "mask_area_ratio": 0.279411, + "elapsed_seconds": 8.219 + } + }, + { + "name": "silver_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:bba4ee51-3badc9f8:object:6", + "source_name": "silver car", + "source_description": "Silver car parked further down the street on the right. Source dataset: BDD100K. Scene context: View from inside a car driving down a residential street lined with parked cars and trees.", + "sub_caption": "silver car: A silver car.. Scene role: parked further down the street on the right curbside", + "measured_bbox": [ + 0.5294, + 0.4494, + 0.6488, + 0.6006 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_silver_car.png", + "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png", + "reference_verify": "references/reference_verify_silver_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_silver_car_attempt_01.png", + "output": "references/ref_silver_car.png", + "mask": "references/sam_mask_silver_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 47.0, + 245.0, + 984.0, + 779.0 + ], + "mask_score": 3.398036, + "mask_area_ratio": 0.279834, + "elapsed_seconds": 8.4126 + } + }, + { + "name": "white_suv", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c5e32cf6-7d2e04b4:object:0", + "source_name": "white suv", + "source_description": "A white Honda CR-V parked or stopped in the rightmost lane, showing its rear passenger side. Source dataset: BDD100K. Scene context: A view from a car driving down a multi-lane city street with parked cars on the right, oncoming traffic on the left, and a highway overpass in the distance on a sunny day.", + "sub_caption": "white suv: A white SUV showing its rear passenger side.. Scene role: stopped or parked in the rightmost lane ahead", + "measured_bbox": [ + 0.5082, + 0.4246, + 0.5826, + 0.5552 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_white_suv.png", + "raw_ref_image": "references/raw_ref_white_suv_attempt_01.png", + "reference_verify": "references/reference_verify_white_suv.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_white_suv_attempt_01.png", + "output": "references/ref_white_suv.png", + "mask": "references/sam_mask_white_suv.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 26.0, + 237.0, + 1002.0, + 800.0 + ], + "mask_score": 3.457781, + "mask_area_ratio": 0.315623, + "elapsed_seconds": 10.4584 + } + }, + { + "name": "background_street_light", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_15/273278,8d2ae00027075d75.jpg:object:5", + "source_name": "street light", + "source_description": "bright street lights illuminating the area in the background Source dataset: CrowdHuman. Scene context: A group of people wearing athletic clothing are posed for a group photo outdoors at night.", + "sub_caption": "street light: Bright street lights.. Scene role: illuminating the distant background area of the street", + "measured_bbox": [ + 0.5004, + 0.0592, + 0.5237, + 0.0998 + ], + "detection_confidence": 0.9, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_background_street_light.png", + "raw_ref_image": "references/raw_ref_background_street_light_attempt_01.png", + "reference_verify": "references/reference_verify_background_street_light.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_background_street_light_attempt_01.png", + "output": "references/ref_background_street_light.png", + "mask": "references/sam_mask_background_street_light.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 152.0, + 95.0, + 875.0, + 938.0 + ], + "mask_score": 3.437329, + "mask_area_ratio": 0.065212, + "elapsed_seconds": 8.1426 + } + }, + { + "name": "oncoming_white_car", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:be3159f3-13250ffe:object:2", + "source_name": "white car", + "source_description": "A white car driving toward the camera in the oncoming lane, further down the road. Source dataset: BDD100K. Scene context: A daytime street view from a vehicle approaching an intersection with traffic lights and several other cars.", + "sub_caption": "white car: A white car driving toward the camera with its headlights visible.. Scene role: active oncoming traffic in the left lane", + "measured_bbox": [ + 0.1529, + 0.4523, + 0.2891, + 0.5849 + ], + "detection_confidence": 0.98, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_oncoming_white_car.png", + "raw_ref_image": "references/raw_ref_oncoming_white_car_attempt_01.png", + "reference_verify": "references/reference_verify_oncoming_white_car.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_oncoming_white_car_attempt_01.png", + "output": "references/ref_oncoming_white_car.png", + "mask": "references/sam_mask_oncoming_white_car.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 13.0, + 304.0, + 1011.0, + 819.0 + ], + "mask_score": 3.453796, + "mask_area_ratio": 0.299096, + "elapsed_seconds": 8.2507 + } + }, + { + "name": "windshield_mount", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c5694077-e345e2a8:object:12", + "source_name": "windshield mount", + "source_description": "A black mount attached to the inside of the windshield, partially obscuring the view. Source dataset: BDD100K. Scene context: A view from inside a car driving down a busy city street flanked by tall buildings, with various vehicles including cars, a delivery truck, and a cyclist in the foreground.", + "sub_caption": "windshield mount: A black mount attached to the inside of the windshield.. Scene role: partially obscuring the top view, framing the dashcam perspective", + "measured_bbox": [ + 0.0, + 0.0, + 0.4767, + 0.1353 + ], + "detection_confidence": "high", + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_windshield_mount.png", + "raw_ref_image": "references/raw_ref_windshield_mount_attempt_01.png", + "reference_verify": "references/reference_verify_windshield_mount.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_windshield_mount_attempt_01.png", + "output": "references/ref_windshield_mount.png", + "mask": "references/sam_mask_windshield_mount.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 88.0, + 131.0, + 967.0, + 895.0 + ], + "mask_score": 3.43205, + "mask_area_ratio": 0.236237, + "elapsed_seconds": 8.17 + } + }, + { + "name": "ego_car_dashboard", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "BDD100K:c187431f-7b6ad6d6:object:0", + "source_name": "ego car dashboard", + "source_description": "The dark, lower foreground showing part of the dashboard and hood of the vehicle recording the video, with red reflections from taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a multi-lane highway with moderate traffic, showing vehicles ahead and green overhead highway signs.", + "sub_caption": "ego car dashboard: The dark, lower foreground showing part of the dashboard and hood of the vehicle, with red ambient reflections.. Scene role: anchors the bottom of the frame, establishing the driver's perspective", + "measured_bbox": [ + 0.0, + 0.7306, + 1.0, + 1.0 + ], + "detection_confidence": 1.0, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_ego_car_dashboard.png", + "raw_ref_image": "references/raw_ref_ego_car_dashboard_attempt_01.png", + "reference_verify": "references/reference_verify_ego_car_dashboard.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_ego_car_dashboard_attempt_01.png", + "output": "references/ref_ego_car_dashboard.png", + "mask": "references/sam_mask_ego_car_dashboard.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 0.0, + 327.0, + 1023.0, + 788.0 + ], + "mask_score": 3.233951, + "mask_area_ratio": 0.206886, + "elapsed_seconds": 10.2609 + } + }, + { + "name": "vintage_street_lamp", + "is_person": false, + "subject_type": "object", + "source_set": "obj_set", + "source_image_id": "CrowdHuman:data/data_29/273275,2032200056dda99e.jpg:object:0", + "source_name": "street lamp", + "source_description": "Ornate, black, vintage-style street lamp post. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking along a sidewalk lined with shops and tall, leafy trees.", + "sub_caption": "street lamp: An ornate, black, vintage-style street lamp post.. Scene role: providing decorative lighting on the right sidewalk next to the pedestrian", + "measured_bbox": [ + 0.7202, + 0.0, + 0.7633, + 0.6024 + ], + "detection_confidence": 0.99, + "ref_style": "white_bg_encyclopedia_photo", + "ref_image": "references/ref_vintage_street_lamp.png", + "raw_ref_image": "references/raw_ref_vintage_street_lamp_attempt_01.png", + "reference_verify": "references/reference_verify_vintage_street_lamp.json", + "reference_verify_passed": true, + "reference_attempts": 1, + "sam_white_bg": { + "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000011/references/raw_ref_vintage_street_lamp_attempt_01.png", + "output": "references/ref_vintage_street_lamp.png", + "mask": "references/sam_mask_vintage_street_lamp.png", + "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth", + "sam_model_type": "vit_b", + "sam_device": "auto", + "sam_working_size": [ + 640, + 640 + ], + "sam_max_side": 640, + "sam_downscale": 0.625, + "prompt_box_xyxy": [ + 343.0, + 0.0, + 599.0, + 1023.0 + ], + "mask_score": 3.457917, + "mask_area_ratio": 0.047438, + "elapsed_seconds": 8.3114 + } + } + ], + "not_emitted": [], + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/vocab_task.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/vocab_task.json new file mode 100644 index 0000000000000000000000000000000000000000..e625ba3c80f2af67e192b1c90561d2abee86751a --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/vocab_task.json @@ -0,0 +1,154 @@ +{ + "task_id": "sample_000011", + "sample_id": "sample_000011", + "sample_index": 11, + "target_total": 10, + "target_people": 1, + "target_objects": 9, + "people_candidates": [ + { + "candidate_index": 0, + "source_offset": 143457, + "image_id": "CrowdHuman:data/data_59/273275,b3b26000fb07c352.jpg:person:5", + "name": "young adult in dark gray top", + "description": "A young adult female wearing a dark gray long-sleeved top and a light-colored hat, standing in the middle section of the group. Source dataset: CrowdHuman. Scene context: A large group of young adults is gathered in a park with a city skyline and river in the background, enjoying a picnic with various food items and bags scattered on the grass." + }, + { + "candidate_index": 1, + "source_offset": 189686, + "image_id": "CrowdHuman:data/data_8/273275,44ab30007bea86d2.jpg:person:5", + "name": "woman in foreground", + "description": "A woman with blonde hair, seen in profile in the bottom center of the foreground. Source dataset: CrowdHuman. Scene context: A large crowd of people is gathered on a city street, many standing behind metal barricades, while some are taking photos." + } + ], + "object_candidates": [ + { + "candidate_index": 0, + "source_offset": 90197, + "image_id": "CrowdHuman:data/data_54/273278,11abb000d12e48e5.jpg:object:5", + "name": "metal structure", + "description": "A tall, rectangular grey metal box or pillar on the right side, which one man is holding onto. Source dataset: CrowdHuman. Scene context: A crowded subway station with a metal barrier, where two people are climbing over the barrier while others stand by." + }, + { + "candidate_index": 1, + "source_offset": 130593, + "image_id": "CrowdHuman:data/data_8/283081,7e2200012d8a842.jpg:object:8", + "name": "grey bundled item", + "description": "A grey, textured piece of clothing or fabric bundled up and held by a person's hands in the foreground. Source dataset: CrowdHuman. Scene context: A woman in a white dress and black jacket is joyfully dancing in the middle of a street, surrounded by a crowd of onlookers watching and cheering." + }, + { + "candidate_index": 2, + "source_offset": 124140, + "image_id": "CrowdHuman:data/data_72/273275,4fe42000afaa1bc1.jpg:object:2", + "name": "fishing rod", + "description": "A long, thin fishing rod held by the person seated on the bench near the water. Source dataset: CrowdHuman. Scene context: People are relaxing, fishing, and walking near a wooden bridge and palm trees in a coastal park setting on a sunny day." + }, + { + "candidate_index": 3, + "source_offset": 184005, + "image_id": "BDD100K:bd1b8b79-829e787f:object:5", + "name": "streetlights", + "description": "Tall poles with bright lights illuminating the street from above, casting long reflections on the wet pavement. Source dataset: BDD100K. Scene context: Nighttime driving scene on a wet city street with streetlights reflecting on the road." + }, + { + "candidate_index": 4, + "source_offset": 177848, + "image_id": "BDD100K:bba4ee51-3badc9f8:object:6", + "name": "silver car", + "description": "Silver car parked further down the street on the right. Source dataset: BDD100K. Scene context: View from inside a car driving down a residential street lined with parked cars and trees." + }, + { + "candidate_index": 5, + "source_offset": 108392, + "image_id": "CrowdHuman:data/data_64/273275,292c9000f4c9c911.jpg:object:6", + "name": "banner", + "description": "A third large, vertical blue banner hanging between the columns. Source dataset: CrowdHuman. Scene context: A grand, classical building with numerous columns and statues, with several people walking in front of it." + }, + { + "candidate_index": 6, + "source_offset": 126971, + "image_id": "CrowdHuman:data/data_73/282555,4244f0002ad00dd5.jpg:object:2", + "name": "garden", + "description": "Area in the background with green trees, manicured hedges, and red flowers. Source dataset: CrowdHuman. Scene context: A large plaza with people strolling, surrounded by grand, classical architecture and manicured gardens." + }, + { + "candidate_index": 7, + "source_offset": 226308, + "image_id": "BDD100K:c5e32cf6-7d2e04b4:object:0", + "name": "white suv", + "description": "A white Honda CR-V parked or stopped in the rightmost lane, showing its rear passenger side. Source dataset: BDD100K. Scene context: A view from a car driving down a multi-lane city street with parked cars on the right, oncoming traffic on the left, and a highway overpass in the distance on a sunny day." + }, + { + "candidate_index": 8, + "source_offset": 117902, + "image_id": "CrowdHuman:data/data_69/282555,115c980004139b5c0.jpg:object:2", + "name": "cafe table", + "description": "A small round table placed outdoors for seating. Source dataset: CrowdHuman. Scene context: An aerial view of a bustling city square featuring a prominent, historic brick building, numerous people walking and gathering around tables with umbrellas, and adjacent modern multi-story buildings." + }, + { + "candidate_index": 9, + "source_offset": 167303, + "image_id": "BDD100K:b92caf7e-4fc447e8:object:8", + "name": "grassy hill", + "description": "A sloping area of green grass and trees on the right side behind the fence. Source dataset: BDD100K. Scene context: A view from a car driving on a multi-lane city road with a green bus and other cars ahead, next to a fenced grassy hill on the right and tall buildings on the left." + }, + { + "candidate_index": 10, + "source_offset": 12923, + "image_id": "CrowdHuman:data/data_15/273278,8d2ae00027075d75.jpg:object:5", + "name": "street light", + "description": "bright street lights illuminating the area in the background Source dataset: CrowdHuman. Scene context: A group of people wearing athletic clothing are posed for a group photo outdoors at night." + }, + { + "candidate_index": 11, + "source_offset": 189240, + "image_id": "BDD100K:be3159f3-13250ffe:object:2", + "name": "white car", + "description": "A white car driving toward the camera in the oncoming lane, further down the road. Source dataset: BDD100K. Scene context: A daytime street view from a vehicle approaching an intersection with traffic lights and several other cars." + }, + { + "candidate_index": 12, + "source_offset": 53652, + "image_id": "CrowdHuman:data/data_36/273278,b652b000a3bc68e1.jpg:object:4", + "name": "lamp post", + "description": "A black vintage-style lamp post with a white globe standing on the edge of the grass. Source dataset: CrowdHuman. Scene context: People walk along paved paths on a grassy campus in front of stone collegiate buildings on a partly cloudy day." + }, + { + "candidate_index": 13, + "source_offset": 123679, + "image_id": "CrowdHuman:data/data_71/283991,1733d000e5b44285.jpg:object:13", + "name": "awning", + "description": "Red awning above a storefront in the background. Source dataset: CrowdHuman. Scene context: A group of people standing on a city sidewalk on a sunny day." + }, + { + "candidate_index": 14, + "source_offset": 223578, + "image_id": "BDD100K:c5694077-e345e2a8:object:12", + "name": "windshield mount", + "description": "A black mount attached to the inside of the windshield, partially obscuring the view. Source dataset: BDD100K. Scene context: A view from inside a car driving down a busy city street flanked by tall buildings, with various vehicles including cars, a delivery truck, and a cyclist in the foreground." + }, + { + "candidate_index": 15, + "source_offset": 205080, + "image_id": "BDD100K:c187431f-7b6ad6d6:object:0", + "name": "ego car dashboard", + "description": "The dark, lower foreground showing part of the dashboard and hood of the vehicle recording the video, with red reflections from taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a multi-lane highway with moderate traffic, showing vehicles ahead and green overhead highway signs." + }, + { + "candidate_index": 16, + "source_offset": 38874, + "image_id": "CrowdHuman:data/data_29/273275,2032200056dda99e.jpg:object:0", + "name": "street lamp", + "description": "Ornate, black, vintage-style street lamp post. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking along a sidewalk lined with shops and tall, leafy trees." + }, + { + "candidate_index": 17, + "source_offset": 70036, + "image_id": "CrowdHuman:data/data_44/282555,6e8a00043c33d5c.jpg:object:4", + "name": "statue", + "description": "A light-colored, partial torso statue displayed on a pedestal. Source dataset: CrowdHuman. Scene context: People explore a spacious, well-lit museum gallery showcasing ancient sculptures and artifacts displayed on pedestals and within glass cases." + } + ], + "rng_seed": 1783080012, + "created_at": 1782223839.6232827 +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000001.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000001.json new file mode 100644 index 0000000000000000000000000000000000000000..d1fee1e6196d64692f35d48f03180662e7984032 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000001.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000001", + "plan_path": "sample_000001/plan.json", + "task_path": "sample_000001/vocab_task.json", + "main_image": "sample_000001/main_image.png", + "compose_prompt": "sample_000001/compose_prompt.txt", + "prompt_hash": "789c8d85c6e57e6254e3572cc7e92520ceb18df60af32a05d849eeb5ec948b4c", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000001", + "pool": "scene_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000002.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000002.json new file mode 100644 index 0000000000000000000000000000000000000000..30fd5158e2bab1da9948a73be44016561b230e30 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000002.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000002", + "plan_path": "sample_000002/plan.json", + "task_path": "sample_000002/vocab_task.json", + "main_image": "sample_000002/main_image.png", + "compose_prompt": "sample_000002/compose_prompt.txt", + "prompt_hash": "0db2e74733705e9df14d316803fb0b3543017ab6d3a83fabbf7bdc3dc3192241", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000002", + "pool": "scene_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000003.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000003.json new file mode 100644 index 0000000000000000000000000000000000000000..f2d471905e7d49415b3d72a95cbd8da76f06b5f4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000003.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000003", + "plan_path": "sample_000003/plan.json", + "task_path": "sample_000003/vocab_task.json", + "main_image": "sample_000003/main_image.png", + "compose_prompt": "sample_000003/compose_prompt.txt", + "prompt_hash": "5df34bfe8b8cd7ca463459fbf944508ed0a9f19aa858a90af2e5cd9ec7cb652e", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000003", + "pool": "scene_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000004.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000004.json new file mode 100644 index 0000000000000000000000000000000000000000..d884a80359e6f53a51d1df563b1e27f3165c875b --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000004.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000004", + "plan_path": "sample_000004/plan.json", + "task_path": "sample_000004/vocab_task.json", + "main_image": "sample_000004/main_image.png", + "compose_prompt": "sample_000004/compose_prompt.txt", + "prompt_hash": "406b1886d9afac1d49d984bbc0bee062d4a8dcfd6f50762f59a16891be555505", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000004", + "pool": "scene_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000005.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000005.json new file mode 100644 index 0000000000000000000000000000000000000000..ca650f42b1defe43b1fa307f159b26c012b31ae6 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000005.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000005", + "plan_path": "sample_000005/plan.json", + "task_path": "sample_000005/vocab_task.json", + "main_image": "sample_000005/main_image.png", + "compose_prompt": "sample_000005/compose_prompt.txt", + "prompt_hash": "f2cfc0f077d29e069efc31501aca5fd9005be9a292ffa76842fa2ae0e4908c9a", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000005", + "pool": "scene_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000006.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000006.json new file mode 100644 index 0000000000000000000000000000000000000000..45c047db5114c468cb253a5b7843c49c39940354 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000006.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000006", + "plan_path": "sample_000006/plan.json", + "task_path": "sample_000006/vocab_task.json", + "main_image": "sample_000006/main_image.png", + "compose_prompt": "sample_000006/compose_prompt.txt", + "prompt_hash": "90b62b1c3b3296af079950b152ba970de5ac4d5dd30c954f1773ab503dc608fb", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000006", + "pool": "scene_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000008.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000008.json new file mode 100644 index 0000000000000000000000000000000000000000..06d9ef51dc5d5b4fb9d9ad91685bf3c5078fc09b --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000008.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000008", + "plan_path": "sample_000008/plan.json", + "task_path": "sample_000008/vocab_task.json", + "main_image": "sample_000008/main_image.png", + "compose_prompt": "sample_000008/compose_prompt.txt", + "prompt_hash": "67a09c741f5c454018008551387ce1a49f257b9e5cc0745d89d615c3011b1214", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000008", + "pool": "scene_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000009.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000009.json new file mode 100644 index 0000000000000000000000000000000000000000..8fc861bca532081156c4d6d12d39befa882af60c --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000009.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000009", + "plan_path": "sample_000009/plan.json", + "task_path": "sample_000009/vocab_task.json", + "main_image": "sample_000009/main_image.png", + "compose_prompt": "sample_000009/compose_prompt.txt", + "prompt_hash": "bb5f95c2621af98b16411bccdb53e88f3a4f9d644071b10a72620cff1559f4db", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000009", + "pool": "scene_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000010.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000010.json new file mode 100644 index 0000000000000000000000000000000000000000..1c78a2c0961d18646d5caf350e1b76adac07cb53 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000010.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000010", + "plan_path": "sample_000010/plan.json", + "task_path": "sample_000010/vocab_task.json", + "main_image": "sample_000010/main_image.png", + "compose_prompt": "sample_000010/compose_prompt.txt", + "prompt_hash": "5baacce70772af790e4c569bdc802d39b2a5feff87dab5e3fa976e66efcc6bf2", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000010", + "pool": "scene_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000011.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000011.json new file mode 100644 index 0000000000000000000000000000000000000000..511dad29ea4ee284ca3d6f067ccdc966b84c856f --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/done/sample_000011.json @@ -0,0 +1,16 @@ +{ + "sample_id": "sample_000011", + "plan_path": "sample_000011/plan.json", + "task_path": "sample_000011/vocab_task.json", + "main_image": "sample_000011/main_image.png", + "compose_prompt": "sample_000011/compose_prompt.txt", + "prompt_hash": "0d5c4c7c5c5225fd2623ef9d6052d7a01ee80174d4f1c795b5222097791d71f5", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000011", + "pool": "scene_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/failed/sample_000007.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/failed/sample_000007.json new file mode 100644 index 0000000000000000000000000000000000000000..05afafbf774cd7992cec596824389e3ceace6cd4 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/scene_pool/failed/sample_000007.json @@ -0,0 +1,37 @@ +{ + "sample_id": "sample_000007", + "plan_path": "sample_000007/plan.json", + "task_path": "sample_000007/vocab_task.json", + "main_image": "sample_000007/main_image.png", + "compose_prompt": "sample_000007/compose_prompt.txt", + "prompt_hash": "5b43196e777d433f2cef1ad23d9e0918f8b4cde55a7bcee55d2103677d164367", + "model_ids": { + "chat_model": "gcp/google/gemini-3.1-pro-preview", + "image_model": "gcp/google/gemini-3-pro-image-preview" + }, + "item_id": "sample_000007", + "pool": "scene_pool", + "retry_count": 4, + "errors": [ + { + "time": 1782223644.792998, + "error": "RuntimeError: detection incomplete after 3 VLM attempts: missing 1/14 planned subjects: illuminated_store_sign", + "traceback": "Traceback (most recent call last):\n File \"/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/samples_v8/generate_samples_v8.py\", line 1020, in worker_loop\n handler(manifest)\n File \"/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/samples_v8/generate_samples_v8.py\", line 1084, in handler\n detections = detect_all_subjects(sample_id, plan, main_image, subject_workers)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/samples_v8/generate_samples_v8.py\", line 839, in detect_all_subjects\n raise RuntimeError(\nRuntimeError: detection incomplete after 3 VLM attempts: missing 1/14 planned subjects: illuminated_store_sign\n" + }, + { + "time": 1782223712.302278, + "error": "RuntimeError: detection incomplete after 3 VLM attempts: missing 1/14 planned subjects: illuminated_store_sign", + "traceback": "Traceback (most recent call last):\n File \"/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/samples_v8/generate_samples_v8.py\", line 1020, in worker_loop\n handler(manifest)\n File \"/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/samples_v8/generate_samples_v8.py\", line 1084, in handler\n detections = detect_all_subjects(sample_id, plan, main_image, subject_workers)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/samples_v8/generate_samples_v8.py\", line 839, in detect_all_subjects\n raise RuntimeError(\nRuntimeError: detection incomplete after 3 VLM attempts: missing 1/14 planned subjects: illuminated_store_sign\n" + }, + { + "time": 1782223796.6851494, + "error": "RuntimeError: detection incomplete after 3 VLM attempts: missing 1/14 planned subjects: illuminated_store_sign", + "traceback": "Traceback (most recent call last):\n File \"/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/samples_v8/generate_samples_v8.py\", line 1020, in worker_loop\n handler(manifest)\n File \"/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/samples_v8/generate_samples_v8.py\", line 1084, in handler\n detections = detect_all_subjects(sample_id, plan, main_image, subject_workers)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/samples_v8/generate_samples_v8.py\", line 839, in detect_all_subjects\n raise RuntimeError(\nRuntimeError: detection incomplete after 3 VLM attempts: missing 1/14 planned subjects: illuminated_store_sign\n" + }, + { + "time": 1782223834.0098677, + "error": "RuntimeError: detection incomplete after 3 VLM attempts: missing 1/14 planned subjects: illuminated_store_sign", + "traceback": "Traceback (most recent call last):\n File \"/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/samples_v8/generate_samples_v8.py\", line 1020, in worker_loop\n handler(manifest)\n File \"/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/samples_v8/generate_samples_v8.py\", line 1084, in handler\n detections = detect_all_subjects(sample_id, plan, main_image, subject_workers)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/samples_v8/generate_samples_v8.py\", line 839, in detect_all_subjects\n raise RuntimeError(\nRuntimeError: detection incomplete after 3 VLM attempts: missing 1/14 planned subjects: illuminated_store_sign\n" + } + ] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/stats.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/stats.json new file mode 100644 index 0000000000000000000000000000000000000000..1783a82990e5930776be24f79e7c534821a37742 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/stats.json @@ -0,0 +1,134 @@ +{ + "started_at": 1782223458.2299504, + "elapsed_seconds": 1118.4, + "stage": { + "plan": { + "attempts": 11, + "successes": 11, + "errors": 0, + "permanent_failures": 0, + "last_error": "", + "attempts_per_hour": 35.406, + "successes_per_hour": 35.406, + "stage_error_rate": 0.0 + }, + "compose": { + "attempts": 11, + "successes": 11, + "errors": 0, + "permanent_failures": 0, + "last_error": "", + "attempts_per_hour": 35.406, + "successes_per_hour": 35.406, + "stage_error_rate": 0.0 + }, + "detect": { + "attempts": 14, + "successes": 10, + "errors": 0, + "permanent_failures": 0, + "last_error": "", + "attempts_per_hour": 45.063, + "successes_per_hour": 32.188, + "stage_error_rate": 0.0 + }, + "reference": { + "attempts": 10, + "successes": 10, + "errors": 0, + "permanent_failures": 0, + "last_error": "", + "attempts_per_hour": 32.188, + "successes_per_hour": 32.188, + "stage_error_rate": 0.0 + }, + "scene_pool": { + "attempts": 0, + "successes": 0, + "errors": 4, + "permanent_failures": 1, + "last_error": "RuntimeError: detection incomplete after 3 VLM attempts: missing 1/14 planned subjects: illuminated_store_sign", + "attempts_per_hour": 0.0, + "successes_per_hour": 0.0, + "stage_error_rate": 4.0 + }, + "emit": { + "attempts": 10, + "successes": 10, + "errors": 0, + "permanent_failures": 0, + "last_error": "", + "attempts_per_hour": 32.188, + "successes_per_hour": 32.188, + "stage_error_rate": 0.0 + } + }, + "total_attempts": 56, + "total_errors": 4, + "error_rate": 0.0714, + "recent_events": [ + { + "time": 1782223644.793541, + "stage": "scene_pool", + "error": "RuntimeError: detection incomplete after 3 VLM attempts: missing 1/14 planned subjects: illuminated_store_sign", + "permanent": false + }, + { + "time": 1782223712.302779, + "stage": "scene_pool", + "error": "RuntimeError: detection incomplete after 3 VLM attempts: missing 1/14 planned subjects: illuminated_store_sign", + "permanent": false + }, + { + "time": 1782223796.685505, + "stage": "scene_pool", + "error": "RuntimeError: detection incomplete after 3 VLM attempts: missing 1/14 planned subjects: illuminated_store_sign", + "permanent": false + }, + { + "time": 1782223834.0102983, + "stage": "scene_pool", + "error": "RuntimeError: detection incomplete after 3 VLM attempts: missing 1/14 planned subjects: illuminated_store_sign", + "permanent": true + } + ], + "row_count": 10, + "pool_counts": { + "vocab_task_pool": { + "pending": 0, + "in_progress": 0, + "done": 11, + "failed": 0 + }, + "plan_pool": { + "pending": 0, + "in_progress": 0, + "done": 11, + "failed": 0 + }, + "scene_pool": { + "pending": 0, + "in_progress": 0, + "done": 10, + "failed": 1 + }, + "detection_pool": { + "pending": 0, + "in_progress": 0, + "done": 10, + "failed": 0 + }, + "reference_pool": { + "pending": 0, + "in_progress": 0, + "done": 10, + "failed": 0 + }, + "emit_pool": { + "pending": 0, + "in_progress": 0, + "done": 10, + "failed": 0 + } + } +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000001.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000001.json new file mode 100644 index 0000000000000000000000000000000000000000..14d87a9d99b25e7b13e4665d260ef816fa5db084 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000001.json @@ -0,0 +1,8 @@ +{ + "sample_id": "sample_000001", + "task_path": "sample_000001/vocab_task.json", + "item_id": "sample_000001", + "pool": "vocab_task_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000002.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000002.json new file mode 100644 index 0000000000000000000000000000000000000000..a04e497e7efb153dd1e225039f23aa0b126dee1c --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000002.json @@ -0,0 +1,8 @@ +{ + "sample_id": "sample_000002", + "task_path": "sample_000002/vocab_task.json", + "item_id": "sample_000002", + "pool": "vocab_task_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000003.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000003.json new file mode 100644 index 0000000000000000000000000000000000000000..169df4be1c8cb90c6996d4dc44374cad4b3a6603 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000003.json @@ -0,0 +1,8 @@ +{ + "sample_id": "sample_000003", + "task_path": "sample_000003/vocab_task.json", + "item_id": "sample_000003", + "pool": "vocab_task_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000004.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000004.json new file mode 100644 index 0000000000000000000000000000000000000000..893a53cacead4612460e6e5108985f3c41963e56 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000004.json @@ -0,0 +1,8 @@ +{ + "sample_id": "sample_000004", + "task_path": "sample_000004/vocab_task.json", + "item_id": "sample_000004", + "pool": "vocab_task_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000005.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000005.json new file mode 100644 index 0000000000000000000000000000000000000000..2518d4a0db8aecf1b0650ffdec9932f8f6bfac5a --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000005.json @@ -0,0 +1,8 @@ +{ + "sample_id": "sample_000005", + "task_path": "sample_000005/vocab_task.json", + "item_id": "sample_000005", + "pool": "vocab_task_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000006.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000006.json new file mode 100644 index 0000000000000000000000000000000000000000..2cdb892c59f209cda1a9d5c2c7308ebc857c06fa --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000006.json @@ -0,0 +1,8 @@ +{ + "sample_id": "sample_000006", + "task_path": "sample_000006/vocab_task.json", + "item_id": "sample_000006", + "pool": "vocab_task_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000007.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000007.json new file mode 100644 index 0000000000000000000000000000000000000000..bfcbb8a053b1c5624cea0ab51d2a6c7e6410e47d --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000007.json @@ -0,0 +1,8 @@ +{ + "sample_id": "sample_000007", + "task_path": "sample_000007/vocab_task.json", + "item_id": "sample_000007", + "pool": "vocab_task_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000008.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000008.json new file mode 100644 index 0000000000000000000000000000000000000000..9bfd3d2a2e9f585229217f20ef5a1cbf32cf771a --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000008.json @@ -0,0 +1,8 @@ +{ + "sample_id": "sample_000008", + "task_path": "sample_000008/vocab_task.json", + "item_id": "sample_000008", + "pool": "vocab_task_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000009.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000009.json new file mode 100644 index 0000000000000000000000000000000000000000..961f18edd93a499789dfc998e18e95ff951d47b3 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000009.json @@ -0,0 +1,8 @@ +{ + "sample_id": "sample_000009", + "task_path": "sample_000009/vocab_task.json", + "item_id": "sample_000009", + "pool": "vocab_task_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000010.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000010.json new file mode 100644 index 0000000000000000000000000000000000000000..0059f5f064cfc8ca21f7de4b2d22205efe9ca6cf --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000010.json @@ -0,0 +1,8 @@ +{ + "sample_id": "sample_000010", + "task_path": "sample_000010/vocab_task.json", + "item_id": "sample_000010", + "pool": "vocab_task_pool", + "retry_count": 0, + "errors": [] +} diff --git a/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000011.json b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000011.json new file mode 100644 index 0000000000000000000000000000000000000000..9d100735805b3b24b3183d7a949ff455b69916b9 --- /dev/null +++ b/samples_v8/driving/BDD100K_CrowdHuman_samples/vocab_task_pool/done/sample_000011.json @@ -0,0 +1,8 @@ +{ + "sample_id": "sample_000011", + "task_path": "sample_000011/vocab_task.json", + "item_id": "sample_000011", + "pool": "vocab_task_pool", + "retry_count": 0, + "errors": [] +}