kimi000 committed on
Commit
588c4ad
·
verified ·
1 Parent(s): d4118e5

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +150 -0
  2. samples_v8/driving/BDD100K_CrowdHuman_samples/README.md +16 -0
  3. samples_v8/driving/BDD100K_CrowdHuman_samples/dataset.json +0 -0
  4. samples_v8/driving/BDD100K_CrowdHuman_samples/dataset.jsonl +0 -0
  5. samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000001.json +16 -0
  6. samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000002.json +16 -0
  7. samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000003.json +16 -0
  8. samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000004.json +16 -0
  9. samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000005.json +16 -0
  10. samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000006.json +16 -0
  11. samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000008.json +16 -0
  12. samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000009.json +16 -0
  13. samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000010.json +16 -0
  14. samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000011.json +16 -0
  15. samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000001.json +4 -0
  16. samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000002.json +4 -0
  17. samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000003.json +4 -0
  18. samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000004.json +4 -0
  19. samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000005.json +4 -0
  20. samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000006.json +4 -0
  21. samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000008.json +4 -0
  22. samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000009.json +4 -0
  23. samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000010.json +4 -0
  24. samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000011.json +4 -0
  25. samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000001.json +14 -0
  26. samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000002.json +14 -0
  27. samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000003.json +14 -0
  28. samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000004.json +14 -0
  29. samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000005.json +14 -0
  30. samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000006.json +14 -0
  31. samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000007.json +14 -0
  32. samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000008.json +14 -0
  33. samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000009.json +14 -0
  34. samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000010.json +14 -0
  35. samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000011.json +14 -0
  36. samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000001.json +18 -0
  37. samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000002.json +18 -0
  38. samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000003.json +18 -0
  39. samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000004.json +18 -0
  40. samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000005.json +18 -0
  41. samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000006.json +18 -0
  42. samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000008.json +18 -0
  43. samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000009.json +18 -0
  44. samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000010.json +18 -0
  45. samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000011.json +18 -0
  46. samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000001.json +164 -0
  47. samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000002.json +716 -0
  48. samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000003.json +164 -0
  49. samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000004.json +256 -0
  50. samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000005.json +302 -0
.gitattributes CHANGED
@@ -52,3 +52,153 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
52
  10samples/sample_0007/overlays/overlay_accepted.png filter=lfs diff=lfs merge=lfs -text
53
  10samples/sample_0007/overlays/overlay_intended.png filter=lfs diff=lfs merge=lfs -text
54
  10samples/sample_0007/overlays/overlay_measured.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  10samples/sample_0007/overlays/overlay_accepted.png filter=lfs diff=lfs merge=lfs -text
53
  10samples/sample_0007/overlays/overlay_intended.png filter=lfs diff=lfs merge=lfs -text
54
  10samples/sample_0007/overlays/overlay_measured.png filter=lfs diff=lfs merge=lfs -text
55
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text
56
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_metal_barrier.png filter=lfs diff=lfs merge=lfs -text
57
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_parked_dark_car.png filter=lfs diff=lfs merge=lfs -text
58
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/detect_refine_pedestrian.png filter=lfs diff=lfs merge=lfs -text
59
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_metal_barrier.png filter=lfs diff=lfs merge=lfs -text
60
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_parked_dark_car.png filter=lfs diff=lfs merge=lfs -text
61
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/crops/diversify_input_pedestrian.png filter=lfs diff=lfs merge=lfs -text
62
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/main_image.png filter=lfs diff=lfs merge=lfs -text
63
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_metal_barrier.png filter=lfs diff=lfs merge=lfs -text
64
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_parked_dark_car.png filter=lfs diff=lfs merge=lfs -text
65
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000001/references/ref_pedestrian.png filter=lfs diff=lfs merge=lfs -text
66
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text
67
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_street_trees.png filter=lfs diff=lfs merge=lfs -text
68
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_twilight_sky.png filter=lfs diff=lfs merge=lfs -text
69
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/detect_refine_vehicle_dashboard.png filter=lfs diff=lfs merge=lfs -text
70
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_double_yellow_lines.png filter=lfs diff=lfs merge=lfs -text
71
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_street_trees.png filter=lfs diff=lfs merge=lfs -text
72
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/crops/diversify_input_twilight_sky.png filter=lfs diff=lfs merge=lfs -text
73
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/main_image.png filter=lfs diff=lfs merge=lfs -text
74
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_city_buildings.png filter=lfs diff=lfs merge=lfs -text
75
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_dark_car_left.png filter=lfs diff=lfs merge=lfs -text
76
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_dark_suv_driving.png filter=lfs diff=lfs merge=lfs -text
77
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_double_yellow_lines.png filter=lfs diff=lfs merge=lfs -text
78
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_parked_suv_right.png filter=lfs diff=lfs merge=lfs -text
79
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pedestrian_left.png filter=lfs diff=lfs merge=lfs -text
80
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pedestrian_right.png filter=lfs diff=lfs merge=lfs -text
81
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_pink_scooter.png filter=lfs diff=lfs merge=lfs -text
82
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_storefront_sign.png filter=lfs diff=lfs merge=lfs -text
83
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_light.png filter=lfs diff=lfs merge=lfs -text
84
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_signs.png filter=lfs diff=lfs merge=lfs -text
85
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_street_trees.png filter=lfs diff=lfs merge=lfs -text
86
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_twilight_sky.png filter=lfs diff=lfs merge=lfs -text
87
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_vehicle_dashboard.png filter=lfs diff=lfs merge=lfs -text
88
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000002/references/ref_white_car_ahead.png filter=lfs diff=lfs merge=lfs -text
89
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text
90
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/detect_refine_black_sedan.png filter=lfs diff=lfs merge=lfs -text
91
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/detect_refine_silver_car.png filter=lfs diff=lfs merge=lfs -text
92
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_black_sedan.png filter=lfs diff=lfs merge=lfs -text
93
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_shopper.png filter=lfs diff=lfs merge=lfs -text
94
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/crops/diversify_input_silver_car.png filter=lfs diff=lfs merge=lfs -text
95
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/main_image.png filter=lfs diff=lfs merge=lfs -text
96
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_black_sedan.png filter=lfs diff=lfs merge=lfs -text
97
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_shopper.png filter=lfs diff=lfs merge=lfs -text
98
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000003/references/ref_silver_car.png filter=lfs diff=lfs merge=lfs -text
99
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text
100
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_dark_parked_car.png filter=lfs diff=lfs merge=lfs -text
101
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_plain_delivery_truck.png filter=lfs diff=lfs merge=lfs -text
102
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/detect_refine_street_lines.png filter=lfs diff=lfs merge=lfs -text
103
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_dark_parked_car.png filter=lfs diff=lfs merge=lfs -text
104
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_plain_delivery_truck.png filter=lfs diff=lfs merge=lfs -text
105
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/crops/diversify_input_street_lines.png filter=lfs diff=lfs merge=lfs -text
106
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/main_image.png filter=lfs diff=lfs merge=lfs -text
107
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_dark_parked_car.png filter=lfs diff=lfs merge=lfs -text
108
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_pedestrian_walker.png filter=lfs diff=lfs merge=lfs -text
109
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_plain_delivery_truck.png filter=lfs diff=lfs merge=lfs -text
110
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_red_traffic_light.png filter=lfs diff=lfs merge=lfs -text
111
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000004/references/ref_street_lines.png filter=lfs diff=lfs merge=lfs -text
112
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text
113
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_silver_car.png filter=lfs diff=lfs merge=lfs -text
114
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/detect_refine_street_trees.png filter=lfs diff=lfs merge=lfs -text
115
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/crops/diversify_input_silver_car.png filter=lfs diff=lfs merge=lfs -text
116
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/main_image.png filter=lfs diff=lfs merge=lfs -text
117
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_man_in_suit.png filter=lfs diff=lfs merge=lfs -text
118
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_person_yellow_top.png filter=lfs diff=lfs merge=lfs -text
119
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_silver_car.png filter=lfs diff=lfs merge=lfs -text
120
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_street_trees.png filter=lfs diff=lfs merge=lfs -text
121
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_traffic_light.png filter=lfs diff=lfs merge=lfs -text
122
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000005/references/ref_young_girl.png filter=lfs diff=lfs merge=lfs -text
123
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text
124
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_blooming_trees.png filter=lfs diff=lfs merge=lfs -text
125
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_double_solid_white_line.png filter=lfs diff=lfs merge=lfs -text
126
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/detect_refine_white_panel_van.png filter=lfs diff=lfs merge=lfs -text
127
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_blooming_trees.png filter=lfs diff=lfs merge=lfs -text
128
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_double_solid_white_line.png filter=lfs diff=lfs merge=lfs -text
129
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/crops/diversify_input_white_panel_van.png filter=lfs diff=lfs merge=lfs -text
130
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/main_image.png filter=lfs diff=lfs merge=lfs -text
131
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_blooming_trees.png filter=lfs diff=lfs merge=lfs -text
132
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_curbside_trash_can.png filter=lfs diff=lfs merge=lfs -text
133
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_double_solid_white_line.png filter=lfs diff=lfs merge=lfs -text
134
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_emergency_vehicle.png filter=lfs diff=lfs merge=lfs -text
135
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_green_street_sign.png filter=lfs diff=lfs merge=lfs -text
136
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_museum_banner.png filter=lfs diff=lfs merge=lfs -text
137
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_pedestrian.png filter=lfs diff=lfs merge=lfs -text
138
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000006/references/ref_white_panel_van.png filter=lfs diff=lfs merge=lfs -text
139
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_dashboard.png filter=lfs diff=lfs merge=lfs -text
140
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_multi_story_building_left.png filter=lfs diff=lfs merge=lfs -text
141
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_overhead_wires.png filter=lfs diff=lfs merge=lfs -text
142
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_red_brick_building.png filter=lfs diff=lfs merge=lfs -text
143
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/crops/detect_refine_street_light_pole.png filter=lfs diff=lfs merge=lfs -text
144
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000007/main_image.png filter=lfs diff=lfs merge=lfs -text
145
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text
146
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_dashboard_reflection.png filter=lfs diff=lfs merge=lfs -text
147
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/detect_refine_passenger.png filter=lfs diff=lfs merge=lfs -text
148
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_dashboard_reflection.png filter=lfs diff=lfs merge=lfs -text
149
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_passenger.png filter=lfs diff=lfs merge=lfs -text
150
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_pedestrian_crossing.png filter=lfs diff=lfs merge=lfs -text
151
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/crops/diversify_input_shopper_waiting.png filter=lfs diff=lfs merge=lfs -text
152
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/main_image.png filter=lfs diff=lfs merge=lfs -text
153
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_businessman.png filter=lfs diff=lfs merge=lfs -text
154
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_dashboard_reflection.png filter=lfs diff=lfs merge=lfs -text
155
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_passenger.png filter=lfs diff=lfs merge=lfs -text
156
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_pedestrian_crossing.png filter=lfs diff=lfs merge=lfs -text
157
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_pedestrian_walking_away.png filter=lfs diff=lfs merge=lfs -text
158
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_protester.png filter=lfs diff=lfs merge=lfs -text
159
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_shopper_standing.png filter=lfs diff=lfs merge=lfs -text
160
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_shopper_waiting.png filter=lfs diff=lfs merge=lfs -text
161
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_street_lamp.png filter=lfs diff=lfs merge=lfs -text
162
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000008/references/ref_young_man.png filter=lfs diff=lfs merge=lfs -text
163
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text
164
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_overpass.png filter=lfs diff=lfs merge=lfs -text
165
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_yellow_building.png filter=lfs diff=lfs merge=lfs -text
166
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/detect_refine_yellow_lane_line.png filter=lfs diff=lfs merge=lfs -text
167
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_overpass.png filter=lfs diff=lfs merge=lfs -text
168
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_yellow_building.png filter=lfs diff=lfs merge=lfs -text
169
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/crops/diversify_input_yellow_lane_line.png filter=lfs diff=lfs merge=lfs -text
170
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/main_image.png filter=lfs diff=lfs merge=lfs -text
171
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_overpass.png filter=lfs diff=lfs merge=lfs -text
172
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_pedestrian_in_suit.png filter=lfs diff=lfs merge=lfs -text
173
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_street_light.png filter=lfs diff=lfs merge=lfs -text
174
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_yellow_building.png filter=lfs diff=lfs merge=lfs -text
175
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000009/references/ref_yellow_lane_line.png filter=lfs diff=lfs merge=lfs -text
176
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text
177
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/detect_refine_black_suv.png filter=lfs diff=lfs merge=lfs -text
178
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_black_suv.png filter=lfs diff=lfs merge=lfs -text
179
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/crops/diversify_input_pedestrian_black_jacket.png filter=lfs diff=lfs merge=lfs -text
180
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/main_image.png filter=lfs diff=lfs merge=lfs -text
181
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_black_suv.png filter=lfs diff=lfs merge=lfs -text
182
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_distant_pedestrian.png filter=lfs diff=lfs merge=lfs -text
183
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_man_pink_shirt.png filter=lfs diff=lfs merge=lfs -text
184
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_backpack.png filter=lfs diff=lfs merge=lfs -text
185
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_black_jacket.png filter=lfs diff=lfs merge=lfs -text
186
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_light_blue_shirt.png filter=lfs diff=lfs merge=lfs -text
187
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_light_jacket.png filter=lfs diff=lfs merge=lfs -text
188
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_red_jacket.png filter=lfs diff=lfs merge=lfs -text
189
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000010/references/ref_pedestrian_striped_shirt.png filter=lfs diff=lfs merge=lfs -text
190
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/bbox_overlay.png filter=lfs diff=lfs merge=lfs -text
191
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/detect_refine_ego_car_dashboard.png filter=lfs diff=lfs merge=lfs -text
192
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_ego_car_dashboard.png filter=lfs diff=lfs merge=lfs -text
193
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/crops/diversify_input_windshield_mount.png filter=lfs diff=lfs merge=lfs -text
194
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/main_image.png filter=lfs diff=lfs merge=lfs -text
195
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_background_street_light.png filter=lfs diff=lfs merge=lfs -text
196
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_blonde_woman.png filter=lfs diff=lfs merge=lfs -text
197
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_ego_car_dashboard.png filter=lfs diff=lfs merge=lfs -text
198
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_metal_structure.png filter=lfs diff=lfs merge=lfs -text
199
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_oncoming_white_car.png filter=lfs diff=lfs merge=lfs -text
200
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_overhead_streetlights.png filter=lfs diff=lfs merge=lfs -text
201
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_silver_car.png filter=lfs diff=lfs merge=lfs -text
202
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_vintage_street_lamp.png filter=lfs diff=lfs merge=lfs -text
203
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_white_suv.png filter=lfs diff=lfs merge=lfs -text
204
+ samples_v8/driving/BDD100K_CrowdHuman_samples/sample_000011/references/ref_windshield_mount.png filter=lfs diff=lfs merge=lfs -text
samples_v8/driving/BDD100K_CrowdHuman_samples/README.md ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # samples_v8
2
+
3
+ Generated with `data_recipe_v8.md`: vocabulary-first planning, adaptive canvas selection, structured JSON compose prompts, no identity verification, no gate, SAM white-background reference postprocessing, and strict reference-completeness verification with regenerate-until-pass behavior.
4
+
5
+ - chat model: `gcp/google/gemini-3.1-pro-preview`
6
+ - image model: `gcp/google/gemini-3-pro-image-preview`
7
+ - people references: `white_bg_full_body_front`
8
+ - non-person references: `white_bg_encyclopedia_photo`
9
+ - SAM postprocess: every generated reference is segmented with `sam_vit_b` and pasted onto pure `#ffffff` background
10
+ - reference verify max attempts per subject: `10`
11
+ - allowed canvases: `[{"aspect_ratio": "1:1", "size": [1024, 1024], "style": "photorealistic"}, {"aspect_ratio": "4:3", "size": [1152, 864], "style": "photorealistic"}, {"aspect_ratio": "3:4", "size": [864, 1152], "style": "photorealistic"}, {"aspect_ratio": "3:2", "size": [1248, 832], "style": "photorealistic"}, {"aspect_ratio": "2:3", "size": [832, 1248], "style": "photorealistic"}, {"aspect_ratio": "16:9", "size": [1280, 720], "style": "photorealistic"}, {"aspect_ratio": "9:16", "size": [720, 1280], "style": "photorealistic"}]`
12
+ - scenario mode: `driving`
13
+ - pools: `vocab_task_pool`, `plan_pool`, `scene_pool`, `detection_pool`, `reference_pool`
14
+ - bbox overlay: `bbox_overlay.png` draws every planned subject bbox; a sample is rejected and regenerated if any planned subject is still missing after VLM detection retries
15
+ - detection max attempts per subject: `3`
16
+ - launch args: `{"compose_workers": 3, "detect_max_attempts": 3, "detect_workers": 3, "emit_workers": 4, "idle_sleep": 1.0, "image_inflight": 32, "image_interval": 0.05, "image_max_retries": 8, "max_retries": 3, "no_topup": false, "plan_workers": 6, "ref_verify_max_attempts": 10, "reference_workers": 6, "requeue_in_progress": true, "seed": 1781927993, "status_interval": 30.0, "subject_detect_workers": 24, "target_samples": 10}`
samples_v8/driving/BDD100K_CrowdHuman_samples/dataset.json ADDED
The diff for this file is too large to render. See raw diff
 
samples_v8/driving/BDD100K_CrowdHuman_samples/dataset.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000001.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000001",
3
+ "plan_path": "sample_000001/plan.json",
4
+ "task_path": "sample_000001/vocab_task.json",
5
+ "main_image": "sample_000001/main_image.png",
6
+ "detections": "sample_000001/detections.json",
7
+ "n_detected": 3,
8
+ "model_ids": {
9
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
10
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
11
+ },
12
+ "item_id": "sample_000001",
13
+ "pool": "detection_pool",
14
+ "retry_count": 0,
15
+ "errors": []
16
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000002.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000002",
3
+ "plan_path": "sample_000002/plan.json",
4
+ "task_path": "sample_000002/vocab_task.json",
5
+ "main_image": "sample_000002/main_image.png",
6
+ "detections": "sample_000002/detections.json",
7
+ "n_detected": 15,
8
+ "model_ids": {
9
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
10
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
11
+ },
12
+ "item_id": "sample_000002",
13
+ "pool": "detection_pool",
14
+ "retry_count": 0,
15
+ "errors": []
16
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000003.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000003",
3
+ "plan_path": "sample_000003/plan.json",
4
+ "task_path": "sample_000003/vocab_task.json",
5
+ "main_image": "sample_000003/main_image.png",
6
+ "detections": "sample_000003/detections.json",
7
+ "n_detected": 3,
8
+ "model_ids": {
9
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
10
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
11
+ },
12
+ "item_id": "sample_000003",
13
+ "pool": "detection_pool",
14
+ "retry_count": 0,
15
+ "errors": []
16
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000004.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000004",
3
+ "plan_path": "sample_000004/plan.json",
4
+ "task_path": "sample_000004/vocab_task.json",
5
+ "main_image": "sample_000004/main_image.png",
6
+ "detections": "sample_000004/detections.json",
7
+ "n_detected": 5,
8
+ "model_ids": {
9
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
10
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
11
+ },
12
+ "item_id": "sample_000004",
13
+ "pool": "detection_pool",
14
+ "retry_count": 0,
15
+ "errors": []
16
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000005.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000005",
3
+ "plan_path": "sample_000005/plan.json",
4
+ "task_path": "sample_000005/vocab_task.json",
5
+ "main_image": "sample_000005/main_image.png",
6
+ "detections": "sample_000005/detections.json",
7
+ "n_detected": 6,
8
+ "model_ids": {
9
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
10
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
11
+ },
12
+ "item_id": "sample_000005",
13
+ "pool": "detection_pool",
14
+ "retry_count": 0,
15
+ "errors": []
16
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000006.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000006",
3
+ "plan_path": "sample_000006/plan.json",
4
+ "task_path": "sample_000006/vocab_task.json",
5
+ "main_image": "sample_000006/main_image.png",
6
+ "detections": "sample_000006/detections.json",
7
+ "n_detected": 8,
8
+ "model_ids": {
9
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
10
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
11
+ },
12
+ "item_id": "sample_000006",
13
+ "pool": "detection_pool",
14
+ "retry_count": 0,
15
+ "errors": []
16
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000008.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000008",
3
+ "plan_path": "sample_000008/plan.json",
4
+ "task_path": "sample_000008/vocab_task.json",
5
+ "main_image": "sample_000008/main_image.png",
6
+ "detections": "sample_000008/detections.json",
7
+ "n_detected": 10,
8
+ "model_ids": {
9
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
10
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
11
+ },
12
+ "item_id": "sample_000008",
13
+ "pool": "detection_pool",
14
+ "retry_count": 0,
15
+ "errors": []
16
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000009.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000009",
3
+ "plan_path": "sample_000009/plan.json",
4
+ "task_path": "sample_000009/vocab_task.json",
5
+ "main_image": "sample_000009/main_image.png",
6
+ "detections": "sample_000009/detections.json",
7
+ "n_detected": 5,
8
+ "model_ids": {
9
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
10
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
11
+ },
12
+ "item_id": "sample_000009",
13
+ "pool": "detection_pool",
14
+ "retry_count": 0,
15
+ "errors": []
16
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000010.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000010",
3
+ "plan_path": "sample_000010/plan.json",
4
+ "task_path": "sample_000010/vocab_task.json",
5
+ "main_image": "sample_000010/main_image.png",
6
+ "detections": "sample_000010/detections.json",
7
+ "n_detected": 9,
8
+ "model_ids": {
9
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
10
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
11
+ },
12
+ "item_id": "sample_000010",
13
+ "pool": "detection_pool",
14
+ "retry_count": 0,
15
+ "errors": []
16
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/detection_pool/done/sample_000011.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000011",
3
+ "plan_path": "sample_000011/plan.json",
4
+ "task_path": "sample_000011/vocab_task.json",
5
+ "main_image": "sample_000011/main_image.png",
6
+ "detections": "sample_000011/detections.json",
7
+ "n_detected": 10,
8
+ "model_ids": {
9
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
10
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
11
+ },
12
+ "item_id": "sample_000011",
13
+ "pool": "detection_pool",
14
+ "retry_count": 0,
15
+ "errors": []
16
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000001.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000001",
3
+ "row": "sample_000001/row.json"
4
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000002.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000002",
3
+ "row": "sample_000002/row.json"
4
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000003.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000003",
3
+ "row": "sample_000003/row.json"
4
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000004.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000004",
3
+ "row": "sample_000004/row.json"
4
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000005.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000005",
3
+ "row": "sample_000005/row.json"
4
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000006.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000006",
3
+ "row": "sample_000006/row.json"
4
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000008.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000008",
3
+ "row": "sample_000008/row.json"
4
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000009.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000009",
3
+ "row": "sample_000009/row.json"
4
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000010.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000010",
3
+ "row": "sample_000010/row.json"
4
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/emit_pool/done/sample_000011.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000011",
3
+ "row": "sample_000011/row.json"
4
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000001.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000001",
3
+ "plan_path": "sample_000001/plan.json",
4
+ "task_path": "sample_000001/vocab_task.json",
5
+ "prompt_hash": "ee63c678fc09a67bb20b6d08e8ef2b19732be3312cf3b823e6d07e1c5dd44f63",
6
+ "model_ids": {
7
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
8
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
9
+ },
10
+ "item_id": "sample_000001",
11
+ "pool": "plan_pool",
12
+ "retry_count": 0,
13
+ "errors": []
14
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000002.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000002",
3
+ "plan_path": "sample_000002/plan.json",
4
+ "task_path": "sample_000002/vocab_task.json",
5
+ "prompt_hash": "c3e70d0e58500cbcc95ef2a96d5d6793951917c615216803cc4c45dcc2a3a379",
6
+ "model_ids": {
7
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
8
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
9
+ },
10
+ "item_id": "sample_000002",
11
+ "pool": "plan_pool",
12
+ "retry_count": 0,
13
+ "errors": []
14
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000003.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000003",
3
+ "plan_path": "sample_000003/plan.json",
4
+ "task_path": "sample_000003/vocab_task.json",
5
+ "prompt_hash": "117c1932f1edb3ce9fdfb3e81dfacb7b09d1402b748b361e9e812444f5375b35",
6
+ "model_ids": {
7
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
8
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
9
+ },
10
+ "item_id": "sample_000003",
11
+ "pool": "plan_pool",
12
+ "retry_count": 0,
13
+ "errors": []
14
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000004.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000004",
3
+ "plan_path": "sample_000004/plan.json",
4
+ "task_path": "sample_000004/vocab_task.json",
5
+ "prompt_hash": "6960fecb1d8acad95182b833cb50f6e2533d54c0b80d08771e7bc3b42d40e3d8",
6
+ "model_ids": {
7
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
8
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
9
+ },
10
+ "item_id": "sample_000004",
11
+ "pool": "plan_pool",
12
+ "retry_count": 0,
13
+ "errors": []
14
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000005.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000005",
3
+ "plan_path": "sample_000005/plan.json",
4
+ "task_path": "sample_000005/vocab_task.json",
5
+ "prompt_hash": "a48c0aa92c5c1cd47384926ff0c246c81330b81484f9df4241df125f4b568141",
6
+ "model_ids": {
7
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
8
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
9
+ },
10
+ "item_id": "sample_000005",
11
+ "pool": "plan_pool",
12
+ "retry_count": 0,
13
+ "errors": []
14
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000006.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000006",
3
+ "plan_path": "sample_000006/plan.json",
4
+ "task_path": "sample_000006/vocab_task.json",
5
+ "prompt_hash": "d570570e4c1353a40b5e3c9c048efe37edf22c7e2a5e5977e66dc49033cbd19d",
6
+ "model_ids": {
7
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
8
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
9
+ },
10
+ "item_id": "sample_000006",
11
+ "pool": "plan_pool",
12
+ "retry_count": 0,
13
+ "errors": []
14
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000007.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000007",
3
+ "plan_path": "sample_000007/plan.json",
4
+ "task_path": "sample_000007/vocab_task.json",
5
+ "prompt_hash": "9bdbac530e2f6fdaa0bef9408c2676077dc19d2a8c0de7167594ca7912f19985",
6
+ "model_ids": {
7
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
8
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
9
+ },
10
+ "item_id": "sample_000007",
11
+ "pool": "plan_pool",
12
+ "retry_count": 0,
13
+ "errors": []
14
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000008.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000008",
3
+ "plan_path": "sample_000008/plan.json",
4
+ "task_path": "sample_000008/vocab_task.json",
5
+ "prompt_hash": "587eff1134028954e2fb620b54fe1638b5a6fcbd3fe47f6dbdbd07d717459d81",
6
+ "model_ids": {
7
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
8
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
9
+ },
10
+ "item_id": "sample_000008",
11
+ "pool": "plan_pool",
12
+ "retry_count": 0,
13
+ "errors": []
14
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000009.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000009",
3
+ "plan_path": "sample_000009/plan.json",
4
+ "task_path": "sample_000009/vocab_task.json",
5
+ "prompt_hash": "a012f98a386adac92d188c68bde72a52c6cbb0dcfd864a08e70d6f08881a15de",
6
+ "model_ids": {
7
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
8
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
9
+ },
10
+ "item_id": "sample_000009",
11
+ "pool": "plan_pool",
12
+ "retry_count": 0,
13
+ "errors": []
14
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000010.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000010",
3
+ "plan_path": "sample_000010/plan.json",
4
+ "task_path": "sample_000010/vocab_task.json",
5
+ "prompt_hash": "0ac9cdba25e93fc329b5dd2d566e77d7add5c4dffbf9c53d4f9b8998e0b21917",
6
+ "model_ids": {
7
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
8
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
9
+ },
10
+ "item_id": "sample_000010",
11
+ "pool": "plan_pool",
12
+ "retry_count": 0,
13
+ "errors": []
14
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/plan_pool/done/sample_000011.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000011",
3
+ "plan_path": "sample_000011/plan.json",
4
+ "task_path": "sample_000011/vocab_task.json",
5
+ "prompt_hash": "9b1f48926b835cfec59e58b4564016b13af9e8f32197b9bfe4a85cddce267178",
6
+ "model_ids": {
7
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
8
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
9
+ },
10
+ "item_id": "sample_000011",
11
+ "pool": "plan_pool",
12
+ "retry_count": 0,
13
+ "errors": []
14
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000001.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000001",
3
+ "plan_path": "sample_000001/plan.json",
4
+ "task_path": "sample_000001/vocab_task.json",
5
+ "main_image": "sample_000001/main_image.png",
6
+ "detections": "sample_000001/detections.json",
7
+ "references": "sample_000001/references.json",
8
+ "n_references": 3,
9
+ "reference_errors": {},
10
+ "model_ids": {
11
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
12
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
13
+ },
14
+ "item_id": "sample_000001",
15
+ "pool": "reference_pool",
16
+ "retry_count": 0,
17
+ "errors": []
18
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000002.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000002",
3
+ "plan_path": "sample_000002/plan.json",
4
+ "task_path": "sample_000002/vocab_task.json",
5
+ "main_image": "sample_000002/main_image.png",
6
+ "detections": "sample_000002/detections.json",
7
+ "references": "sample_000002/references.json",
8
+ "n_references": 15,
9
+ "reference_errors": {},
10
+ "model_ids": {
11
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
12
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
13
+ },
14
+ "item_id": "sample_000002",
15
+ "pool": "reference_pool",
16
+ "retry_count": 0,
17
+ "errors": []
18
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000003.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000003",
3
+ "plan_path": "sample_000003/plan.json",
4
+ "task_path": "sample_000003/vocab_task.json",
5
+ "main_image": "sample_000003/main_image.png",
6
+ "detections": "sample_000003/detections.json",
7
+ "references": "sample_000003/references.json",
8
+ "n_references": 3,
9
+ "reference_errors": {},
10
+ "model_ids": {
11
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
12
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
13
+ },
14
+ "item_id": "sample_000003",
15
+ "pool": "reference_pool",
16
+ "retry_count": 0,
17
+ "errors": []
18
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000004.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000004",
3
+ "plan_path": "sample_000004/plan.json",
4
+ "task_path": "sample_000004/vocab_task.json",
5
+ "main_image": "sample_000004/main_image.png",
6
+ "detections": "sample_000004/detections.json",
7
+ "references": "sample_000004/references.json",
8
+ "n_references": 5,
9
+ "reference_errors": {},
10
+ "model_ids": {
11
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
12
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
13
+ },
14
+ "item_id": "sample_000004",
15
+ "pool": "reference_pool",
16
+ "retry_count": 0,
17
+ "errors": []
18
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000005.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000005",
3
+ "plan_path": "sample_000005/plan.json",
4
+ "task_path": "sample_000005/vocab_task.json",
5
+ "main_image": "sample_000005/main_image.png",
6
+ "detections": "sample_000005/detections.json",
7
+ "references": "sample_000005/references.json",
8
+ "n_references": 6,
9
+ "reference_errors": {},
10
+ "model_ids": {
11
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
12
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
13
+ },
14
+ "item_id": "sample_000005",
15
+ "pool": "reference_pool",
16
+ "retry_count": 0,
17
+ "errors": []
18
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000006.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000006",
3
+ "plan_path": "sample_000006/plan.json",
4
+ "task_path": "sample_000006/vocab_task.json",
5
+ "main_image": "sample_000006/main_image.png",
6
+ "detections": "sample_000006/detections.json",
7
+ "references": "sample_000006/references.json",
8
+ "n_references": 8,
9
+ "reference_errors": {},
10
+ "model_ids": {
11
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
12
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
13
+ },
14
+ "item_id": "sample_000006",
15
+ "pool": "reference_pool",
16
+ "retry_count": 0,
17
+ "errors": []
18
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000008.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000008",
3
+ "plan_path": "sample_000008/plan.json",
4
+ "task_path": "sample_000008/vocab_task.json",
5
+ "main_image": "sample_000008/main_image.png",
6
+ "detections": "sample_000008/detections.json",
7
+ "references": "sample_000008/references.json",
8
+ "n_references": 10,
9
+ "reference_errors": {},
10
+ "model_ids": {
11
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
12
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
13
+ },
14
+ "item_id": "sample_000008",
15
+ "pool": "reference_pool",
16
+ "retry_count": 0,
17
+ "errors": []
18
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000009.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000009",
3
+ "plan_path": "sample_000009/plan.json",
4
+ "task_path": "sample_000009/vocab_task.json",
5
+ "main_image": "sample_000009/main_image.png",
6
+ "detections": "sample_000009/detections.json",
7
+ "references": "sample_000009/references.json",
8
+ "n_references": 5,
9
+ "reference_errors": {},
10
+ "model_ids": {
11
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
12
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
13
+ },
14
+ "item_id": "sample_000009",
15
+ "pool": "reference_pool",
16
+ "retry_count": 0,
17
+ "errors": []
18
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000010.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000010",
3
+ "plan_path": "sample_000010/plan.json",
4
+ "task_path": "sample_000010/vocab_task.json",
5
+ "main_image": "sample_000010/main_image.png",
6
+ "detections": "sample_000010/detections.json",
7
+ "references": "sample_000010/references.json",
8
+ "n_references": 9,
9
+ "reference_errors": {},
10
+ "model_ids": {
11
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
12
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
13
+ },
14
+ "item_id": "sample_000010",
15
+ "pool": "reference_pool",
16
+ "retry_count": 0,
17
+ "errors": []
18
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/reference_pool/done/sample_000011.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000011",
3
+ "plan_path": "sample_000011/plan.json",
4
+ "task_path": "sample_000011/vocab_task.json",
5
+ "main_image": "sample_000011/main_image.png",
6
+ "detections": "sample_000011/detections.json",
7
+ "references": "sample_000011/references.json",
8
+ "n_references": 10,
9
+ "reference_errors": {},
10
+ "model_ids": {
11
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
12
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
13
+ },
14
+ "item_id": "sample_000011",
15
+ "pool": "reference_pool",
16
+ "retry_count": 0,
17
+ "errors": []
18
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000001.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000001",
3
+ "target_total": 3,
4
+ "target_people": 1,
5
+ "target_objects": 2,
6
+ "canvas_size": [
7
+ 1248,
8
+ 832
9
+ ],
10
+ "canvas_aspect_ratio": "3:2",
11
+ "main_image": "main_image.png",
12
+ "bbox_overlay": "bbox_overlay.png",
13
+ "plan": "plan.json",
14
+ "detections": "detections.json",
15
+ "vocab_task": "vocab_task.json",
16
+ "n_planned": 3,
17
+ "n_detected": 3,
18
+ "n_subjects": 3,
19
+ "subjects": [
20
+ {
21
+ "name": "pedestrian",
22
+ "is_person": true,
23
+ "subject_type": "person",
24
+ "source_set": "people_set",
25
+ "source_image_id": "CrowdHuman:data/data_69/273278,12fc4700013112375.jpg:person:3",
26
+ "source_name": "pedestrian",
27
+ "source_description": "A person wearing a dark coat and trousers. Source dataset: CrowdHuman. Scene context: A bustling city street lined with trees showcasing vibrant yellow autumn foliage, with many pedestrians walking in both directions.",
28
+ "sub_caption": "pedestrian: A person wearing a dark coat and trousers.. Scene role: Walking on the sidewalk alongside the street.",
29
+ "measured_bbox": [
30
+ 0.177,
31
+ 0.0,
32
+ 0.3091,
33
+ 0.4552
34
+ ],
35
+ "detection_confidence": 0.95,
36
+ "ref_style": "white_bg_full_body_front",
37
+ "ref_image": "references/ref_pedestrian.png",
38
+ "raw_ref_image": "references/raw_ref_pedestrian_attempt_01.png",
39
+ "reference_verify": "references/reference_verify_pedestrian.json",
40
+ "reference_verify_passed": true,
41
+ "reference_attempts": 1,
42
+ "sam_white_bg": {
43
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_pedestrian_attempt_01.png",
44
+ "output": "references/ref_pedestrian.png",
45
+ "mask": "references/sam_mask_pedestrian.png",
46
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
47
+ "sam_model_type": "vit_b",
48
+ "sam_device": "auto",
49
+ "sam_working_size": [
50
+ 640,
51
+ 640
52
+ ],
53
+ "sam_max_side": 640,
54
+ "sam_downscale": 0.625,
55
+ "prompt_box_xyxy": [
56
+ 332.0,
57
+ 40.0,
58
+ 693.0,
59
+ 999.0
60
+ ],
61
+ "mask_score": 3.438137,
62
+ "mask_area_ratio": 0.157722,
63
+ "elapsed_seconds": 8.8351
64
+ }
65
+ },
66
+ {
67
+ "name": "parked_dark_car",
68
+ "is_person": false,
69
+ "subject_type": "object",
70
+ "source_set": "obj_set",
71
+ "source_image_id": "BDD100K:bcb356f6-520dd65c:object:9",
72
+ "source_name": "parked dark car",
73
+ "source_description": "A dark-colored sedan parked on the right side of the street, behind the silver car. Source dataset: BDD100K. Scene context: A large white New York City bus is driving down a city street on an overcast day with other cars parked and driving around it.",
74
+ "sub_caption": "parked dark car: A dark-colored sedan.. Scene role: Parked on the right side of the street next to the curb.",
75
+ "measured_bbox": [
76
+ 0.5856,
77
+ 0.0522,
78
+ 0.9973,
79
+ 0.6586
80
+ ],
81
+ "detection_confidence": 0.98,
82
+ "ref_style": "white_bg_encyclopedia_photo",
83
+ "ref_image": "references/ref_parked_dark_car.png",
84
+ "raw_ref_image": "references/raw_ref_parked_dark_car_attempt_02.png",
85
+ "reference_verify": "references/reference_verify_parked_dark_car.json",
86
+ "reference_verify_passed": true,
87
+ "reference_attempts": 2,
88
+ "sam_white_bg": {
89
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_parked_dark_car_attempt_02.png",
90
+ "output": "references/ref_parked_dark_car.png",
91
+ "mask": "references/sam_mask_parked_dark_car.png",
92
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
93
+ "sam_model_type": "vit_b",
94
+ "sam_device": "auto",
95
+ "sam_working_size": [
96
+ 640,
97
+ 640
98
+ ],
99
+ "sam_max_side": 640,
100
+ "sam_downscale": 0.625,
101
+ "prompt_box_xyxy": [
102
+ 17.0,
103
+ 427.0,
104
+ 1006.0,
105
+ 796.0
106
+ ],
107
+ "mask_score": 3.312519,
108
+ "mask_area_ratio": 0.186911,
109
+ "elapsed_seconds": 8.4991
110
+ }
111
+ },
112
+ {
113
+ "name": "metal_barrier",
114
+ "is_person": false,
115
+ "subject_type": "object",
116
+ "source_set": "obj_set",
117
+ "source_image_id": "CrowdHuman:data/data_74/284193,1da20000b642be5b.jpg:object:5",
118
+ "source_name": "metal barrier",
119
+ "source_description": "silver metal barricade placed near the entrance Source dataset: CrowdHuman. Scene context: People are gathered outside the entrance of a stone building with arched doorways and large windows.",
120
+ "sub_caption": "metal barrier: A silver metal barricade.. Scene role: Placed along the edge of the sidewalk near the parked car, separating the walkway from the street.",
121
+ "measured_bbox": [
122
+ 0.0,
123
+ 0.1355,
124
+ 0.6068,
125
+ 0.558
126
+ ],
127
+ "detection_confidence": 0.95,
128
+ "ref_style": "white_bg_encyclopedia_photo",
129
+ "ref_image": "references/ref_metal_barrier.png",
130
+ "raw_ref_image": "references/raw_ref_metal_barrier_attempt_01.png",
131
+ "reference_verify": "references/reference_verify_metal_barrier.json",
132
+ "reference_verify_passed": true,
133
+ "reference_attempts": 1,
134
+ "sam_white_bg": {
135
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000001/references/raw_ref_metal_barrier_attempt_01.png",
136
+ "output": "references/ref_metal_barrier.png",
137
+ "mask": "references/sam_mask_metal_barrier.png",
138
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
139
+ "sam_model_type": "vit_b",
140
+ "sam_device": "auto",
141
+ "sam_working_size": [
142
+ 640,
143
+ 640
144
+ ],
145
+ "sam_max_side": 640,
146
+ "sam_downscale": 0.625,
147
+ "prompt_box_xyxy": [
148
+ 37.0,
149
+ 201.0,
150
+ 1011.0,
151
+ 889.0
152
+ ],
153
+ "mask_score": 2.936982,
154
+ "mask_area_ratio": 0.305722,
155
+ "elapsed_seconds": 9.8709
156
+ }
157
+ }
158
+ ],
159
+ "not_emitted": [],
160
+ "model_ids": {
161
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
162
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
163
+ }
164
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000002.json ADDED
@@ -0,0 +1,716 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000002",
3
+ "target_total": 15,
4
+ "target_people": 2,
5
+ "target_objects": 13,
6
+ "canvas_size": [
7
+ 1280,
8
+ 720
9
+ ],
10
+ "canvas_aspect_ratio": "16:9",
11
+ "main_image": "main_image.png",
12
+ "bbox_overlay": "bbox_overlay.png",
13
+ "plan": "plan.json",
14
+ "detections": "detections.json",
15
+ "vocab_task": "vocab_task.json",
16
+ "n_planned": 15,
17
+ "n_detected": 15,
18
+ "n_subjects": 15,
19
+ "subjects": [
20
+ {
21
+ "name": "pedestrian_right",
22
+ "is_person": true,
23
+ "subject_type": "person",
24
+ "source_set": "people_set",
25
+ "source_image_id": "BDD100K:c5d864fa-b0b2380b:person:2",
26
+ "source_name": "pedestrian",
27
+ "source_description": "Person walking away from the camera on the right sidewalk, wearing dark clothing. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a wet city street lined with tall buildings, with pedestrians on the sidewalk and several cars and taxis ahead in the rain.",
28
+ "sub_caption": "pedestrian: Person walking away from the camera on the right sidewalk, wearing dark clothing.. Scene role: Walking along the right-hand sidewalk under the trees.",
29
+ "measured_bbox": [
30
+ 0.8872,
31
+ 0.491,
32
+ 0.9451,
33
+ 0.6701
34
+ ],
35
+ "detection_confidence": 0.95,
36
+ "ref_style": "white_bg_full_body_front",
37
+ "ref_image": "references/ref_pedestrian_right.png",
38
+ "raw_ref_image": "references/raw_ref_pedestrian_right_attempt_01.png",
39
+ "reference_verify": "references/reference_verify_pedestrian_right.json",
40
+ "reference_verify_passed": true,
41
+ "reference_attempts": 1,
42
+ "sam_white_bg": {
43
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_right_attempt_01.png",
44
+ "output": "references/ref_pedestrian_right.png",
45
+ "mask": "references/sam_mask_pedestrian_right.png",
46
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
47
+ "sam_model_type": "vit_b",
48
+ "sam_device": "auto",
49
+ "sam_working_size": [
50
+ 640,
51
+ 640
52
+ ],
53
+ "sam_max_side": 640,
54
+ "sam_downscale": 0.625,
55
+ "prompt_box_xyxy": [
56
+ 345.0,
57
+ 55.0,
58
+ 678.0,
59
+ 982.0
60
+ ],
61
+ "mask_score": 3.462354,
62
+ "mask_area_ratio": 0.14014,
63
+ "elapsed_seconds": 8.2387
64
+ }
65
+ },
66
+ {
67
+ "name": "pedestrian_left",
68
+ "is_person": true,
69
+ "subject_type": "person",
70
+ "source_set": "people_set",
71
+ "source_image_id": "BDD100K:b714a088-861a043b:person:2",
72
+ "source_name": "pedestrian",
73
+ "source_description": "another person near the shop entrance on the left, partially obscured Source dataset: BDD100K. Scene context: a city street scene from the perspective of a vehicle, showing multiple cars in motion, buildings lining the road, and pedestrians on the sidewalks during dusk or early evening",
74
+ "sub_caption": "pedestrian: A person near a shop entrance on the left, partially obscured by shadows.. Scene role: Standing near a building entrance on the left side of the street.",
75
+ "measured_bbox": [
76
+ 0.1301,
77
+ 0.5154,
78
+ 0.1517,
79
+ 0.611
80
+ ],
81
+ "detection_confidence": 0.95,
82
+ "ref_style": "white_bg_full_body_front",
83
+ "ref_image": "references/ref_pedestrian_left.png",
84
+ "raw_ref_image": "references/raw_ref_pedestrian_left_attempt_01.png",
85
+ "reference_verify": "references/reference_verify_pedestrian_left.json",
86
+ "reference_verify_passed": true,
87
+ "reference_attempts": 1,
88
+ "sam_white_bg": {
89
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pedestrian_left_attempt_01.png",
90
+ "output": "references/ref_pedestrian_left.png",
91
+ "mask": "references/sam_mask_pedestrian_left.png",
92
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
93
+ "sam_model_type": "vit_b",
94
+ "sam_device": "auto",
95
+ "sam_working_size": [
96
+ 640,
97
+ 640
98
+ ],
99
+ "sam_max_side": 640,
100
+ "sam_downscale": 0.625,
101
+ "prompt_box_xyxy": [
102
+ 342.0,
103
+ 67.0,
104
+ 681.0,
105
+ 996.0
106
+ ],
107
+ "mask_score": 3.481605,
108
+ "mask_area_ratio": 0.150858,
109
+ "elapsed_seconds": 8.1403
110
+ }
111
+ },
112
+ {
113
+ "name": "city_buildings",
114
+ "is_person": false,
115
+ "subject_type": "object",
116
+ "source_set": "obj_set",
117
+ "source_image_id": "CrowdHuman:data/data_9/283554,31eeb000e9237b31.jpg:object:9",
118
+ "source_name": "building",
119
+ "source_description": "Various city buildings of different heights forming the skyline in the background. Source dataset: CrowdHuman. Scene context: People are walking and resting on the wooden walkway of a large suspension bridge with a city skyline in the background.",
120
+ "sub_caption": "building: Various city buildings of different heights forming the urban landscape along the street.. Scene role: Lining the street and forming the architectural background on both sides.",
121
+ "measured_bbox": [
122
+ 0.3358,
123
+ 0.3425,
124
+ 0.4929,
125
+ 0.5277
126
+ ],
127
+ "detection_confidence": 0.95,
128
+ "ref_style": "white_bg_encyclopedia_photo",
129
+ "ref_image": "references/ref_city_buildings.png",
130
+ "raw_ref_image": "references/raw_ref_city_buildings_attempt_01.png",
131
+ "reference_verify": "references/reference_verify_city_buildings.json",
132
+ "reference_verify_passed": true,
133
+ "reference_attempts": 1,
134
+ "sam_white_bg": {
135
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_city_buildings_attempt_01.png",
136
+ "output": "references/ref_city_buildings.png",
137
+ "mask": "references/sam_mask_city_buildings.png",
138
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
139
+ "sam_model_type": "vit_b",
140
+ "sam_device": "auto",
141
+ "sam_working_size": [
142
+ 640,
143
+ 640
144
+ ],
145
+ "sam_max_side": 640,
146
+ "sam_downscale": 0.625,
147
+ "prompt_box_xyxy": [
148
+ 27.0,
149
+ 178.0,
150
+ 996.0,
151
+ 865.0
152
+ ],
153
+ "mask_score": 3.420089,
154
+ "mask_area_ratio": 0.463421,
155
+ "elapsed_seconds": 8.2735
156
+ }
157
+ },
158
+ {
159
+ "name": "pink_scooter",
160
+ "is_person": false,
161
+ "subject_type": "object",
162
+ "source_set": "obj_set",
163
+ "source_image_id": "CrowdHuman:data/data_66/283991,a60c0001c08f63b.jpg:object:3",
164
+ "source_name": "pink scooter",
165
+ "source_description": "A prominent pink step-through style motor scooter. Source dataset: CrowdHuman. Scene context: A busy city intersection with many people riding scooters and some cars in the background.",
166
+ "sub_caption": "pink scooter: A prominent pink step-through style motor scooter.. Scene role: Parked on the right sidewalk near the street signs.",
167
+ "measured_bbox": [
168
+ 0.677,
169
+ 0.56,
170
+ 0.7935,
171
+ 0.7095
172
+ ],
173
+ "detection_confidence": 0.95,
174
+ "ref_style": "white_bg_encyclopedia_photo",
175
+ "ref_image": "references/ref_pink_scooter.png",
176
+ "raw_ref_image": "references/raw_ref_pink_scooter_attempt_01.png",
177
+ "reference_verify": "references/reference_verify_pink_scooter.json",
178
+ "reference_verify_passed": true,
179
+ "reference_attempts": 1,
180
+ "sam_white_bg": {
181
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_pink_scooter_attempt_01.png",
182
+ "output": "references/ref_pink_scooter.png",
183
+ "mask": "references/sam_mask_pink_scooter.png",
184
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
185
+ "sam_model_type": "vit_b",
186
+ "sam_device": "auto",
187
+ "sam_working_size": [
188
+ 640,
189
+ 640
190
+ ],
191
+ "sam_max_side": 640,
192
+ "sam_downscale": 0.625,
193
+ "prompt_box_xyxy": [
194
+ 41.0,
195
+ 120.0,
196
+ 982.0,
197
+ 920.0
198
+ ],
199
+ "mask_score": 3.414017,
200
+ "mask_area_ratio": 0.259921,
201
+ "elapsed_seconds": 8.1841
202
+ }
203
+ },
204
+ {
205
+ "name": "street_signs",
206
+ "is_person": false,
207
+ "subject_type": "object",
208
+ "source_set": "obj_set",
209
+ "source_image_id": "BDD100K:b5ab0e46-8eab4733:object:8",
210
+ "source_name": "street signs",
211
+ "source_description": "Various street signs attached to a pole on the right side of the street, including a speed limit sign. Source dataset: BDD100K. Scene context: A narrow city street with several parked and moving box trucks and vans, bordered by multi-story buildings and construction barriers.",
212
+ "sub_caption": "street signs: Various street signs attached to a metal pole on the right side of the street.. Scene role: Mounted on a pole alongside the road on the right.",
213
+ "measured_bbox": [
214
+ 0.8162,
215
+ 0.2869,
216
+ 0.8575,
217
+ 0.4063
218
+ ],
219
+ "detection_confidence": 0.99,
220
+ "ref_style": "white_bg_encyclopedia_photo",
221
+ "ref_image": "references/ref_street_signs.png",
222
+ "raw_ref_image": "references/raw_ref_street_signs_attempt_01.png",
223
+ "reference_verify": "references/reference_verify_street_signs.json",
224
+ "reference_verify_passed": true,
225
+ "reference_attempts": 1,
226
+ "sam_white_bg": {
227
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_signs_attempt_01.png",
228
+ "output": "references/ref_street_signs.png",
229
+ "mask": "references/sam_mask_street_signs.png",
230
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
231
+ "sam_model_type": "vit_b",
232
+ "sam_device": "auto",
233
+ "sam_working_size": [
234
+ 640,
235
+ 640
236
+ ],
237
+ "sam_max_side": 640,
238
+ "sam_downscale": 0.625,
239
+ "prompt_box_xyxy": [
240
+ 226.0,
241
+ 71.0,
242
+ 707.0,
243
+ 967.0
244
+ ],
245
+ "mask_score": 3.475593,
246
+ "mask_area_ratio": 0.25818,
247
+ "elapsed_seconds": 9.8621
248
+ }
249
+ },
250
+ {
251
+ "name": "storefront_sign",
252
+ "is_person": false,
253
+ "subject_type": "object",
254
+ "source_set": "obj_set",
255
+ "source_image_id": "CrowdHuman:data/data_4/273275,46a6f0005d04fc24.jpg:object:7",
256
+ "source_name": "storefront sign",
257
+ "source_description": "A dark sign with white text 'DELI' and a smaller red 'ATM' sign beneath it on the left. Source dataset: CrowdHuman. Scene context: A densely crowded city street with numerous pedestrians walking towards the camera, while emergency vehicles with flashing lights are visible in the background traffic.",
258
+ "sub_caption": "storefront sign: A dark, illuminated sign structure above a shop entrance.. Scene role: Hanging above a shop entrance on the left side of the street, illuminating the adjacent pedestrian.",
259
+ "measured_bbox": [
260
+ 0.1052,
261
+ 0.4218,
262
+ 0.186,
263
+ 0.4781
264
+ ],
265
+ "detection_confidence": 0.95,
266
+ "ref_style": "white_bg_encyclopedia_photo",
267
+ "ref_image": "references/ref_storefront_sign.png",
268
+ "raw_ref_image": "references/raw_ref_storefront_sign_attempt_01.png",
269
+ "reference_verify": "references/reference_verify_storefront_sign.json",
270
+ "reference_verify_passed": true,
271
+ "reference_attempts": 1,
272
+ "sam_white_bg": {
273
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_storefront_sign_attempt_01.png",
274
+ "output": "references/ref_storefront_sign.png",
275
+ "mask": "references/sam_mask_storefront_sign.png",
276
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
277
+ "sam_model_type": "vit_b",
278
+ "sam_device": "auto",
279
+ "sam_working_size": [
280
+ 640,
281
+ 640
282
+ ],
283
+ "sam_max_side": 640,
284
+ "sam_downscale": 0.625,
285
+ "prompt_box_xyxy": [
286
+ 107.0,
287
+ 274.0,
288
+ 900.0,
289
+ 749.0
290
+ ],
291
+ "mask_score": 3.354337,
292
+ "mask_area_ratio": 0.167885,
293
+ "elapsed_seconds": 8.1782
294
+ }
295
+ },
296
+ {
297
+ "name": "parked_suv_right",
298
+ "is_person": false,
299
+ "subject_type": "object",
300
+ "source_set": "obj_set",
301
+ "source_image_id": "BDD100K:c54441e6-400c221e:object:4",
302
+ "source_name": "parked SUV",
303
+ "source_description": "Dark-colored SUV parked ahead of the sedan on the right side of the road. Source dataset: BDD100K. Scene context: Nighttime driving scene approaching an intersection with green traffic lights and parked cars on the right.",
304
+ "sub_caption": "parked SUV: Dark-colored SUV parked on the right side of the road.. Scene role: Parked parallel to the curb on the right side of the street.",
305
+ "measured_bbox": [
306
+ 0.6057,
307
+ 0.5099,
308
+ 0.7451,
309
+ 0.6703
310
+ ],
311
+ "detection_confidence": 0.95,
312
+ "ref_style": "white_bg_encyclopedia_photo",
313
+ "ref_image": "references/ref_parked_suv_right.png",
314
+ "raw_ref_image": "references/raw_ref_parked_suv_right_attempt_01.png",
315
+ "reference_verify": "references/reference_verify_parked_suv_right.json",
316
+ "reference_verify_passed": true,
317
+ "reference_attempts": 1,
318
+ "sam_white_bg": {
319
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_parked_suv_right_attempt_01.png",
320
+ "output": "references/ref_parked_suv_right.png",
321
+ "mask": "references/sam_mask_parked_suv_right.png",
322
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
323
+ "sam_model_type": "vit_b",
324
+ "sam_device": "auto",
325
+ "sam_working_size": [
326
+ 640,
327
+ 640
328
+ ],
329
+ "sam_max_side": 640,
330
+ "sam_downscale": 0.625,
331
+ "prompt_box_xyxy": [
332
+ 52.0,
333
+ 216.0,
334
+ 993.0,
335
+ 835.0
336
+ ],
337
+ "mask_score": 3.459027,
338
+ "mask_area_ratio": 0.361156,
339
+ "elapsed_seconds": 10.1865
340
+ }
341
+ },
342
+ {
343
+ "name": "dark_car_left",
344
+ "is_person": false,
345
+ "subject_type": "object",
346
+ "source_set": "obj_set",
347
+ "source_image_id": "BDD100K:bb2e43e4-5e7a7129:object:6",
348
+ "source_name": "car",
349
+ "source_description": "A dark car parked along the left curb further ahead. Source dataset: BDD100K. Scene context: A dashcam view from a vehicle driving down a multi-lane road on a partly cloudy day, with a few other cars and pedestrians on the sidewalk.",
350
+ "sub_caption": "car: A dark car parked along the left curb further ahead.. Scene role: Parked alongside the left curb.",
351
+ "measured_bbox": [
352
+ 0.2139,
353
+ 0.5323,
354
+ 0.3044,
355
+ 0.6201
356
+ ],
357
+ "detection_confidence": 0.95,
358
+ "ref_style": "white_bg_encyclopedia_photo",
359
+ "ref_image": "references/ref_dark_car_left.png",
360
+ "raw_ref_image": "references/raw_ref_dark_car_left_attempt_01.png",
361
+ "reference_verify": "references/reference_verify_dark_car_left.json",
362
+ "reference_verify_passed": true,
363
+ "reference_attempts": 1,
364
+ "sam_white_bg": {
365
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_car_left_attempt_01.png",
366
+ "output": "references/ref_dark_car_left.png",
367
+ "mask": "references/sam_mask_dark_car_left.png",
368
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
369
+ "sam_model_type": "vit_b",
370
+ "sam_device": "auto",
371
+ "sam_working_size": [
372
+ 640,
373
+ 640
374
+ ],
375
+ "sam_max_side": 640,
376
+ "sam_downscale": 0.625,
377
+ "prompt_box_xyxy": [
378
+ 42.0,
379
+ 237.0,
380
+ 982.0,
381
+ 794.0
382
+ ],
383
+ "mask_score": 3.479099,
384
+ "mask_area_ratio": 0.30617,
385
+ "elapsed_seconds": 8.2274
386
+ }
387
+ },
388
+ {
389
+ "name": "dark_suv_driving",
390
+ "is_person": false,
391
+ "subject_type": "object",
392
+ "source_set": "obj_set",
393
+ "source_image_id": "BDD100K:c889c950-865ca5b6:object:0",
394
+ "source_name": "dark SUV",
395
+ "source_description": "A dark SUV traveling in the left lane, with visible red taillights. Source dataset: BDD100K. Scene context: Nighttime driving on a highway with several cars visible ahead, illuminated mainly by taillights and headlights.",
396
+ "sub_caption": "dark SUV: A dark SUV traveling in the left lane, with visible glowing red taillights.. Scene role: Driving ahead of the camera vehicle in the opposing or adjacent left lane.",
397
+ "measured_bbox": [
398
+ 0.3005,
399
+ 0.5101,
400
+ 0.4179,
401
+ 0.6508
402
+ ],
403
+ "detection_confidence": 0.98,
404
+ "ref_style": "white_bg_encyclopedia_photo",
405
+ "ref_image": "references/ref_dark_suv_driving.png",
406
+ "raw_ref_image": "references/raw_ref_dark_suv_driving_attempt_01.png",
407
+ "reference_verify": "references/reference_verify_dark_suv_driving.json",
408
+ "reference_verify_passed": true,
409
+ "reference_attempts": 1,
410
+ "sam_white_bg": {
411
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_dark_suv_driving_attempt_01.png",
412
+ "output": "references/ref_dark_suv_driving.png",
413
+ "mask": "references/sam_mask_dark_suv_driving.png",
414
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
415
+ "sam_model_type": "vit_b",
416
+ "sam_device": "auto",
417
+ "sam_working_size": [
418
+ 640,
419
+ 640
420
+ ],
421
+ "sam_max_side": 640,
422
+ "sam_downscale": 0.625,
423
+ "prompt_box_xyxy": [
424
+ 96.0,
425
+ 294.0,
426
+ 928.0,
427
+ 812.0
428
+ ],
429
+ "mask_score": 3.455576,
430
+ "mask_area_ratio": 0.251452,
431
+ "elapsed_seconds": 9.8494
432
+ }
433
+ },
434
+ {
435
+ "name": "street_light",
436
+ "is_person": false,
437
+ "subject_type": "object",
438
+ "source_set": "obj_set",
439
+ "source_image_id": "BDD100K:c84f848e-2a5e0737:object:2",
440
+ "source_name": "street light",
441
+ "source_description": "Tall pole with a bright light on top, illuminating the road from the right side. Source dataset: BDD100K. Scene context: Nighttime driving view on a multi-lane highway with streetlights illuminating the dark road ahead.",
442
+ "sub_caption": "street light: Tall pole with a bright light on top, illuminating the road.. Scene role: Casting warm light onto the street from the right-hand sidewalk.",
443
+ "measured_bbox": [
444
+ 0.8171,
445
+ 0.1755,
446
+ 0.8719,
447
+ 0.2202
448
+ ],
449
+ "detection_confidence": 0.95,
450
+ "ref_style": "white_bg_encyclopedia_photo",
451
+ "ref_image": "references/ref_street_light.png",
452
+ "raw_ref_image": "references/raw_ref_street_light_attempt_01.png",
453
+ "reference_verify": "references/reference_verify_street_light.json",
454
+ "reference_verify_passed": true,
455
+ "reference_attempts": 1,
456
+ "sam_white_bg": {
457
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_light_attempt_01.png",
458
+ "output": "references/ref_street_light.png",
459
+ "mask": "references/sam_mask_street_light.png",
460
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
461
+ "sam_model_type": "vit_b",
462
+ "sam_device": "auto",
463
+ "sam_working_size": [
464
+ 640,
465
+ 640
466
+ ],
467
+ "sam_max_side": 640,
468
+ "sam_downscale": 0.625,
469
+ "prompt_box_xyxy": [
470
+ 71.0,
471
+ 277.0,
472
+ 913.0,
473
+ 727.0
474
+ ],
475
+ "mask_score": 3.350243,
476
+ "mask_area_ratio": 0.068855,
477
+ "elapsed_seconds": 8.2963
478
+ }
479
+ },
480
+ {
481
+ "name": "vehicle_dashboard",
482
+ "is_person": false,
483
+ "subject_type": "object",
484
+ "source_set": "obj_set",
485
+ "source_image_id": "BDD100K:b5032e1d-dad95b60:object:9",
486
+ "source_name": "dashboard",
487
+ "source_description": "The dark, reflective dashboard and lower windshield area of the camera vehicle in the foreground. Source dataset: BDD100K. Scene context: View from inside a vehicle driving down a residential city street lined with trees, parked cars, and multi-story brick buildings.",
488
+ "sub_caption": "dashboard: The dark, reflective dashboard and lower windshield area of the camera vehicle.. Scene role: Occupying the bottom foreground of the image, establishing the perspective from inside the car.",
489
+ "measured_bbox": [
490
+ 0.0,
491
+ 0.9261,
492
+ 1.0,
493
+ 1.0
494
+ ],
495
+ "detection_confidence": "high",
496
+ "ref_style": "white_bg_encyclopedia_photo",
497
+ "ref_image": "references/ref_vehicle_dashboard.png",
498
+ "raw_ref_image": "references/raw_ref_vehicle_dashboard_attempt_01.png",
499
+ "reference_verify": "references/reference_verify_vehicle_dashboard.json",
500
+ "reference_verify_passed": true,
501
+ "reference_attempts": 1,
502
+ "sam_white_bg": {
503
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_vehicle_dashboard_attempt_01.png",
504
+ "output": "references/ref_vehicle_dashboard.png",
505
+ "mask": "references/sam_mask_vehicle_dashboard.png",
506
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
507
+ "sam_model_type": "vit_b",
508
+ "sam_device": "auto",
509
+ "sam_working_size": [
510
+ 640,
511
+ 640
512
+ ],
513
+ "sam_max_side": 640,
514
+ "sam_downscale": 0.625,
515
+ "prompt_box_xyxy": [
516
+ 24.0,
517
+ 317.0,
518
+ 1001.0,
519
+ 706.0
520
+ ],
521
+ "mask_score": 2.942001,
522
+ "mask_area_ratio": 0.133658,
523
+ "elapsed_seconds": 8.3645
524
+ }
525
+ },
526
+ {
527
+ "name": "white_car_ahead",
528
+ "is_person": false,
529
+ "subject_type": "object",
530
+ "source_set": "obj_set",
531
+ "source_image_id": "BDD100K:b3a102ed-6ef54f5e:object:3",
532
+ "source_name": "white car",
533
+ "source_description": "A white car visible further down the road in the right lane. Source dataset: BDD100K. Scene context: Nighttime driving scene in a city with cars stopped in traffic and a construction site on the right.",
534
+ "sub_caption": "white car: A white car visible further down the road.. Scene role: Driving away in the right lane, further in the distance.",
535
+ "measured_bbox": [
536
+ 0.4811,
537
+ 0.5382,
538
+ 0.5174,
539
+ 0.5915
540
+ ],
541
+ "detection_confidence": 0.98,
542
+ "ref_style": "white_bg_encyclopedia_photo",
543
+ "ref_image": "references/ref_white_car_ahead.png",
544
+ "raw_ref_image": "references/raw_ref_white_car_ahead_attempt_01.png",
545
+ "reference_verify": "references/reference_verify_white_car_ahead.json",
546
+ "reference_verify_passed": true,
547
+ "reference_attempts": 1,
548
+ "sam_white_bg": {
549
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_white_car_ahead_attempt_01.png",
550
+ "output": "references/ref_white_car_ahead.png",
551
+ "mask": "references/sam_mask_white_car_ahead.png",
552
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
553
+ "sam_model_type": "vit_b",
554
+ "sam_device": "auto",
555
+ "sam_working_size": [
556
+ 640,
557
+ 640
558
+ ],
559
+ "sam_max_side": 640,
560
+ "sam_downscale": 0.625,
561
+ "prompt_box_xyxy": [
562
+ 112.0,
563
+ 168.0,
564
+ 912.0,
565
+ 855.0
566
+ ],
567
+ "mask_score": 3.412999,
568
+ "mask_area_ratio": 0.338258,
569
+ "elapsed_seconds": 8.3339
570
+ }
571
+ },
572
+ {
573
+ "name": "double_yellow_lines",
574
+ "is_person": false,
575
+ "subject_type": "object",
576
+ "source_set": "obj_set",
577
+ "source_image_id": "BDD100K:c417a291-7802692d:object:8",
578
+ "source_name": "yellow lines",
579
+ "source_description": "Double yellow painted lines separating opposite directions of traffic. Source dataset: BDD100K. Scene context: A pedestrian crossing a street at a crosswalk with several cars stopped around them, with storefronts visible in the background.",
580
+ "sub_caption": "yellow lines: Double yellow painted lines separating opposite directions of traffic.. Scene role: Running down the center of the road, receding into the distance.",
581
+ "measured_bbox": [
582
+ 0.3008,
583
+ 0.5732,
584
+ 0.4776,
585
+ 0.8029
586
+ ],
587
+ "detection_confidence": 0.98,
588
+ "ref_style": "white_bg_encyclopedia_photo",
589
+ "ref_image": "references/ref_double_yellow_lines.png",
590
+ "raw_ref_image": "references/raw_ref_double_yellow_lines_attempt_01.png",
591
+ "reference_verify": "references/reference_verify_double_yellow_lines.json",
592
+ "reference_verify_passed": true,
593
+ "reference_attempts": 1,
594
+ "sam_white_bg": {
595
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_double_yellow_lines_attempt_01.png",
596
+ "output": "references/ref_double_yellow_lines.png",
597
+ "mask": "references/sam_mask_double_yellow_lines.png",
598
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
599
+ "sam_model_type": "vit_b",
600
+ "sam_device": "auto",
601
+ "sam_working_size": [
602
+ 640,
603
+ 640
604
+ ],
605
+ "sam_max_side": 640,
606
+ "sam_downscale": 0.625,
607
+ "prompt_box_xyxy": [
608
+ 0.0,
609
+ 0.0,
610
+ 1023.0,
611
+ 1023.0
612
+ ],
613
+ "mask_score": 2.141169,
614
+ "mask_area_ratio": 0.667065,
615
+ "elapsed_seconds": 8.2719
616
+ }
617
+ },
618
+ {
619
+ "name": "street_trees",
620
+ "is_person": false,
621
+ "subject_type": "object",
622
+ "source_set": "obj_set",
623
+ "source_image_id": "BDD100K:c4891df0-24371ae1:object:3",
624
+ "source_name": "trees",
625
+ "source_description": "Numerous trees with dense green and yellowish foliage lining both sides of the road. Source dataset: BDD100K. Scene context: A dark SUV drives ahead on a wet road lined with trees and a fence under a cloudy sky.",
626
+ "sub_caption": "trees: Numerous trees with dense foliage lining both sides of the road.. Scene role: Planted along the sidewalks, softening the urban environment and framing the street.",
627
+ "measured_bbox": [
628
+ 0.001,
629
+ 0.002,
630
+ 0.375,
631
+ 0.63
632
+ ],
633
+ "detection_confidence": 0.95,
634
+ "ref_style": "white_bg_encyclopedia_photo",
635
+ "ref_image": "references/ref_street_trees.png",
636
+ "raw_ref_image": "references/raw_ref_street_trees_attempt_01.png",
637
+ "reference_verify": "references/reference_verify_street_trees.json",
638
+ "reference_verify_passed": true,
639
+ "reference_attempts": 1,
640
+ "sam_white_bg": {
641
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_street_trees_attempt_01.png",
642
+ "output": "references/ref_street_trees.png",
643
+ "mask": "references/sam_mask_street_trees.png",
644
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
645
+ "sam_model_type": "vit_b",
646
+ "sam_device": "auto",
647
+ "sam_working_size": [
648
+ 640,
649
+ 640
650
+ ],
651
+ "sam_max_side": 640,
652
+ "sam_downscale": 0.625,
653
+ "prompt_box_xyxy": [
654
+ 65.0,
655
+ 64.0,
656
+ 958.0,
657
+ 969.0
658
+ ],
659
+ "mask_score": 3.478968,
660
+ "mask_area_ratio": 0.365667,
661
+ "elapsed_seconds": 8.231
662
+ }
663
+ },
664
+ {
665
+ "name": "twilight_sky",
666
+ "is_person": false,
667
+ "subject_type": "object",
668
+ "source_set": "obj_set",
669
+ "source_image_id": "BDD100K:c13c0d1f-00dfd075:object:9",
670
+ "source_name": "sky",
671
+ "source_description": "Clear twilight sky transitioning from bright near the horizon to dark blue at the top. Source dataset: BDD100K. Scene context: View from a moving vehicle looking down a multi-lane city street lined with parked cars and large trees under a clear twilight sky.",
672
+ "sub_caption": "sky: Clear twilight sky transitioning from bright near the horizon to dark blue at the top.. Scene role: Providing the atmospheric backdrop above the buildings and street.",
673
+ "measured_bbox": [
674
+ 0.116,
675
+ 0.0,
676
+ 0.714,
677
+ 0.4742
678
+ ],
679
+ "detection_confidence": 0.95,
680
+ "ref_style": "white_bg_encyclopedia_photo",
681
+ "ref_image": "references/ref_twilight_sky.png",
682
+ "raw_ref_image": "references/raw_ref_twilight_sky_attempt_01.png",
683
+ "reference_verify": "references/reference_verify_twilight_sky.json",
684
+ "reference_verify_passed": true,
685
+ "reference_attempts": 1,
686
+ "sam_white_bg": {
687
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000002/references/raw_ref_twilight_sky_attempt_01.png",
688
+ "output": "references/ref_twilight_sky.png",
689
+ "mask": "references/sam_mask_twilight_sky.png",
690
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
691
+ "sam_model_type": "vit_b",
692
+ "sam_device": "auto",
693
+ "sam_working_size": [
694
+ 640,
695
+ 640
696
+ ],
697
+ "sam_max_side": 640,
698
+ "sam_downscale": 0.625,
699
+ "prompt_box_xyxy": [
700
+ 0.0,
701
+ 208.0,
702
+ 1023.0,
703
+ 814.0
704
+ ],
705
+ "mask_score": 2.437955,
706
+ "mask_area_ratio": 0.529621,
707
+ "elapsed_seconds": 9.8292
708
+ }
709
+ }
710
+ ],
711
+ "not_emitted": [],
712
+ "model_ids": {
713
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
714
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
715
+ }
716
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000003.json ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000003",
3
+ "target_total": 3,
4
+ "target_people": 1,
5
+ "target_objects": 2,
6
+ "canvas_size": [
7
+ 1248,
8
+ 832
9
+ ],
10
+ "canvas_aspect_ratio": "3:2",
11
+ "main_image": "main_image.png",
12
+ "bbox_overlay": "bbox_overlay.png",
13
+ "plan": "plan.json",
14
+ "detections": "detections.json",
15
+ "vocab_task": "vocab_task.json",
16
+ "n_planned": 3,
17
+ "n_detected": 3,
18
+ "n_subjects": 3,
19
+ "subjects": [
20
+ {
21
+ "name": "shopper",
22
+ "is_person": true,
23
+ "subject_type": "person",
24
+ "source_set": "people_set",
25
+ "source_image_id": "CrowdHuman:data/data_12/282555,5c403000efcca35d.jpg:person:13",
26
+ "source_name": "shopper",
27
+ "source_description": "A person standing on the top level, wearing a dark top and dark pants. Source dataset: CrowdHuman. Scene context: A multi-level outdoor shopping mall with various people walking and shopping.",
28
+ "sub_caption": "shopper: A person standing and waiting, wearing a dark top and dark pants. Scene role: waiting at the crosswalk curb",
29
+ "measured_bbox": [
30
+ 0.7364,
31
+ 0.2825,
32
+ 0.8267,
33
+ 0.7222
34
+ ],
35
+ "detection_confidence": 100,
36
+ "ref_style": "white_bg_full_body_front",
37
+ "ref_image": "references/ref_shopper.png",
38
+ "raw_ref_image": "references/raw_ref_shopper_attempt_01.png",
39
+ "reference_verify": "references/reference_verify_shopper.json",
40
+ "reference_verify_passed": true,
41
+ "reference_attempts": 1,
42
+ "sam_white_bg": {
43
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_shopper_attempt_01.png",
44
+ "output": "references/ref_shopper.png",
45
+ "mask": "references/sam_mask_shopper.png",
46
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
47
+ "sam_model_type": "vit_b",
48
+ "sam_device": "auto",
49
+ "sam_working_size": [
50
+ 640,
51
+ 640
52
+ ],
53
+ "sam_max_side": 640,
54
+ "sam_downscale": 0.625,
55
+ "prompt_box_xyxy": [
56
+ 348.0,
57
+ 80.0,
58
+ 678.0,
59
+ 995.0
60
+ ],
61
+ "mask_score": 3.467753,
62
+ "mask_area_ratio": 0.132874,
63
+ "elapsed_seconds": 49.4008
64
+ }
65
+ },
66
+ {
67
+ "name": "black_sedan",
68
+ "is_person": false,
69
+ "subject_type": "object",
70
+ "source_set": "obj_set",
71
+ "source_image_id": "BDD100K:c0c183ff-1b24f541:object:7",
72
+ "source_name": "black sedan",
73
+ "source_description": "A black sedan driving in the right lane ahead of the red hatchback. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a multi-lane city street flanked by tall buildings, with various cars moving in the same direction and parked along the side.",
74
+ "sub_caption": "black sedan: A dark, modern black sedan. Scene role: driving in the nearest lane on the street",
75
+ "measured_bbox": [
76
+ 0.0883,
77
+ 0.2514,
78
+ 0.5002,
79
+ 0.449
80
+ ],
81
+ "detection_confidence": 0.99,
82
+ "ref_style": "white_bg_encyclopedia_photo",
83
+ "ref_image": "references/ref_black_sedan.png",
84
+ "raw_ref_image": "references/raw_ref_black_sedan_attempt_01.png",
85
+ "reference_verify": "references/reference_verify_black_sedan.json",
86
+ "reference_verify_passed": true,
87
+ "reference_attempts": 1,
88
+ "sam_white_bg": {
89
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_black_sedan_attempt_01.png",
90
+ "output": "references/ref_black_sedan.png",
91
+ "mask": "references/sam_mask_black_sedan.png",
92
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
93
+ "sam_model_type": "vit_b",
94
+ "sam_device": "auto",
95
+ "sam_working_size": [
96
+ 640,
97
+ 640
98
+ ],
99
+ "sam_max_side": 640,
100
+ "sam_downscale": 0.625,
101
+ "prompt_box_xyxy": [
102
+ 0.0,
103
+ 321.0,
104
+ 1023.0,
105
+ 700.0
106
+ ],
107
+ "mask_score": 2.52477,
108
+ "mask_area_ratio": 0.559944,
109
+ "elapsed_seconds": 8.5091
110
+ }
111
+ },
112
+ {
113
+ "name": "silver_car",
114
+ "is_person": false,
115
+ "subject_type": "object",
116
+ "source_set": "obj_set",
117
+ "source_image_id": "BDD100K:be49ae7a-1ffaa683:object:2",
118
+ "source_name": "silver car",
119
+ "source_description": "A silver car driving in the adjacent right lane, slightly ahead of the white car. Source dataset: BDD100K. Scene context: View from inside a vehicle driving in heavy traffic on a multi-lane road under a clear sky.",
120
+ "sub_caption": "silver car: A sleek silver car. Scene role: driving in the adjacent lane slightly ahead of the black sedan",
121
+ "measured_bbox": [
122
+ 0.3669,
123
+ 0.2463,
124
+ 0.7048,
125
+ 0.409
126
+ ],
127
+ "detection_confidence": 0.99,
128
+ "ref_style": "white_bg_encyclopedia_photo",
129
+ "ref_image": "references/ref_silver_car.png",
130
+ "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png",
131
+ "reference_verify": "references/reference_verify_silver_car.json",
132
+ "reference_verify_passed": true,
133
+ "reference_attempts": 1,
134
+ "sam_white_bg": {
135
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000003/references/raw_ref_silver_car_attempt_01.png",
136
+ "output": "references/ref_silver_car.png",
137
+ "mask": "references/sam_mask_silver_car.png",
138
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
139
+ "sam_model_type": "vit_b",
140
+ "sam_device": "auto",
141
+ "sam_working_size": [
142
+ 640,
143
+ 640
144
+ ],
145
+ "sam_max_side": 640,
146
+ "sam_downscale": 0.625,
147
+ "prompt_box_xyxy": [
148
+ 16.0,
149
+ 369.0,
150
+ 1006.0,
151
+ 693.0
152
+ ],
153
+ "mask_score": 3.457475,
154
+ "mask_area_ratio": 0.178123,
155
+ "elapsed_seconds": 9.7472
156
+ }
157
+ }
158
+ ],
159
+ "not_emitted": [],
160
+ "model_ids": {
161
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
162
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
163
+ }
164
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000004.json ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000004",
3
+ "target_total": 5,
4
+ "target_people": 1,
5
+ "target_objects": 4,
6
+ "canvas_size": [
7
+ 1248,
8
+ 832
9
+ ],
10
+ "canvas_aspect_ratio": "3:2",
11
+ "main_image": "main_image.png",
12
+ "bbox_overlay": "bbox_overlay.png",
13
+ "plan": "plan.json",
14
+ "detections": "detections.json",
15
+ "vocab_task": "vocab_task.json",
16
+ "n_planned": 5,
17
+ "n_detected": 5,
18
+ "n_subjects": 5,
19
+ "subjects": [
20
+ {
21
+ "name": "pedestrian_walker",
22
+ "is_person": true,
23
+ "subject_type": "person",
24
+ "source_set": "people_set",
25
+ "source_image_id": "CrowdHuman:data/data_51/273275,145927000354f7525.jpg:person:10",
26
+ "source_name": "walker",
27
+ "source_description": "A man walking, wearing a blue hoodie with 'EMO' written on it, grey pants, and dark shoes. Source dataset: CrowdHuman. Scene context: A group of people, some wearing matching white t-shirts, are walking and jogging along a paved path next to a road, with a grey SUV parked on a grassy hill in the background.",
28
+ "sub_caption": "walker: A man walking, wearing a plain blue hoodie, grey pants, and dark shoes.. Scene role: Walking near the crosswalk on the side of the street.",
29
+ "measured_bbox": [
30
+ 0.5948,
31
+ 0.3939,
32
+ 0.6378,
33
+ 0.5698
34
+ ],
35
+ "detection_confidence": 0.98,
36
+ "ref_style": "white_bg_full_body_front",
37
+ "ref_image": "references/ref_pedestrian_walker.png",
38
+ "raw_ref_image": "references/raw_ref_pedestrian_walker_attempt_01.png",
39
+ "reference_verify": "references/reference_verify_pedestrian_walker.json",
40
+ "reference_verify_passed": true,
41
+ "reference_attempts": 1,
42
+ "sam_white_bg": {
43
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_pedestrian_walker_attempt_01.png",
44
+ "output": "references/ref_pedestrian_walker.png",
45
+ "mask": "references/sam_mask_pedestrian_walker.png",
46
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
47
+ "sam_model_type": "vit_b",
48
+ "sam_device": "auto",
49
+ "sam_working_size": [
50
+ 640,
51
+ 640
52
+ ],
53
+ "sam_max_side": 640,
54
+ "sam_downscale": 0.625,
55
+ "prompt_box_xyxy": [
56
+ 334.0,
57
+ 56.0,
58
+ 706.0,
59
+ 996.0
60
+ ],
61
+ "mask_score": 3.43302,
62
+ "mask_area_ratio": 0.160827,
63
+ "elapsed_seconds": 9.8914
64
+ }
65
+ },
66
+ {
67
+ "name": "red_traffic_light",
68
+ "is_person": false,
69
+ "subject_type": "object",
70
+ "source_set": "obj_set",
71
+ "source_image_id": "BDD100K:b6df605f-51c158b8:object:6",
72
+ "source_name": "traffic light",
73
+ "source_description": "A set of traffic lights suspended over the intersection, showing a red light. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a city street on a sunny day with a U-Haul truck in the opposite lane, parked cars along the right curb, and pedestrians crossing a crosswalk.",
74
+ "sub_caption": "traffic light: A traffic signal suspended over the intersection, illuminated with a bright red light.. Scene role: Hanging high above the center of the intersection in the driver's line of sight.",
75
+ "measured_bbox": [
76
+ 0.4668,
77
+ 0.0722,
78
+ 0.5093,
79
+ 0.1896
80
+ ],
81
+ "detection_confidence": 0.95,
82
+ "ref_style": "white_bg_encyclopedia_photo",
83
+ "ref_image": "references/ref_red_traffic_light.png",
84
+ "raw_ref_image": "references/raw_ref_red_traffic_light_attempt_01.png",
85
+ "reference_verify": "references/reference_verify_red_traffic_light.json",
86
+ "reference_verify_passed": true,
87
+ "reference_attempts": 1,
88
+ "sam_white_bg": {
89
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_red_traffic_light_attempt_01.png",
90
+ "output": "references/ref_red_traffic_light.png",
91
+ "mask": "references/sam_mask_red_traffic_light.png",
92
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
93
+ "sam_model_type": "vit_b",
94
+ "sam_device": "auto",
95
+ "sam_working_size": [
96
+ 640,
97
+ 640
98
+ ],
99
+ "sam_max_side": 640,
100
+ "sam_downscale": 0.625,
101
+ "prompt_box_xyxy": [
102
+ 286.0,
103
+ 103.0,
104
+ 1023.0,
105
+ 893.0
106
+ ],
107
+ "mask_score": 3.25218,
108
+ "mask_area_ratio": 0.200515,
109
+ "elapsed_seconds": 8.1927
110
+ }
111
+ },
112
+ {
113
+ "name": "plain_delivery_truck",
114
+ "is_person": false,
115
+ "subject_type": "object",
116
+ "source_set": "obj_set",
117
+ "source_image_id": "BDD100K:b3a7b21a-48bcf2b8:object:2",
118
+ "source_name": "delivery truck",
119
+ "source_description": "A large white box truck parked behind the fence on the right, with visible green and purple logos. Source dataset: BDD100K. Scene context: A view from a vehicle driving down a wide, paved road flanked by an industrial area with fences and delivery trucks under a cloudy sky.",
120
+ "sub_caption": "delivery truck: A large, plain white box truck without any visible markings or graphics.. Scene role: Parked alongside the right edge of the street curb.",
121
+ "measured_bbox": [
122
+ 0.6504,
123
+ 0.2022,
124
+ 0.966,
125
+ 0.6212
126
+ ],
127
+ "detection_confidence": 0.99,
128
+ "ref_style": "white_bg_encyclopedia_photo",
129
+ "ref_image": "references/ref_plain_delivery_truck.png",
130
+ "raw_ref_image": "references/raw_ref_plain_delivery_truck_attempt_01.png",
131
+ "reference_verify": "references/reference_verify_plain_delivery_truck.json",
132
+ "reference_verify_passed": true,
133
+ "reference_attempts": 1,
134
+ "sam_white_bg": {
135
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_plain_delivery_truck_attempt_01.png",
136
+ "output": "references/ref_plain_delivery_truck.png",
137
+ "mask": "references/sam_mask_plain_delivery_truck.png",
138
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
139
+ "sam_model_type": "vit_b",
140
+ "sam_device": "auto",
141
+ "sam_working_size": [
142
+ 640,
143
+ 640
144
+ ],
145
+ "sam_max_side": 640,
146
+ "sam_downscale": 0.625,
147
+ "prompt_box_xyxy": [
148
+ 9.0,
149
+ 166.0,
150
+ 1017.0,
151
+ 852.0
152
+ ],
153
+ "mask_score": 3.45107,
154
+ "mask_area_ratio": 0.437578,
155
+ "elapsed_seconds": 10.0386
156
+ }
157
+ },
158
+ {
159
+ "name": "dark_parked_car",
160
+ "is_person": false,
161
+ "subject_type": "object",
162
+ "source_set": "obj_set",
163
+ "source_image_id": "BDD100K:bcd37eef-1b958ae3:object:5",
164
+ "source_name": "dark parked car",
165
+ "source_description": "Another dark-colored car parked on the right side of the street. Source dataset: BDD100K. Scene context: A nighttime view from a vehicle driving down a multi-lane city street with parked cars on the right and oncoming traffic on the left.",
166
+ "sub_caption": "dark parked car: A dark-colored passenger vehicle.. Scene role: Parked parallel to the curb directly behind the delivery truck.",
167
+ "measured_bbox": [
168
+ 0.8339,
169
+ 0.4566,
170
+ 0.9965,
171
+ 0.7781
172
+ ],
173
+ "detection_confidence": 0.98,
174
+ "ref_style": "white_bg_encyclopedia_photo",
175
+ "ref_image": "references/ref_dark_parked_car.png",
176
+ "raw_ref_image": "references/raw_ref_dark_parked_car_attempt_01.png",
177
+ "reference_verify": "references/reference_verify_dark_parked_car.json",
178
+ "reference_verify_passed": true,
179
+ "reference_attempts": 1,
180
+ "sam_white_bg": {
181
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_dark_parked_car_attempt_01.png",
182
+ "output": "references/ref_dark_parked_car.png",
183
+ "mask": "references/sam_mask_dark_parked_car.png",
184
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
185
+ "sam_model_type": "vit_b",
186
+ "sam_device": "auto",
187
+ "sam_working_size": [
188
+ 640,
189
+ 640
190
+ ],
191
+ "sam_max_side": 640,
192
+ "sam_downscale": 0.625,
193
+ "prompt_box_xyxy": [
194
+ 0.0,
195
+ 301.0,
196
+ 1023.0,
197
+ 694.0
198
+ ],
199
+ "mask_score": 3.113868,
200
+ "mask_area_ratio": 0.207836,
201
+ "elapsed_seconds": 8.5697
202
+ }
203
+ },
204
+ {
205
+ "name": "street_lines",
206
+ "is_person": false,
207
+ "subject_type": "object",
208
+ "source_set": "obj_set",
209
+ "source_image_id": "BDD100K:c0c9ec9a-d3638a82:object:6",
210
+ "source_name": "street lines",
211
+ "source_description": "Double yellow lines separating traffic directions and white painted lines indicating lanes and crosswalks. Source dataset: BDD100K. Scene context: A view from inside a car driving on a city street, approaching an intersection underneath an elevated railway structure.",
212
+ "sub_caption": "street lines: Double yellow center lines separating traffic directions, and solid white painted lines forming a distinct crosswalk.. Scene role: Painted on the asphalt, guiding traffic and defining the pedestrian crossing area in the foreground.",
213
+ "measured_bbox": [
214
+ 0.003,
215
+ 0.432,
216
+ 0.971,
217
+ 0.794
218
+ ],
219
+ "detection_confidence": 0.95,
220
+ "ref_style": "white_bg_encyclopedia_photo",
221
+ "ref_image": "references/ref_street_lines.png",
222
+ "raw_ref_image": "references/raw_ref_street_lines_attempt_01.png",
223
+ "reference_verify": "references/reference_verify_street_lines.json",
224
+ "reference_verify_passed": true,
225
+ "reference_attempts": 1,
226
+ "sam_white_bg": {
227
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000004/references/raw_ref_street_lines_attempt_01.png",
228
+ "output": "references/ref_street_lines.png",
229
+ "mask": "references/sam_mask_street_lines.png",
230
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
231
+ "sam_model_type": "vit_b",
232
+ "sam_device": "auto",
233
+ "sam_working_size": [
234
+ 640,
235
+ 640
236
+ ],
237
+ "sam_max_side": 640,
238
+ "sam_downscale": 0.625,
239
+ "prompt_box_xyxy": [
240
+ 384.0,
241
+ 98.0,
242
+ 639.0,
243
+ 925.0
244
+ ],
245
+ "mask_score": 3.44596,
246
+ "mask_area_ratio": 0.067441,
247
+ "elapsed_seconds": 8.1646
248
+ }
249
+ }
250
+ ],
251
+ "not_emitted": [],
252
+ "model_ids": {
253
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
254
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
255
+ }
256
+ }
samples_v8/driving/BDD100K_CrowdHuman_samples/rows/sample_000005.json ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "sample_id": "sample_000005",
3
+ "target_total": 6,
4
+ "target_people": 3,
5
+ "target_objects": 3,
6
+ "canvas_size": [
7
+ 1248,
8
+ 832
9
+ ],
10
+ "canvas_aspect_ratio": "3:2",
11
+ "main_image": "main_image.png",
12
+ "bbox_overlay": "bbox_overlay.png",
13
+ "plan": "plan.json",
14
+ "detections": "detections.json",
15
+ "vocab_task": "vocab_task.json",
16
+ "n_planned": 6,
17
+ "n_detected": 6,
18
+ "n_subjects": 6,
19
+ "subjects": [
20
+ {
21
+ "name": "person_yellow_top",
22
+ "is_person": true,
23
+ "subject_type": "person",
24
+ "source_set": "people_set",
25
+ "source_image_id": "CrowdHuman:data/data_24/282555,49a4e000d44469c6.jpg:person:49",
26
+ "source_name": "person",
27
+ "source_description": "Standing, wearing a bright yellow top Source dataset: CrowdHuman. Scene context: A large crowd of tourists is gathered in front of the Louvre museum and its iconic glass pyramid on a sunny day.",
28
+ "sub_caption": "person: Standing, wearing a bright yellow top. Scene role: Crossing the street on the crosswalk in front of the stopped silver car",
29
+ "measured_bbox": [
30
+ 0.5309,
31
+ 0.4516,
32
+ 0.5607,
33
+ 0.6301
34
+ ],
35
+ "detection_confidence": 0.95,
36
+ "ref_style": "white_bg_full_body_front",
37
+ "ref_image": "references/ref_person_yellow_top.png",
38
+ "raw_ref_image": "references/raw_ref_person_yellow_top_attempt_01.png",
39
+ "reference_verify": "references/reference_verify_person_yellow_top.json",
40
+ "reference_verify_passed": true,
41
+ "reference_attempts": 1,
42
+ "sam_white_bg": {
43
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_person_yellow_top_attempt_01.png",
44
+ "output": "references/ref_person_yellow_top.png",
45
+ "mask": "references/sam_mask_person_yellow_top.png",
46
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
47
+ "sam_model_type": "vit_b",
48
+ "sam_device": "auto",
49
+ "sam_working_size": [
50
+ 640,
51
+ 640
52
+ ],
53
+ "sam_max_side": 640,
54
+ "sam_downscale": 0.625,
55
+ "prompt_box_xyxy": [
56
+ 329.0,
57
+ 42.0,
58
+ 701.0,
59
+ 1012.0
60
+ ],
61
+ "mask_score": 3.348943,
62
+ "mask_area_ratio": 0.150169,
63
+ "elapsed_seconds": 8.25
64
+ }
65
+ },
66
+ {
67
+ "name": "man_in_suit",
68
+ "is_person": true,
69
+ "subject_type": "person",
70
+ "source_set": "people_set",
71
+ "source_image_id": "CrowdHuman:data/data_28/273278,b62280001bda6f1a.jpg:person:19",
72
+ "source_name": "crowd member",
73
+ "source_description": "A person far right in the background wearing a suit. Source dataset: CrowdHuman. Scene context: A crowd of people gathers in front of an old stone building with a prominent arched doorway and two large animal statues.",
74
+ "sub_caption": "crowd member: A person wearing a professional suit.. Scene role: Walking alongside the other pedestrians across the crosswalk",
75
+ "measured_bbox": [
76
+ 0.5767,
77
+ 0.4388,
78
+ 0.6397,
79
+ 0.6278
80
+ ],
81
+ "detection_confidence": 0.99,
82
+ "ref_style": "white_bg_full_body_front",
83
+ "ref_image": "references/ref_man_in_suit.png",
84
+ "raw_ref_image": "references/raw_ref_man_in_suit_attempt_01.png",
85
+ "reference_verify": "references/reference_verify_man_in_suit.json",
86
+ "reference_verify_passed": true,
87
+ "reference_attempts": 1,
88
+ "sam_white_bg": {
89
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_man_in_suit_attempt_01.png",
90
+ "output": "references/ref_man_in_suit.png",
91
+ "mask": "references/sam_mask_man_in_suit.png",
92
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
93
+ "sam_model_type": "vit_b",
94
+ "sam_device": "auto",
95
+ "sam_working_size": [
96
+ 640,
97
+ 640
98
+ ],
99
+ "sam_max_side": 640,
100
+ "sam_downscale": 0.625,
101
+ "prompt_box_xyxy": [
102
+ 351.0,
103
+ 27.0,
104
+ 671.0,
105
+ 1004.0
106
+ ],
107
+ "mask_score": 3.48496,
108
+ "mask_area_ratio": 0.144686,
109
+ "elapsed_seconds": 9.7885
110
+ }
111
+ },
112
+ {
113
+ "name": "young_girl",
114
+ "is_person": true,
115
+ "subject_type": "person",
116
+ "source_set": "people_set",
117
+ "source_image_id": "CrowdHuman:data/data_51/282555,57b9f000182048e6.jpg:person:13",
118
+ "source_name": "pedestrian",
119
+ "source_description": "Young girl with brown hair, wearing a blue patterned top. Source dataset: CrowdHuman. Scene context: A sunny outdoor scene featuring the red entrance arch to Navy Pier Beer Garden and a tall brick tower, with a diverse crowd of people walking along the waterfront promenade.",
120
+ "sub_caption": "pedestrian: Young girl with brown hair, wearing a blue patterned top.. Scene role: Walking across the intersection near the person in the yellow top",
121
+ "measured_bbox": [
122
+ 0.6354,
123
+ 0.4889,
124
+ 0.6677,
125
+ 0.6337
126
+ ],
127
+ "detection_confidence": 0.98,
128
+ "ref_style": "white_bg_full_body_front",
129
+ "ref_image": "references/ref_young_girl.png",
130
+ "raw_ref_image": "references/raw_ref_young_girl_attempt_01.png",
131
+ "reference_verify": "references/reference_verify_young_girl.json",
132
+ "reference_verify_passed": true,
133
+ "reference_attempts": 1,
134
+ "sam_white_bg": {
135
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_young_girl_attempt_01.png",
136
+ "output": "references/ref_young_girl.png",
137
+ "mask": "references/sam_mask_young_girl.png",
138
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
139
+ "sam_model_type": "vit_b",
140
+ "sam_device": "auto",
141
+ "sam_working_size": [
142
+ 640,
143
+ 640
144
+ ],
145
+ "sam_max_side": 640,
146
+ "sam_downscale": 0.625,
147
+ "prompt_box_xyxy": [
148
+ 369.0,
149
+ 52.0,
150
+ 661.0,
151
+ 1003.0
152
+ ],
153
+ "mask_score": 3.482282,
154
+ "mask_area_ratio": 0.133298,
155
+ "elapsed_seconds": 8.3216
156
+ }
157
+ },
158
+ {
159
+ "name": "traffic_light",
160
+ "is_person": false,
161
+ "subject_type": "object",
162
+ "source_set": "obj_set",
163
+ "source_image_id": "CrowdHuman:data/data_36/273275,6a11d000f52c34a9.jpg:object:0",
164
+ "source_name": "traffic light",
165
+ "source_description": "A black multi-lens traffic light fixture mounted on a pole above the street. Source dataset: CrowdHuman. Scene context: A male tour guide is speaking to a group of people standing on a city sidewalk next to a road crossing.",
166
+ "sub_caption": "traffic light: A black multi-lens traffic light fixture mounted on a pole above the street.. Scene role: Suspended over the intersection, showing a red light to halt the vehicles",
167
+ "measured_bbox": [
168
+ 0.5513,
169
+ 0.0408,
170
+ 0.6462,
171
+ 0.1518
172
+ ],
173
+ "detection_confidence": 0.99,
174
+ "ref_style": "white_bg_encyclopedia_photo",
175
+ "ref_image": "references/ref_traffic_light.png",
176
+ "raw_ref_image": "references/raw_ref_traffic_light_attempt_01.png",
177
+ "reference_verify": "references/reference_verify_traffic_light.json",
178
+ "reference_verify_passed": true,
179
+ "reference_attempts": 1,
180
+ "sam_white_bg": {
181
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_traffic_light_attempt_01.png",
182
+ "output": "references/ref_traffic_light.png",
183
+ "mask": "references/sam_mask_traffic_light.png",
184
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
185
+ "sam_model_type": "vit_b",
186
+ "sam_device": "auto",
187
+ "sam_working_size": [
188
+ 640,
189
+ 640
190
+ ],
191
+ "sam_max_side": 640,
192
+ "sam_downscale": 0.625,
193
+ "prompt_box_xyxy": [
194
+ 113.0,
195
+ 201.0,
196
+ 923.0,
197
+ 826.0
198
+ ],
199
+ "mask_score": 3.467034,
200
+ "mask_area_ratio": 0.289252,
201
+ "elapsed_seconds": 9.874
202
+ }
203
+ },
204
+ {
205
+ "name": "street_trees",
206
+ "is_person": false,
207
+ "subject_type": "object",
208
+ "source_set": "obj_set",
209
+ "source_image_id": "CrowdHuman:data/data_37/282555,a4aef000d9c10933.jpg:object:3",
210
+ "source_name": "trees",
211
+ "source_description": "Various green trees and shrubs lining the pathway and visible in the background gardens. Source dataset: CrowdHuman. Scene context: A large crowd of tourists walks along the pathway towards the Taj Mahal on a clear day.",
212
+ "sub_caption": "trees: Various green trees and shrubs lining the pathway and visible in the background gardens.. Scene role: Planted along the sidewalks on both sides of the street, providing urban greenery",
213
+ "measured_bbox": [
214
+ 0.542,
215
+ 0.2363,
216
+ 0.636,
217
+ 0.493
218
+ ],
219
+ "detection_confidence": 0.9,
220
+ "ref_style": "white_bg_encyclopedia_photo",
221
+ "ref_image": "references/ref_street_trees.png",
222
+ "raw_ref_image": "references/raw_ref_street_trees_attempt_03.png",
223
+ "reference_verify": "references/reference_verify_street_trees.json",
224
+ "reference_verify_passed": true,
225
+ "reference_attempts": 3,
226
+ "sam_white_bg": {
227
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_street_trees_attempt_03.png",
228
+ "output": "references/ref_street_trees.png",
229
+ "mask": "references/sam_mask_street_trees.png",
230
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
231
+ "sam_model_type": "vit_b",
232
+ "sam_device": "auto",
233
+ "sam_working_size": [
234
+ 640,
235
+ 640
236
+ ],
237
+ "sam_max_side": 640,
238
+ "sam_downscale": 0.625,
239
+ "prompt_box_xyxy": [
240
+ 22.0,
241
+ 60.0,
242
+ 1003.0,
243
+ 968.0
244
+ ],
245
+ "mask_score": 3.301958,
246
+ "mask_area_ratio": 0.393952,
247
+ "elapsed_seconds": 8.2223
248
+ }
249
+ },
250
+ {
251
+ "name": "silver_car",
252
+ "is_person": false,
253
+ "subject_type": "object",
254
+ "source_set": "obj_set",
255
+ "source_image_id": "BDD100K:be3d3a81-326a032d:object:0",
256
+ "source_name": "silver car",
257
+ "source_description": "A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on. Source dataset: BDD100K. Scene context: A rainy street scene showing cars waiting at an intersection surrounded by tall buildings and urban infrastructure.",
258
+ "sub_caption": "silver car: A silver compact hatchback car facing forward, waiting at an intersection. Its brake lights are on.. Scene role: Stopped in the traffic lane in the foreground, waiting for the pedestrians to cross",
259
+ "measured_bbox": [
260
+ 0.3062,
261
+ 0.4281,
262
+ 0.5436,
263
+ 0.7674
264
+ ],
265
+ "detection_confidence": 0.99,
266
+ "ref_style": "white_bg_encyclopedia_photo",
267
+ "ref_image": "references/ref_silver_car.png",
268
+ "raw_ref_image": "references/raw_ref_silver_car_attempt_01.png",
269
+ "reference_verify": "references/reference_verify_silver_car.json",
270
+ "reference_verify_passed": true,
271
+ "reference_attempts": 1,
272
+ "sam_white_bg": {
273
+ "input": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/driving/samples/sample_000005/references/raw_ref_silver_car_attempt_01.png",
274
+ "output": "references/ref_silver_car.png",
275
+ "mask": "references/sam_mask_silver_car.png",
276
+ "sam_checkpoint": "/lustre/fs12/portfolios/nvr/projects/nvr_elm_llm/users/hcai/workspace/code/lmlu/codex_gen_data/checkpoints/sam/sam_vit_b_01ec64.pth",
277
+ "sam_model_type": "vit_b",
278
+ "sam_device": "auto",
279
+ "sam_working_size": [
280
+ 640,
281
+ 640
282
+ ],
283
+ "sam_max_side": 640,
284
+ "sam_downscale": 0.625,
285
+ "prompt_box_xyxy": [
286
+ 80.0,
287
+ 167.0,
288
+ 957.0,
289
+ 937.0
290
+ ],
291
+ "mask_score": 3.434142,
292
+ "mask_area_ratio": 0.414005,
293
+ "elapsed_seconds": 8.3073
294
+ }
295
+ }
296
+ ],
297
+ "not_emitted": [],
298
+ "model_ids": {
299
+ "chat_model": "gcp/google/gemini-3.1-pro-preview",
300
+ "image_model": "gcp/google/gemini-3-pro-image-preview"
301
+ }
302
+ }