O96a committed on
Commit
b44db72
·
verified ·
1 Parent(s): 1fd110f

Add app.py - fix NO_APP_FILE error

Browse files
Files changed (1) hide show
  1. app.py +187 -0
app.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ CoT Spatial Reasoning Demo
3
+ Based on: "Chain-of-Thought Degrades Visual Spatial Reasoning" (arXiv:2604.16060)
4
+
5
+ This demo explores how Chain-of-Thought prompting affects spatial reasoning
6
+ capabilities in multimodal models.
7
+ """
8
+
9
+ import gradio as gr
10
+ from PIL import Image, ImageDraw
11
+ import random
12
+
13
+
14
def create_spatial_grid_puzzle():
    """Render the fixed 3x3 shape grid and describe every cell.

    Returns:
        A ``(img, shapes)`` tuple. ``img`` is a 400x400 white RGB PIL image
        with nine outlined shapes drawn on it. ``shapes`` is a list of nine
        dicts in row-major order, each with the keys ``row`` and ``col``
        (1-based), ``shape`` ("circle", "square" or "triangle") and
        ``color`` (the hex fill colour).
    """
    palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8', '#F7DC6F', '#DDA0DD', '#F0E68C', '#FFB6C1']

    canvas = Image.new('RGB', (400, 400), color='white')
    pen = ImageDraw.Draw(canvas)

    cells = []
    for index, fill in enumerate(palette):
        # Walk the grid in row-major order: index 0..8 -> (row, col).
        row, col = divmod(index, 3)
        left = 50 + col * 100
        top = 50 + row * 100
        right = left + 60
        bottom = top + 60

        # The shape kind cycles with (row + col), so equal kinds lie on the
        # same anti-diagonal of the grid.
        kind_id = (row + col) % 3
        if kind_id == 0:
            kind = "circle"
            pen.ellipse([left, top, right, bottom], fill=fill, outline='black', width=2)
        elif kind_id == 1:
            kind = "square"
            pen.rectangle([left, top, right, bottom], fill=fill, outline='black', width=2)
        else:
            kind = "triangle"
            pen.polygon([(left + 30, top), (right, bottom), (left, bottom)], fill=fill, outline='black', width=2)

        cells.append({
            "row": row + 1,
            "col": col + 1,
            "shape": kind,
            "color": fill,
        })

    return canvas, cells
47
+
48
+
49
def direct_answer(puzzle_type):
    """Simulate answering a puzzle directly, without Chain-of-Thought.

    Args:
        puzzle_type: "Center Shape" or "Corner Colors"; any other value is
            handled as the pattern-recognition puzzle.

    Returns:
        A ``(img, question, response)`` tuple: the puzzle image, the
        question text, and a Markdown string holding the direct answer.
    """
    img, shapes = create_spatial_grid_puzzle()

    if puzzle_type == "Center Shape":
        question = "What shape is in the center (row 2, column 2)?"
        # Index 4 is the middle cell of the row-major 3x3 list.
        answer = shapes[4]["shape"]
    elif puzzle_type == "Corner Colors":
        question = "How many corners contain circles?"
        corner_kinds = [shapes[k]["shape"] for k in (0, 2, 6, 8)]
        answer = str(corner_kinds.count("circle"))
    else:  # Pattern Recognition
        question = "What shape appears most frequently?"
        tally = {}
        for cell in shapes:
            kind = cell["shape"]
            tally[kind] = tally.get(kind, 0) + 1
        # On a tie, max() keeps the kind that was inserted first.
        answer = max(tally, key=tally.get)

    return img, question, f"**Direct Answer:** {answer}"
70
+
71
+
72
def cot_answer(puzzle_type):
    """Simulate a Chain-of-Thought style answer for one puzzle type.

    Args:
        puzzle_type: "Center Shape" or "Corner Colors"; any other value is
            handled as the pattern-recognition puzzle.

    Returns:
        A ``(img, question, cot)`` tuple: the puzzle image, the question
        text, and a Markdown string with numbered reasoning steps followed
        by the final answer.
    """
    img, shapes = create_spatial_grid_puzzle()
    kinds = [cell["shape"] for cell in shapes]

    if puzzle_type == "Center Shape":
        question = "What shape is in the center (row 2, column 2)?"
        # Index 4 is the middle cell of the row-major 3x3 list.
        center = kinds[4]
        lines = [
            "**CoT Reasoning:**",
            "1. The grid is 3x3, so center is at position (2,2)",
            "2. Let me trace the grid:",
            f"- Row 1: {kinds[0]}, {kinds[1]}, {kinds[2]}",
            f"- Row 2: {kinds[3]}, [CENTER], {kinds[5]}",
            f"- Row 3: {kinds[6]}, {kinds[7]}, {kinds[8]}",
            f"3. The center shape is a {center}",
            "",
            f"**Answer:** {center}",
        ]
    elif puzzle_type == "Corner Colors":
        question = "How many corners contain circles?"
        corner_shapes = [kinds[k] for k in (0, 2, 6, 8)]
        circles = corner_shapes.count("circle")
        lines = [
            "**CoT Reasoning:**",
            "1. Corners are positions: (1,1), (1,3), (3,1), (3,3)",
            f"2. Corner shapes: {', '.join(corner_shapes)}",
            f"3. Count circles: {circles}",
            "",
            f"**Answer:** {circles}",
        ]
    else:  # Pattern Recognition
        # This branch used to leave ``question`` unassigned, so the return
        # below raised UnboundLocalError. Use the same wording as
        # direct_answer so both answers refer to one question.
        question = "What shape appears most frequently?"
        counts = {}
        for kind in kinds:
            counts[kind] = counts.get(kind, 0) + 1
        # On a tie, max() keeps the kind that was inserted first.
        most_common = max(counts, key=counts.get)
        lines = [
            "**CoT Reasoning:**",
            "1. Count all shapes in grid:",
            f"- Circles: {counts.get('circle', 0)}",
            f"- Squares: {counts.get('square', 0)}",
            f"- Triangles: {counts.get('triangle', 0)}",
            f"2. Most common: {most_common}",
            "",
            f"**Answer:** {most_common}",
        ]

    cot = "\n".join(lines)
    return img, question, cot
116
+
117
+
118
def compare_both(puzzle_type):
    """Build a side-by-side report of the direct and CoT answers.

    Args:
        puzzle_type: Puzzle name forwarded unchanged to ``direct_answer``
            and ``cot_answer``.

    Returns:
        An ``(img, comparison)`` tuple: the puzzle image from the direct
        run and a Markdown report containing the question, both answers
        and a closing remark.
    """
    direct_img, direct_question, direct_text = direct_answer(puzzle_type)
    _cot_img, _cot_question, cot_text = cot_answer(puzzle_type)

    # Every section is separated from the next by exactly one blank line.
    sections = [
        f"## {puzzle_type}",
        f"**Question:** {direct_question}",
        "---",
        direct_text,
        "---",
        cot_text,
        "---",
        "**Key Insight:** CoT adds reasoning steps but may introduce errors through over-analysis of spatial relationships.",
    ]
    comparison = "\n\n".join(sections)

    return direct_img, comparison
140
+
141
+
142
# Gradio Interface
# Static text and options are kept apart from the layout code below.
_INTRO_MARKDOWN = """
# 📉 CoT Spatial Reasoning

Exploring how Chain-of-Thought affects spatial reasoning capabilities.

Based on: *"Chain-of-Thought Degrades Visual Spatial Reasoning Capabilities of Multimodal LLMs"* (arXiv:2604.16060)
"""

_FINDINGS_MARKDOWN = """
## Key Findings

The paper demonstrates that Chain-of-Thought prompting can **degrade** spatial reasoning performance:

1. **Shortcut Learning**: Models learn to follow textual patterns rather than analyze visual space
2. **Over-verbalization**: Converting visual tasks to language introduces errors
3. **Task-dependent**: Effect varies by spatial reasoning type

**Recommendation**: Use direct visual processing for pure spatial tasks.
"""

_PUZZLE_CHOICES = ["Center Shape", "Corner Colors", "Pattern Recognition"]

with gr.Blocks(title="CoT Spatial Reasoning") as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Tab 1: pick a puzzle and show the image beside the comparison report.
    with gr.Tab("Live Comparison"):
        with gr.Row():
            puzzle_select = gr.Dropdown(
                choices=_PUZZLE_CHOICES,
                value="Center Shape",
                label="Select Puzzle Type",
            )

        with gr.Row():
            with gr.Column():
                puzzle_image = gr.Image(type="pil", label="Spatial Puzzle")
            with gr.Column():
                comparison_output = gr.Markdown(label="Comparison")

        run_btn = gr.Button("Run Comparison", variant="primary")
        # compare_both returns (image, markdown), matching the two outputs.
        run_btn.click(
            fn=compare_both,
            inputs=[puzzle_select],
            outputs=[puzzle_image, comparison_output],
        )

    # Tab 2: static summary of the referenced paper.
    with gr.Tab("Paper Findings"):
        gr.Markdown(_FINDINGS_MARKDOWN)

if __name__ == "__main__":
    demo.launch()