File size: 9,120 Bytes
90fc756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b83c8ad
 
 
 
 
90fc756
 
 
 
 
 
 
b83c8ad
 
 
 
 
90fc756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b83c8ad
 
 
 
 
90fc756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b83c8ad
 
 
 
 
90fc756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b83c8ad
 
 
 
 
90fc756
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b83c8ad
 
 
 
 
90fc756
 
 
 
 
 
 
b83c8ad
 
 
 
 
90fc756
 
 
b83c8ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90fc756
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
[
  {
    "task_id": "medium_001",
    "difficulty": "medium",
    "query": "SELECT * FROM events ORDER BY created_at DESC;",
    "schema": {
      "events": {
        "id": "BIGINT PRIMARY KEY",
        "event_name": "VARCHAR(255)",
        "payload": "JSON",
        "created_at": "TIMESTAMP INDEX",
        "actor_id": "BIGINT",
        "metadata": "JSON"
      }
    },
    "context": "Show the most recent events on an admin dashboard.",
    "ground_truth_issues": [
      {
        "id": "medium_001_select_star",
        "category": "performance",
        "description": "SELECT * pulls a wide payload when the dashboard only needs a few columns.",
        "severity": 0.3,
        "fix": "SELECT id, event_name, created_at FROM events ORDER BY created_at DESC LIMIT 50;",
        "keywords": [
          "select *", "wide table", "projection", "performance", "star",
          "all columns", "unnecessary columns", "column selection",
          "over-fetching", "wildcard"
        ]
      },
      {
        "id": "medium_001_missing_limit",
        "category": "performance",
        "description": "The dashboard query is missing a LIMIT and can scan far more rows than necessary.",
        "severity": 0.3,
        "fix": "SELECT id, event_name, created_at FROM events ORDER BY created_at DESC LIMIT 50;",
        "keywords": [
          "limit", "unbounded query", "dashboard", "performance", "no limit",
          "missing limit", "unlimited rows", "pagination", "all rows",
          "full scan", "row count"
        ]
      }
    ],
    "max_steps": 5
  },
  {
    "task_id": "medium_002",
    "difficulty": "medium",
    "query": "SELECT c.id, c.name, (SELECT COUNT(*) FROM orders o WHERE o.customer_id = c.id) AS order_count FROM customers c;",
    "schema": {
      "customers": {
        "id": "INT PRIMARY KEY",
        "name": "VARCHAR(255)"
      },
      "orders": {
        "id": "INT PRIMARY KEY",
        "customer_id": "INT INDEX",
        "total": "DECIMAL(10,2)"
      }
    },
    "context": "Show each customer with the number of orders they have placed.",
    "ground_truth_issues": [
      {
        "id": "medium_002_correlated_subquery",
        "category": "performance",
        "description": "The correlated subquery re-counts orders per row and should be rewritten as a join with GROUP BY.",
        "severity": 0.6,
        "fix": "SELECT c.id, c.name, COUNT(o.id) AS order_count FROM customers c LEFT JOIN orders o ON o.customer_id = c.id GROUP BY c.id, c.name;",
        "keywords": [
          "correlated subquery", "group by", "join", "count", "performance",
          "subquery per row", "n+1", "rewrite", "left join", "aggregate",
          "scalar subquery", "dependent subquery"
        ]
      }
    ],
    "max_steps": 6
  },
  {
    "task_id": "medium_003",
    "difficulty": "medium",
    "query": "SELECT DISTINCT email FROM users WHERE email IS NOT NULL;",
    "schema": {
      "users": {
        "id": "INT PRIMARY KEY",
        "email": "VARCHAR(255) UNIQUE",
        "last_login_at": "TIMESTAMP NULL"
      }
    },
    "context": "Export non-null user emails for a CRM sync.",
    "ground_truth_issues": [
      {
        "id": "medium_003_redundant_distinct",
        "category": "performance",
        "description": "DISTINCT is redundant because users.email is already unique.",
        "severity": 0.45,
        "fix": "SELECT email FROM users WHERE email IS NOT NULL;",
        "keywords": [
          "distinct", "unique", "redundant", "email", "performance",
          "unnecessary distinct", "unique constraint", "already unique",
          "duplicate elimination", "deduplication", "wasted sort"
        ]
      }
    ],
    "max_steps": 5
  },
  {
    "task_id": "medium_004",
    "difficulty": "medium",
    "query": "SELECT o.id, o.total, u.name FROM orders o JOIN users u ON u.id = o.user_id WHERE DATE(o.created_at) = '2026-04-10';",
    "schema": {
      "orders": {
        "id": "INT PRIMARY KEY",
        "user_id": "INT INDEX",
        "created_at": "TIMESTAMP INDEX",
        "total": "DECIMAL(10,2)"
      },
      "users": {
        "id": "INT PRIMARY KEY",
        "name": "VARCHAR(255)"
      }
    },
    "context": "List orders placed on a specific date with the user name attached.",
    "ground_truth_issues": [
      {
        "id": "medium_004_function_on_indexed_column",
        "category": "performance",
        "description": "Wrapping created_at with DATE() prevents efficient use of the created_at index.",
        "severity": 0.6,
        "fix": "SELECT o.id, o.total, u.name FROM orders o JOIN users u ON u.id = o.user_id WHERE o.created_at >= '2026-04-10' AND o.created_at < '2026-04-11';",
        "keywords": [
          "date()", "function on column", "index", "range predicate", "performance",
          "sargable", "non-sargable", "prevents index", "full scan",
          "index usage", "function wrapping"
        ]
      }
    ],
    "max_steps": 6
  },
  {
    "task_id": "medium_005",
    "difficulty": "medium",
    "query": "SELECT id, name FROM products WHERE LOWER(name) LIKE '%pro%';",
    "schema": {
      "products": {
        "id": "INT PRIMARY KEY",
        "name": "VARCHAR(255) INDEX",
        "category_id": "INT",
        "price": "DECIMAL(10,2)"
      }
    },
    "context": "Search products whose names contain the text pro.",
    "ground_truth_issues": [
      {
        "id": "medium_005_lower_blocks_index",
        "category": "performance",
        "description": "Applying LOWER(name) on every row prevents the index on name from being used efficiently.",
        "severity": 0.35,
        "fix": "SELECT id, name FROM products WHERE name ILIKE 'pro%';",
        "keywords": [
          "lower", "function on column", "index", "performance", "sargable",
          "non-sargable", "case insensitive", "full scan", "table scan",
          "function wrapping column"
        ]
      },
      {
        "id": "medium_005_leading_wildcard",
        "category": "performance",
        "description": "The leading wildcard in LIKE '%pro%' forces a full scan instead of an index-friendly prefix lookup.",
        "severity": 0.35,
        "fix": "SELECT id, name FROM products WHERE name ILIKE 'pro%';",
        "keywords": [
          "leading wildcard", "%pro%", "full scan", "prefix lookup", "performance",
          "like wildcard", "pattern matching", "index unusable", "table scan",
          "wildcard prefix"
        ]
      }
    ],
    "max_steps": 6
  },
  {
    "task_id": "medium_006",
    "difficulty": "medium",
    "query": "SELECT * FROM events WHERE DATE(created_at) = '2024-01-15';",
    "schema": {
      "events": {
        "id": "INT PRIMARY KEY",
        "name": "VARCHAR(255)",
        "created_at": "TIMESTAMP",
        "INDEX": "idx_created_at ON events(created_at)"
      }
    },
    "context": "Find all events that happened on a specific date.",
    "ground_truth_issues": [
      {
        "id": "medium_006_function_on_index",
        "category": "performance",
        "description": "Using the DATE() function on an indexed column prevents index usage. Use a range comparison instead.",
        "severity": 0.7,
        "fix": "SELECT * FROM events WHERE created_at >= '2024-01-15 00:00:00' AND created_at < '2024-01-16 00:00:00';",
        "keywords": [
          "function on column", "date function", "index", "sargable",
          "non-sargable", "prevents index", "range comparison", "full scan",
          "table scan", "index usage", "function wrapping column"
        ]
      },
      {
        "id": "medium_006_star",
        "category": "performance",
        "description": "SELECT * returns all columns when only specific fields may be needed.",
        "severity": 0.2,
        "fix": "SELECT id, name, created_at FROM events WHERE created_at >= '2024-01-15' AND created_at < '2024-01-16';",
        "keywords": [
          "select *", "star", "all columns", "projection", "unnecessary columns",
          "wildcard", "over-fetching", "column selection"
        ]
      }
    ],
    "max_steps": 6
  },
  {
    "task_id": "medium_007",
    "difficulty": "medium",
    "query": "SELECT * FROM products ORDER BY RAND() LIMIT 10;",
    "schema": {
      "products": {
        "id": "INT PRIMARY KEY",
        "name": "VARCHAR(255)",
        "price": "DECIMAL(10,2)",
        "category": "VARCHAR(64)"
      }
    },
    "context": "Show 10 random products on the homepage.",
    "ground_truth_issues": [
      {
        "id": "medium_007_order_rand",
        "category": "performance",
        "description": "ORDER BY RAND() generates a random value for every row in the table, causing a full table scan and sort. This is extremely slow on large tables.",
        "severity": 0.8,
        "fix": "SELECT * FROM products WHERE id >= (SELECT FLOOR(RAND() * (SELECT MAX(id) FROM products))) LIMIT 10;",
        "keywords": [
          "order by rand", "random", "full table scan", "sort", "performance",
          "slow", "every row", "random ordering", "rand function",
          "expensive sort", "large table"
        ]
      }
    ],
    "max_steps": 5
  }
]