File size: 23,810 Bytes
72805b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
"""

Task Bank for SQL Arena.



Contains 9 SQL challenges across 3 difficulty levels:

- basic_select (Easy): 3 tasks β€” simple SELECT/WHERE/ORDER BY

- join_aggregate (Medium): 3 tasks β€” JOINs, GROUP BY, HAVING

- complex_analysis (Hard): 3 tasks β€” CTEs, window functions, subqueries



Each task defines:

- Database schema and sample data (setup_sql)

- Natural language question

- Expected SQL solution

- Expected result for grading

- Progressive hints

"""

from dataclasses import dataclass, field
from typing import List, Dict, Optional
import random


@dataclass
class SQLTask:
    """A single SQL challenge problem."""
    task_id: str
    difficulty: str  # basic_select, join_aggregate, complex_analysis
    title: str
    setup_sql: str  # CREATE TABLE + INSERT statements
    question: str  # Natural language question
    expected_sql: str  # Reference solution
    expected_columns: List[str]  # Expected column names in result
    expected_row_count: int  # Expected number of result rows
    expected_rows: List[tuple]  # Expected result rows for grading
    hints: List[str] = field(default_factory=list)
    max_steps: int = 5
    schema_description: str = ""  # Human-readable schema description


# =============================================================
# DATABASE SCHEMAS
# =============================================================

# Schema 1: Employee database (used by Easy tasks)
EMPLOYEES_SCHEMA = """

CREATE TABLE employees (

    id INTEGER PRIMARY KEY,

    name TEXT NOT NULL,

    department TEXT NOT NULL,

    salary REAL NOT NULL,

    hire_date TEXT NOT NULL,

    is_active INTEGER DEFAULT 1

);



INSERT INTO employees VALUES (1, 'Alice Johnson', 'Engineering', 95000, '2020-01-15', 1);

INSERT INTO employees VALUES (2, 'Bob Smith', 'Marketing', 65000, '2019-06-01', 1);

INSERT INTO employees VALUES (3, 'Carol Williams', 'Engineering', 110000, '2018-03-20', 1);

INSERT INTO employees VALUES (4, 'David Brown', 'Sales', 72000, '2021-09-10', 1);

INSERT INTO employees VALUES (5, 'Eve Davis', 'Engineering', 88000, '2022-02-28', 1);

INSERT INTO employees VALUES (6, 'Frank Miller', 'Marketing', 58000, '2020-11-15', 0);

INSERT INTO employees VALUES (7, 'Grace Wilson', 'Sales', 81000, '2019-04-22', 1);

INSERT INTO employees VALUES (8, 'Henry Taylor', 'Engineering', 125000, '2017-08-01', 1);

INSERT INTO employees VALUES (9, 'Ivy Anderson', 'HR', 70000, '2021-01-10', 1);

INSERT INTO employees VALUES (10, 'Jack Thomas', 'HR', 75000, '2020-07-15', 1);

"""

EMPLOYEES_SCHEMA_DESC = """Table: employees

Columns:

  - id: INTEGER PRIMARY KEY (auto-increment identifier)

  - name: TEXT (employee full name, e.g. 'Alice Johnson')

  - department: TEXT (one of: Engineering, Marketing, Sales, HR)

  - salary: REAL (annual salary in USD, e.g. 95000.0)

  - hire_date: TEXT (date in YYYY-MM-DD format, e.g. '2020-01-15')

  - is_active: INTEGER (1 = currently active, 0 = inactive/left)



Data: 10 employees across 4 departments.

  - 4 in Engineering, 2 in Marketing (1 inactive), 2 in Sales, 2 in HR

  - Salaries range from 58,000 to 125,000

  - Hire dates range from 2017 to 2022

"""


# Schema 2: E-commerce database (used by Medium and Hard tasks)
ECOMMERCE_SCHEMA = """

CREATE TABLE customers (

    id INTEGER PRIMARY KEY,

    name TEXT NOT NULL,

    email TEXT NOT NULL,

    city TEXT NOT NULL,

    signup_date TEXT NOT NULL

);



CREATE TABLE products (

    id INTEGER PRIMARY KEY,

    name TEXT NOT NULL,

    category TEXT NOT NULL,

    price REAL NOT NULL,

    stock INTEGER NOT NULL

);



CREATE TABLE orders (

    id INTEGER PRIMARY KEY,

    customer_id INTEGER NOT NULL,

    order_date TEXT NOT NULL,

    status TEXT NOT NULL,

    FOREIGN KEY (customer_id) REFERENCES customers(id)

);



CREATE TABLE order_items (

    id INTEGER PRIMARY KEY,

    order_id INTEGER NOT NULL,

    product_id INTEGER NOT NULL,

    quantity INTEGER NOT NULL,

    unit_price REAL NOT NULL,

    FOREIGN KEY (order_id) REFERENCES orders(id),

    FOREIGN KEY (product_id) REFERENCES products(id)

);



-- Customers

INSERT INTO customers VALUES (1, 'Alice', 'alice@email.com', 'New York', '2023-01-15');

INSERT INTO customers VALUES (2, 'Bob', 'bob@email.com', 'Los Angeles', '2023-02-20');

INSERT INTO customers VALUES (3, 'Carol', 'carol@email.com', 'Chicago', '2023-03-10');

INSERT INTO customers VALUES (4, 'David', 'david@email.com', 'New York', '2023-04-05');

INSERT INTO customers VALUES (5, 'Eve', 'eve@email.com', 'Boston', '2023-05-12');



-- Products

INSERT INTO products VALUES (1, 'Laptop', 'Electronics', 999.99, 50);

INSERT INTO products VALUES (2, 'Headphones', 'Electronics', 149.99, 200);

INSERT INTO products VALUES (3, 'Python Book', 'Books', 39.99, 100);

INSERT INTO products VALUES (4, 'Desk Lamp', 'Home', 29.99, 150);

INSERT INTO products VALUES (5, 'Keyboard', 'Electronics', 79.99, 120);

INSERT INTO products VALUES (6, 'SQL Book', 'Books', 44.99, 80);



-- Orders (10 orders, various statuses)

INSERT INTO orders VALUES (1, 1, '2023-06-01', 'completed');

INSERT INTO orders VALUES (2, 1, '2023-07-15', 'completed');

INSERT INTO orders VALUES (3, 2, '2023-06-20', 'completed');

INSERT INTO orders VALUES (4, 3, '2023-08-01', 'completed');

INSERT INTO orders VALUES (5, 3, '2023-08-15', 'completed');

INSERT INTO orders VALUES (6, 3, '2023-09-01', 'completed');

INSERT INTO orders VALUES (7, 4, '2023-07-10', 'cancelled');

INSERT INTO orders VALUES (8, 5, '2023-09-20', 'completed');

INSERT INTO orders VALUES (9, 1, '2023-10-01', 'completed');

INSERT INTO orders VALUES (10, 2, '2023-10-15', 'pending');



-- Order Items (17 line items)

INSERT INTO order_items VALUES (1, 1, 1, 1, 999.99);

INSERT INTO order_items VALUES (2, 1, 2, 2, 149.99);

INSERT INTO order_items VALUES (3, 2, 3, 1, 39.99);

INSERT INTO order_items VALUES (4, 2, 5, 1, 79.99);

INSERT INTO order_items VALUES (5, 3, 1, 1, 999.99);

INSERT INTO order_items VALUES (6, 3, 4, 3, 29.99);

INSERT INTO order_items VALUES (7, 4, 2, 1, 149.99);

INSERT INTO order_items VALUES (8, 4, 6, 2, 44.99);

INSERT INTO order_items VALUES (9, 5, 3, 1, 39.99);

INSERT INTO order_items VALUES (10, 5, 5, 2, 79.99);

INSERT INTO order_items VALUES (11, 6, 1, 1, 999.99);

INSERT INTO order_items VALUES (12, 6, 2, 1, 149.99);

INSERT INTO order_items VALUES (13, 8, 6, 1, 44.99);

INSERT INTO order_items VALUES (14, 8, 4, 1, 29.99);

INSERT INTO order_items VALUES (15, 9, 2, 3, 149.99);

INSERT INTO order_items VALUES (16, 9, 3, 2, 39.99);

INSERT INTO order_items VALUES (17, 10, 1, 1, 999.99);

"""

ECOMMERCE_SCHEMA_DESC = """Tables:



1. customers (5 rows)

   - id: INTEGER PRIMARY KEY

   - name: TEXT (customer first name)

   - email: TEXT

   - city: TEXT (New York, Los Angeles, Chicago, Boston)

   - signup_date: TEXT (YYYY-MM-DD)



2. products (6 rows)

   - id: INTEGER PRIMARY KEY

   - name: TEXT (product name)

   - category: TEXT (Electronics, Books, Home)

   - price: REAL (unit price in USD)

   - stock: INTEGER (units in stock)



3. orders (10 rows)

   - id: INTEGER PRIMARY KEY

   - customer_id: INTEGER β†’ customers.id

   - order_date: TEXT (YYYY-MM-DD, range: 2023-06 to 2023-10)

   - status: TEXT (completed, cancelled, pending)



4. order_items (17 rows)

   - id: INTEGER PRIMARY KEY

   - order_id: INTEGER β†’ orders.id

   - product_id: INTEGER β†’ products.id

   - quantity: INTEGER

   - unit_price: REAL (price at time of order)



Relationships:

  orders.customer_id β†’ customers.id

  order_items.order_id β†’ orders.id

  order_items.product_id β†’ products.id

"""


# =============================================================
# EASY TASKS: basic_select (3 tasks)
# =============================================================

EASY_TASKS = [
    SQLTask(
        task_id="easy_001",
        difficulty="basic_select",
        title="High Salary Employees",
        setup_sql=EMPLOYEES_SCHEMA,
        question="Find the names and salaries of all ACTIVE employees who earn more than \$80,000. Order the results by salary from highest to lowest.",
        expected_sql="SELECT name, salary FROM employees WHERE is_active = 1 AND salary > 80000 ORDER BY salary DESC",
        expected_columns=["name", "salary"],
        expected_row_count=4,
        expected_rows=[
            ("Henry Taylor", 125000.0),
            ("Carol Williams", 110000.0),
            ("Alice Johnson", 95000.0),
            ("Eve Davis", 88000.0),
        ],
        hints=[
            "Use SELECT with specific column names, not SELECT *",
            "Use WHERE with AND to combine conditions: is_active = 1 AND salary > 80000",
            "Add ORDER BY salary DESC for descending order",
        ],
        schema_description=EMPLOYEES_SCHEMA_DESC,
        max_steps=5,
    ),

    SQLTask(
        task_id="easy_002",
        difficulty="basic_select",
        title="Department Employee Count",
        setup_sql=EMPLOYEES_SCHEMA,
        question="Count the number of ACTIVE employees in each department. Show the department name and the count. Order by count from highest to lowest.",
        expected_sql="SELECT department, COUNT(*) as employee_count FROM employees WHERE is_active = 1 GROUP BY department ORDER BY employee_count DESC",
        expected_columns=["department", "employee_count"],
        expected_row_count=4,
        expected_rows=[
            ("Engineering", 4),
            ("HR", 2),
            ("Sales", 2),
            ("Marketing", 1),
        ],
        hints=[
            "Use COUNT(*) to count rows in each group",
            "GROUP BY department groups rows by department",
            "Use an alias: COUNT(*) as employee_count",
        ],
        schema_description=EMPLOYEES_SCHEMA_DESC,
        max_steps=5,
    ),

    SQLTask(
        task_id="easy_003",
        difficulty="basic_select",
        title="Recent Hires",
        setup_sql=EMPLOYEES_SCHEMA,
        question="List the names and hire dates of employees hired on or after January 1, 2021. Order by hire date from earliest to latest.",
        expected_sql="SELECT name, hire_date FROM employees WHERE hire_date >= '2021-01-01' ORDER BY hire_date",
        expected_columns=["name", "hire_date"],
        expected_row_count=3,
        expected_rows=[
            ("Ivy Anderson", "2021-01-10"),
            ("David Brown", "2021-09-10"),
            ("Eve Davis", "2022-02-28"),
        ],
        hints=[
            "Dates in SQLite can be compared as strings when in YYYY-MM-DD format",
            "Use WHERE hire_date >= '2021-01-01'",
            "ORDER BY hire_date gives ascending order by default",
        ],
        schema_description=EMPLOYEES_SCHEMA_DESC,
        max_steps=5,
    ),
]


# =============================================================
# MEDIUM TASKS: join_aggregate (3 tasks)
# =============================================================

MEDIUM_TASKS = [
    SQLTask(
        task_id="medium_001",
        difficulty="join_aggregate",
        title="Customer Total Spending",
        setup_sql=ECOMMERCE_SCHEMA,
        question="Find the total amount spent by each customer on COMPLETED orders only. Show the customer name and their total spending. Only include customers who spent more than \$200. Order by total spending from highest to lowest.",
        expected_sql="""

            SELECT c.name, ROUND(SUM(oi.quantity * oi.unit_price), 2) as total_spent

            FROM customers c

            JOIN orders o ON c.id = o.customer_id

            JOIN order_items oi ON o.id = oi.order_id

            WHERE o.status = 'completed'

            GROUP BY c.id, c.name

            HAVING SUM(oi.quantity * oi.unit_price) > 200

            ORDER BY total_spent DESC

        """,
        expected_columns=["name", "total_spent"],
        expected_row_count=4,
        expected_rows=[
            ("Alice", 1919.91),
            ("Carol", 1464.94),
            ("Bob", 1089.96),
            ("Eve", 74.98),
        ],
        hints=[
            "You need to JOIN three tables: customers β†’ orders β†’ order_items",
            "Total per item = quantity * unit_price, then SUM for total per customer",
            "Filter completed orders with WHERE o.status = 'completed'",
            "Use HAVING (not WHERE) to filter after GROUP BY",
        ],
        schema_description=ECOMMERCE_SCHEMA_DESC,
        max_steps=7,
    ),

    SQLTask(
        task_id="medium_002",
        difficulty="join_aggregate",
        title="Category Revenue",
        setup_sql=ECOMMERCE_SCHEMA,
        question="Calculate the total revenue for each product category from COMPLETED orders. Show the category name and total revenue. Order by total revenue from highest to lowest.",
        expected_sql="""

            SELECT p.category, ROUND(SUM(oi.quantity * oi.unit_price), 2) as total_revenue

            FROM products p

            JOIN order_items oi ON p.id = oi.product_id

            JOIN orders o ON oi.order_id = o.id

            WHERE o.status = 'completed'

            GROUP BY p.category

            ORDER BY total_revenue DESC

        """,
        expected_columns=["category", "total_revenue"],
        expected_row_count=3,
        expected_rows=[
            ("Electronics", 4459.83),
            ("Books", 254.93),
            ("Home", 119.96),
        ],
        hints=[
            "JOIN products β†’ order_items β†’ orders",
            "Revenue per item = quantity * unit_price",
            "Filter only completed orders",
            "GROUP BY p.category to get per-category totals",
        ],
        schema_description=ECOMMERCE_SCHEMA_DESC,
        max_steps=7,
    ),

    SQLTask(
        task_id="medium_003",
        difficulty="join_aggregate",
        title="Customers with Multiple Orders",
        setup_sql=ECOMMERCE_SCHEMA,
        question="Find customers who have placed more than one COMPLETED order. Show the customer name and the number of completed orders they placed. Order by order count descending, then by name ascending.",
        expected_sql="""

            SELECT c.name, COUNT(o.id) as order_count

            FROM customers c

            JOIN orders o ON c.id = o.customer_id

            WHERE o.status = 'completed'

            GROUP BY c.id, c.name

            HAVING COUNT(o.id) > 1

            ORDER BY order_count DESC, c.name ASC

        """,
        expected_columns=["name", "order_count"],
        expected_row_count=2,
        expected_rows=[
            ("Alice", 3),
            ("Carol", 3),
        ],
        hints=[
            "JOIN customers with orders",
            "Filter for completed orders in WHERE clause",
            "GROUP BY customer, then HAVING COUNT > 1",
            "ORDER BY count DESC, then name ASC for ties",
        ],
        schema_description=ECOMMERCE_SCHEMA_DESC,
        max_steps=7,
    ),
]


# =============================================================
# HARD TASKS: complex_analysis (3 tasks)
# =============================================================

HARD_TASKS = [
    SQLTask(
        task_id="hard_001",
        difficulty="complex_analysis",
        title="Monthly Revenue with Growth Rate",
        setup_sql=ECOMMERCE_SCHEMA,
        question="Calculate monthly revenue from COMPLETED orders, and for each month show the month (YYYY-MM format), the total revenue, and the percentage change from the previous month. For the first month, the percentage change should be NULL. Round revenue to 2 decimal places and percentage to 2 decimal places. Order by month ascending.",
        expected_sql="""

            WITH monthly AS (

                SELECT 

                    strftime('%Y-%m', o.order_date) as month,

                    ROUND(SUM(oi.quantity * oi.unit_price), 2) as revenue

                FROM orders o

                JOIN order_items oi ON o.id = oi.order_id

                WHERE o.status = 'completed'

                GROUP BY strftime('%Y-%m', o.order_date)

            ),

            with_prev AS (

                SELECT 

                    month,

                    revenue,

                    LAG(revenue) OVER (ORDER BY month) as prev_revenue

                FROM monthly

            )

            SELECT 

                month,

                revenue,

                CASE 

                    WHEN prev_revenue IS NULL THEN NULL

                    ELSE ROUND(((revenue - prev_revenue) * 100.0 / prev_revenue), 2)

                END as pct_change

            FROM with_prev

            ORDER BY month

        """,
        expected_columns=["month", "revenue", "pct_change"],
        expected_row_count=5,
        expected_rows=[
            ("2023-06", 2289.93, None),
            ("2023-07", 119.98, -94.76),
            ("2023-08", 1429.93, 1091.81),
            ("2023-09", 1224.97, -14.34),
            ("2023-10", 529.95, -56.74),
        ],
        hints=[
            "Use a CTE (WITH clause) to first calculate monthly revenue",
            "strftime('%Y-%m', date) extracts year-month from a date string",
            "LAG(revenue) OVER (ORDER BY month) gets the previous month's revenue",
            "Percentage change = ((new - old) / old) * 100",
            "Use CASE WHEN prev IS NULL THEN NULL ELSE ... END for first month",
        ],
        schema_description=ECOMMERCE_SCHEMA_DESC,
        max_steps=10,
    ),

    SQLTask(
        task_id="hard_002",
        difficulty="complex_analysis",
        title="Top Product Per Category",
        setup_sql=ECOMMERCE_SCHEMA,
        question="For each product category, find the single best-selling product (by total quantity sold across COMPLETED orders). Show the category, product name, and total quantity sold. If there are ties, pick the one with the higher total revenue. Order by category name ascending.",
        expected_sql="""

            WITH product_sales AS (

                SELECT 

                    p.category,

                    p.name as product_name,

                    SUM(oi.quantity) as total_qty,

                    SUM(oi.quantity * oi.unit_price) as total_revenue,

                    ROW_NUMBER() OVER (

                        PARTITION BY p.category 

                        ORDER BY SUM(oi.quantity) DESC, SUM(oi.quantity * oi.unit_price) DESC

                    ) as rn

                FROM products p

                JOIN order_items oi ON p.id = oi.product_id

                JOIN orders o ON oi.order_id = o.id

                WHERE o.status = 'completed'

                GROUP BY p.category, p.name

            )

            SELECT category, product_name, total_qty

            FROM product_sales

            WHERE rn = 1

            ORDER BY category ASC

        """,
        expected_columns=["category", "product_name", "total_qty"],
        expected_row_count=3,
        expected_rows=[
            ("Books", "Python Book", 4),
            ("Electronics", "Headphones", 7),
            ("Home", "Desk Lamp", 4),
        ],
        hints=[
            "First calculate total quantity sold per product (SUM of quantity)",
            "Use ROW_NUMBER() OVER (PARTITION BY category ORDER BY qty DESC) to rank within category",
            "Filter WHERE rn = 1 to get only the top product per category",
            "A CTE makes this much cleaner than nested subqueries",
            "Don't forget to filter for completed orders only",
        ],
        schema_description=ECOMMERCE_SCHEMA_DESC,
        max_steps=10,
    ),

    SQLTask(
        task_id="hard_003",
        difficulty="complex_analysis",
        title="Customer Lifetime Value Analysis",
        setup_sql=ECOMMERCE_SCHEMA,
        question="For customers with at least 2 completed orders, calculate: their name, number of completed orders, total lifetime spending (rounded to 2 decimals), average order value (rounded to 2 decimals), and the number of days between their first and last completed order. Order by total spending descending.",
        expected_sql="""

            WITH customer_order_totals AS (

                SELECT 

                    c.id as customer_id,

                    c.name,

                    o.id as order_id,

                    o.order_date,

                    SUM(oi.quantity * oi.unit_price) as order_total

                FROM customers c

                JOIN orders o ON c.id = o.customer_id

                JOIN order_items oi ON o.id = oi.order_id

                WHERE o.status = 'completed'

                GROUP BY c.id, c.name, o.id, o.order_date

            )

            SELECT 

                name,

                COUNT(*) as num_orders,

                ROUND(SUM(order_total), 2) as total_spending,

                ROUND(AVG(order_total), 2) as avg_order_value,

                CAST(julianday(MAX(order_date)) - julianday(MIN(order_date)) AS INTEGER) as days_span

            FROM customer_order_totals

            GROUP BY customer_id, name

            HAVING COUNT(*) >= 2

            ORDER BY total_spending DESC

        """,
        expected_columns=["name", "num_orders", "total_spending", "avg_order_value", "days_span"],
        expected_row_count=2,
        expected_rows=[
            ("Alice", 3, 1919.91, 639.97, 122),
            ("Carol", 3, 1464.94, 488.31, 31),
        ],
        hints=[
            "Use a CTE to first calculate the total for each individual order",
            "In the CTE: JOIN customers β†’ orders β†’ order_items, GROUP BY order",
            "In the outer query: GROUP BY customer, HAVING COUNT >= 2",
            "julianday() converts date strings to Julian day numbers for arithmetic",
            "days_span = julianday(MAX(order_date)) - julianday(MIN(order_date))",
        ],
        schema_description=ECOMMERCE_SCHEMA_DESC,
        max_steps=10,
    ),
]


# =============================================================
# TASK REGISTRY β€” Maps task IDs and difficulty levels
# =============================================================

ALL_TASKS: Dict[str, List[SQLTask]] = {
    "basic_select": EASY_TASKS,
    "join_aggregate": MEDIUM_TASKS,
    "complex_analysis": HARD_TASKS,
}

# Build a flat lookup by task_id
TASK_BY_ID: Dict[str, SQLTask] = {}
for _tasks in ALL_TASKS.values():
    for _task in _tasks:
        TASK_BY_ID[_task.task_id] = _task


def get_task(difficulty: str, task_id: Optional[str] = None) -> SQLTask:
    """

    Get a task by difficulty level, optionally by specific ID.

    

    Args:

        difficulty: One of 'basic_select', 'join_aggregate', 'complex_analysis'

        task_id: Optional specific task ID (e.g., 'easy_001')

        

    Returns:

        SQLTask instance

        

    Raises:

        ValueError: If difficulty is unknown

    """
    # If specific task_id given, return it directly
    if task_id and task_id in TASK_BY_ID:
        return TASK_BY_ID[task_id]

    # Otherwise pick from the difficulty pool
    if difficulty not in ALL_TASKS:
        raise ValueError(
            f"Unknown difficulty: '{difficulty}'. "
            f"Choose from: {list(ALL_TASKS.keys())}"
        )

    tasks = ALL_TASKS[difficulty]
    return random.choice(tasks)


def list_tasks() -> Dict[str, List[str]]:
    """

    List all available tasks grouped by difficulty.

    

    Returns:

        Dict mapping difficulty name to list of task IDs

    """
    return {
        difficulty: [t.task_id for t in tasks]
        for difficulty, tasks in ALL_TASKS.items()
    }