File size: 61,759 Bytes
4c1a85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd11aba
4c1a85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c3002ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c1a85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
081eb22
 
 
 
 
 
 
 
 
 
 
4c1a85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd11aba
4c1a85d
 
 
cd11aba
4c1a85d
cd11aba
4c1a85d
 
 
cd11aba
4c1a85d
cd11aba
4c1a85d
 
 
cd11aba
4c1a85d
b08652c
 
 
 
 
 
 
 
5d90461
 
 
 
 
 
f1b7439
 
 
 
 
5d90461
 
4c1a85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
081eb22
 
 
 
 
 
 
 
 
 
 
4c1a85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd11aba
4c1a85d
 
 
cd11aba
4c1a85d
cd11aba
4c1a85d
 
 
cd11aba
4c1a85d
b08652c
 
 
 
 
 
4c1a85d
b08652c
 
 
 
 
 
4c1a85d
cd11aba
4c1a85d
 
 
cd11aba
4c1a85d
cd11aba
4c1a85d
 
 
cd11aba
4c1a85d
b08652c
5d90461
b08652c
 
 
 
 
 
 
 
 
 
 
 
5d90461
4c1a85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
081eb22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c1a85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd11aba
4c1a85d
 
 
cd11aba
 
4c1a85d
cd11aba
4c1a85d
 
 
cd11aba
4c1a85d
cd11aba
4c1a85d
 
 
cd11aba
 
4c1a85d
cd11aba
4c1a85d
 
 
cd11aba
 
4c1a85d
cd11aba
4c1a85d
 
 
cd11aba
 
4c1a85d
b08652c
4c1a85d
 
 
b08652c
 
4c1a85d
a1f98bf
4c1a85d
a1f98bf
4c1a85d
a1f98bf
b08652c
4c1a85d
b08652c
4c1a85d
b08652c
 
 
 
4c1a85d
c3002ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c1a85d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5cb467d
 
 
 
7479de3
4051320
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7479de3
 
4051320
 
 
7479de3
 
 
 
5cb467d
 
 
7479de3
5cb467d
4051320
5cb467d
4051320
 
 
 
 
 
 
5cb467d
 
 
4051320
 
 
 
 
 
 
 
5cb467d
 
 
 
 
 
96d698c
 
4051320
96d698c
 
 
5cb467d
96d698c
 
5cb467d
96d698c
 
4051320
96d698c
 
5cb467d
96d698c
 
5cb467d
4051320
 
 
 
a9620ef
4051320
 
5cb467d
4051320
 
 
a9620ef
4051320
 
5cb467d
4051320
 
 
 
 
5cb467d
 
96d698c
 
 
4051320
96d698c
 
5cb467d
4051320
 
 
a9620ef
4051320
a9620ef
5cb467d
4051320
 
 
5cb467d
4051320
5cb467d
 
4051320
 
 
a9620ef
4051320
5cb467d
 
c699b6f
 
 
 
 
 
 
96d698c
c699b6f
 
4051320
 
 
5cb467d
4051320
 
5cb467d
 
4051320
 
 
5cb467d
4051320
5cb467d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd11aba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c1a85d
 
 
 
b99e42b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fcce834
b99e42b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8560706
 
 
 
b99e42b
 
 
 
 
 
 
 
 
 
8560706
 
b99e42b
 
 
 
fcce834
 
 
 
 
b99e42b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c1a85d
 
 
 
5cb467d
b99e42b
4c1a85d
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
"""
Task definitions for the DataQA environment.

Each task provides:
- A clean dataset (CSV)
- A schema + validation rules
- A set of planted issues (ground truth)
- A function to inject those issues into the clean data
"""

from __future__ import annotations

import csv
import io
import random
from dataclasses import dataclass, field
from typing import List, Set


@dataclass
class PlantedIssue:
    """A single planted data quality issue."""

    row: int
    col: str
    issue_type: str
    description: str
    difficulty: float = 1.0  # 1.0=easy, 2.0=medium, 3.0=hard (for weighted reward)

    def to_key(self) -> str:
        return f"row:{self.row},col:{self.col},issue:{self.issue_type}"


@dataclass
class Task:
    task_id: str
    name: str
    description: str
    schema_description: str
    validation_rules: str
    clean_csv: str
    planted_issues: List[PlantedIssue] = field(default_factory=list)
    corrupted_csv: str = ""
    max_steps: int = 3

    def get_clean_value(self, row: int, col: str) -> str | None:
        """
        Look up the original clean value for a given (row, col).
        Row is 1-indexed (data row after header).
        Returns None if row/col is out of bounds or column not found.
        """
        rows = _csv_to_rows(self.clean_csv)
        if len(rows) < 2:
            return None
        header = [h.strip().lower() for h in rows[0]]
        if col.lower() not in header:
            return None
        col_idx = header.index(col.lower())
        data_row_idx = row  # row is 1-indexed, rows[0] is header, so rows[row] is the data row
        if data_row_idx < 1 or data_row_idx >= len(rows):
            return None
        return rows[data_row_idx][col_idx].strip()

    def get_planted_issue_map(self) -> dict:
        """Return dict mapping issue key -> PlantedIssue for quick lookups."""
        return {issue.to_key(): issue for issue in self.planted_issues}


def _csv_to_rows(csv_text: str) -> List[List[str]]:
    reader = csv.reader(io.StringIO(csv_text.strip()))
    return [row for row in reader]


def _rows_to_csv(rows: List[List[str]]) -> str:
    output = io.StringIO()
    writer = csv.writer(output)
    writer.writerows(rows)
    return output.getvalue()


# ---------------------------------------------------------------------------
# TASK 1: Easy β€” Employee directory with obvious issues
# ---------------------------------------------------------------------------

def create_task_easy(seed: int = 42) -> Task:
    rng = random.Random(seed)

    clean_csv = """employee_id,name,email,department,salary,start_date
101,Alice Chen,alice.chen@company.com,Engineering,95000,2022-03-15
102,Bob Martinez,bob.martinez@company.com,Marketing,72000,2021-07-01
103,Carol Davis,carol.davis@company.com,Engineering,98000,2020-11-20
104,David Kim,david.kim@company.com,Sales,68000,2023-01-10
105,Eve Johnson,eve.johnson@company.com,HR,71000,2022-06-05
106,Frank Wilson,frank.wilson@company.com,Engineering,102000,2019-08-12
107,Grace Lee,grace.lee@company.com,Marketing,75000,2021-12-01
108,Hank Brown,hank.brown@company.com,Sales,65000,2023-04-18
109,Iris Patel,iris.patel@company.com,HR,73000,2020-02-28
110,Jack Taylor,jack.taylor@company.com,Engineering,97000,2022-09-14
111,Kevin Zhang,kevin.zhang@company.com,Engineering,91000,2021-05-22
112,Laura Adams,laura.adams@company.com,Sales,69000,2022-11-03
113,Mike Torres,mike.torres@company.com,Marketing,74000,2020-08-17
114,Nina Sharma,nina.sharma@company.com,HR,76000,2019-04-30
115,Oscar Rivera,oscar.rivera@company.com,Engineering,105000,2018-12-10
116,Paula Green,paula.green@company.com,Sales,67000,2023-06-25
117,Quinn Murphy,quinn.murphy@company.com,Marketing,78000,2021-03-08
118,Rosa Diaz,rosa.diaz@company.com,Engineering,99000,2022-01-19
119,Sam Cooper,sam.cooper@company.com,HR,70000,2020-10-05
120,Tara Singh,tara.singh@company.com,Sales,66000,2023-02-14"""

    schema_desc = """Columns:
- employee_id: integer, unique, range 100-999
- name: string, non-empty, format "FirstName LastName"
- email: string, valid email format, must match pattern firstname.lastname@company.com
- department: string, one of [Engineering, Marketing, Sales, HR]
- salary: integer, range 50000-150000
- start_date: string, format YYYY-MM-DD, must be between 2015-01-01 and 2025-12-31"""

    rules = """1. No missing values in any column
2. employee_id must be unique
3. email must follow the pattern: lowercase(firstname).lowercase(lastname)@company.com
4. salary must be within the valid range
5. No duplicate rows"""

    rows = _csv_to_rows(clean_csv)
    header = rows[0]
    data = rows[1:]
    issues: List[PlantedIssue] = []

    # Issue 1: Missing value - null out a name (easy to spot)
    r = 3  # row index in data (0-based), displayed as row 4 in CSV
    data[r][1] = ""
    issues.append(PlantedIssue(row=r + 1, col="name", issue_type="missing_value",
                               description="Empty name field", difficulty=1.0))

    # Issue 2: Wrong type - salary as text (easy to spot)
    r = 6
    data[r][4] = "seventy-five thousand"
    issues.append(PlantedIssue(row=r + 1, col="salary", issue_type="wrong_type",
                               description="Salary is text instead of integer", difficulty=1.0))

    # Issue 3: Duplicate row (moderate β€” requires cross-row comparison)
    dup_source = 1
    data.append(list(data[dup_source]))
    issues.append(PlantedIssue(row=len(data), col="employee_id", issue_type="duplicate_row",
                               description=f"Exact duplicate of row {dup_source + 1}", difficulty=1.5))

    # Issue 4: Department is not in allowed set (deterministic: "Engneering" is not valid, closest match = "Engineering")
    r = 10  # Kevin Zhang, department is Engineering
    data[r][3] = "Engneering"
    issues.append(PlantedIssue(row=r + 1, col="department", issue_type="format_violation",
                               description="Department 'Engneering' is misspelled β€” should be 'Engineering'",
                               difficulty=1.0))

    # Issue 5: Email doesn't match name pattern (deterministic fix: derive from name)
    r = 14  # Oscar Rivera -> email should be oscar.rivera@company.com
    data[r][2] = "john.doe@company.com"
    issues.append(PlantedIssue(row=r + 1, col="email", issue_type="inconsistent_value",
                               description="Email john.doe@company.com doesn't match name Oscar Rivera",
                               difficulty=1.5))

    # Issue 6: Date in wrong format (deterministic fix: "03-15-2022" β†’ "2022-03-15")
    r = 11  # Laura Adams, start_date should be 2022-11-03
    data[r][5] = "11-03-2022"  # MM-DD-YYYY instead of YYYY-MM-DD
    issues.append(PlantedIssue(row=r + 1, col="start_date", issue_type="format_violation",
                               description="Start date '11-03-2022' is in MM-DD-YYYY format instead of required YYYY-MM-DD (should be 2022-11-03)",
                               difficulty=1.5))

    corrupted = _rows_to_csv([header] + data)

    return Task(
        task_id="easy",
        name="Employee Directory Validation",
        description=(
            "You are given an employee directory dataset. "
            "Find all data quality issues based on the schema and validation rules. "
            "Report each issue in the format: row:<row_number>,col:<column_name>,issue:<issue_type>"
        ),
        schema_description=schema_desc,
        validation_rules=rules,
        clean_csv=clean_csv,
        planted_issues=issues,
        corrupted_csv=corrupted,
        max_steps=3,
    )


# ---------------------------------------------------------------------------
# TASK 2: Medium β€” E-commerce orders with moderate issues
# ---------------------------------------------------------------------------

def create_task_medium(seed: int = 42) -> Task:
    rng = random.Random(seed)

    clean_csv = """order_id,customer_id,product_name,category,quantity,unit_price,order_date,shipping_country,status,total
ORD-001,CUST-100,Wireless Mouse,Electronics,2,29.99,2024-01-15,US,delivered,59.98
ORD-002,CUST-101,Python Cookbook,Books,1,45.50,2024-01-16,UK,delivered,45.50
ORD-003,CUST-102,USB-C Hub,Electronics,1,35.00,2024-01-17,US,shipped,35.00
ORD-004,CUST-103,Yoga Mat,Sports,1,25.99,2024-01-18,CA,delivered,25.99
ORD-005,CUST-104,Desk Lamp,Home,1,42.00,2024-01-19,US,processing,42.00
ORD-006,CUST-105,Running Shoes,Sports,1,89.99,2024-01-20,DE,delivered,89.99
ORD-007,CUST-106,Mechanical Keyboard,Electronics,1,129.99,2024-01-21,US,shipped,129.99
ORD-008,CUST-100,Monitor Stand,Home,1,55.00,2024-01-22,US,delivered,55.00
ORD-009,CUST-107,Data Science Handbook,Books,2,39.99,2024-01-23,UK,delivered,79.98
ORD-010,CUST-108,Resistance Bands,Sports,3,12.99,2024-01-24,CA,shipped,38.97
ORD-011,CUST-109,Webcam HD,Electronics,1,65.00,2024-01-25,US,delivered,65.00
ORD-012,CUST-110,Standing Desk,Home,1,299.99,2024-01-26,US,processing,299.99
ORD-013,CUST-111,Tennis Racket,Sports,1,75.00,2024-01-27,AU,delivered,75.00
ORD-014,CUST-112,LED Strip Lights,Home,2,18.50,2024-01-28,US,shipped,37.00
ORD-015,CUST-113,AI Textbook,Books,1,59.99,2024-01-29,DE,delivered,59.99
ORD-016,CUST-114,Bluetooth Speaker,Electronics,1,49.99,2024-01-30,UK,delivered,49.99
ORD-017,CUST-115,Jump Rope,Sports,2,8.99,2024-01-31,US,shipped,17.98
ORD-018,CUST-116,Coffee Table Book,Books,1,32.00,2024-02-01,CA,delivered,32.00
ORD-019,CUST-117,Ergonomic Chair,Home,1,450.00,2024-02-02,US,processing,450.00
ORD-020,CUST-118,Fitness Tracker,Electronics,1,79.99,2024-02-03,AU,delivered,79.99
ORD-021,CUST-119,Laptop Sleeve,Electronics,1,24.99,2024-02-04,US,delivered,24.99
ORD-022,CUST-120,Hiking Backpack,Sports,1,65.00,2024-02-05,CA,shipped,65.00
ORD-023,CUST-121,Machine Learning Book,Books,1,54.99,2024-02-06,UK,delivered,54.99
ORD-024,CUST-122,Plant Pot Set,Home,3,15.00,2024-02-07,US,delivered,45.00
ORD-025,CUST-123,Noise Cancelling Headphones,Electronics,1,199.99,2024-02-08,DE,shipped,199.99
ORD-026,CUST-124,Basketball,Sports,1,29.99,2024-02-09,US,delivered,29.99
ORD-027,CUST-125,Cookbook Collection,Books,2,22.50,2024-02-10,AU,delivered,45.00
ORD-028,CUST-126,Smart Plug,Home,4,12.99,2024-02-11,US,processing,51.96
ORD-029,CUST-127,Wireless Charger,Electronics,1,34.99,2024-02-12,UK,delivered,34.99
ORD-030,CUST-128,Dumbbells Set,Sports,1,89.00,2024-02-13,US,shipped,89.00"""

    schema_desc = """Columns:
- order_id: string, unique, format ORD-NNN
- customer_id: string, format CUST-NNN
- product_name: string, non-empty
- category: string, one of [Electronics, Books, Sports, Home]
- quantity: integer, range 1-100
- unit_price: float, range 0.01-10000.00
- order_date: string, format YYYY-MM-DD
- shipping_country: string, ISO 2-letter country code
- status: string, one of [processing, shipped, delivered, cancelled, returned]
- total: float, must equal quantity * unit_price"""

    rules = """1. No missing values in any column
2. order_id must be unique
3. total must equal quantity * unit_price (tolerance: 0.01)
4. order_date must be in valid chronological order for sequential order_ids
5. category must be from the allowed set
6. All monetary values must have at most 2 decimal places
7. shipping_country must be a valid ISO 2-letter code"""

    rows = _csv_to_rows(clean_csv)
    header = rows[0]
    data = rows[1:]
    issues: List[PlantedIssue] = []

    # Issue 1: total doesn't match quantity * unit_price (requires cross-column check)
    r = 4  # ORD-005
    data[r][9] = "84.00"  # should be 42.00 (qty=1, price=42.00)
    issues.append(PlantedIssue(row=r + 1, col="total", issue_type="inconsistent_value",
                               description="total (84.00) != quantity (1) * unit_price (42.00)", difficulty=2.0))

    # Issue 2: Invalid category (requires knowing the allowed set)
    r = 9  # ORD-010
    data[r][3] = "Fitness"  # should be Sports
    issues.append(PlantedIssue(row=r + 1, col="category", issue_type="format_violation",
                               description="'Fitness' is not in allowed categories", difficulty=1.5))

    # Issue 3: Product name misspelling (deterministic fix: "Wireles Charger" β†’ "Wireless Charger")
    r = 28  # ORD-029
    data[r][2] = "Wireles Charger"
    issues.append(PlantedIssue(row=r + 1, col="product_name", issue_type="format_violation",
                               description="Product name 'Wireles Charger' is misspelled β€” should be 'Wireless Charger'",
                               difficulty=1.0))

    # Issue 4: Quantity is letter O instead of zero β€” OCR/encoding error (deterministic: "1O" β†’ "10")
    r = 9  # ORD-010
    data[r][4] = "1O"  # letter O not digit 0
    issues.append(PlantedIssue(row=r + 1, col="quantity", issue_type="wrong_type",
                               description="Quantity '1O' contains letter O instead of digit 0 β€” should be '10'",
                               difficulty=1.5))

    # Issue 5: Duplicate order_id (requires cross-row comparison)
    r = 18  # ORD-019
    data[r][0] = "ORD-003"
    issues.append(PlantedIssue(row=r + 1, col="order_id", issue_type="duplicate_row",
                               description="Duplicate order_id ORD-003", difficulty=1.5))

    # Issue 6: Wrong date format (moderate β€” format mismatch)
    r = 11  # ORD-012
    data[r][6] = "26/01/2024"
    issues.append(PlantedIssue(row=r + 1, col="order_date", issue_type="format_violation",
                               description="Date format DD/MM/YYYY instead of YYYY-MM-DD", difficulty=1.5))

    # Issue 7: Status misspelling (deterministic fix: "deliverred" β†’ "delivered")
    r = 23  # ORD-024
    data[r][8] = "deliverred"
    issues.append(PlantedIssue(row=r + 1, col="status", issue_type="format_violation",
                               description="Status 'deliverred' is misspelled β€” should be 'delivered'",
                               difficulty=1.0))

    # Issue 8: Unit price has 3 decimal places (deterministic fix: "34.999" β†’ "34.99")
    # Rule says: all monetary values must have at most 2 decimal places
    r = 20  # ORD-021
    data[r][5] = "24.999"
    issues.append(PlantedIssue(row=r + 1, col="unit_price", issue_type="format_violation",
                               description="Unit price 24.999 has 3 decimal places β€” rule requires at most 2 (should be 24.99 or 25.00)",
                               difficulty=1.5))

    corrupted = _rows_to_csv([header] + data)

    return Task(
        task_id="medium",
        name="E-commerce Orders Validation",
        description=(
            "You are given an e-commerce orders dataset. "
            "Find all data quality issues based on the schema and validation rules. "
            "Report each issue in the format: row:<row_number>,col:<column_name>,issue:<issue_type>"
        ),
        schema_description=schema_desc,
        validation_rules=rules,
        clean_csv=clean_csv,
        planted_issues=issues,
        corrupted_csv=corrupted,
        max_steps=3,
    )


# ---------------------------------------------------------------------------
# TASK 3: Hard β€” ML training metadata with subtle issues
# ---------------------------------------------------------------------------

def create_task_hard(seed: int = 42) -> Task:
    rng = random.Random(seed)

    clean_csv = """experiment_id,model_name,dataset,train_size,val_size,test_size,learning_rate,batch_size,epochs,train_loss,val_loss,test_accuracy,gpu_memory_gb,training_time_hours,timestamp
EXP-001,resnet50,imagenet-1k,1281167,50000,100000,0.001,256,90,0.85,1.12,76.3,12.4,48.5,2024-03-01T10:00:00
EXP-002,bert-base,squad-v2,130319,11873,8862,0.00003,32,3,0.45,0.52,81.2,7.8,2.1,2024-03-02T14:30:00
EXP-003,gpt2-small,openwebtext,8013769,100000,100000,0.0003,64,1,3.12,3.28,0.0,14.2,72.0,2024-03-03T09:15:00
EXP-004,vit-base,imagenet-1k,1281167,50000,100000,0.001,512,300,0.72,0.98,79.8,15.6,96.0,2024-03-05T08:00:00
EXP-005,distilbert,mnli,392702,9815,9796,0.00005,16,5,0.28,0.35,84.6,5.2,1.5,2024-03-06T11:00:00
EXP-006,llama2-7b,alpaca-52k,51760,500,500,0.00002,4,3,1.05,1.18,0.0,38.5,8.2,2024-03-07T16:00:00
EXP-007,resnet18,cifar10,50000,5000,10000,0.01,128,200,0.15,0.28,93.5,3.2,1.8,2024-03-08T10:30:00
EXP-008,t5-small,cnn-dailymail,287113,13368,11490,0.0001,16,10,1.45,1.62,0.0,6.8,4.5,2024-03-09T13:00:00
EXP-009,efficientnet-b0,imagenet-1k,1281167,50000,100000,0.005,256,350,0.68,0.89,77.1,8.4,36.0,2024-03-10T07:45:00
EXP-010,roberta-large,sst2,67349,872,1821,0.00001,8,10,0.08,0.12,95.1,14.8,3.2,2024-03-11T15:00:00
EXP-011,yolov5-m,coco-2017,118287,5000,40670,0.01,32,300,0.032,0.045,0.0,10.2,24.0,2024-03-12T09:00:00
EXP-012,wav2vec2,librispeech,281241,5567,2620,0.0001,8,20,0.92,1.05,0.0,12.6,15.0,2024-03-13T11:30:00
EXP-013,clip-base,cc3m,2818102,15000,15000,0.00001,256,32,2.15,2.38,0.0,22.4,48.0,2024-03-14T08:00:00
EXP-014,detr,coco-2017,118287,5000,40670,0.0001,4,500,1.85,2.12,0.0,16.0,72.0,2024-03-15T10:00:00
EXP-015,whisper-small,common-voice,520000,16000,16000,0.00005,16,5,0.55,0.68,0.0,7.4,6.5,2024-03-16T14:00:00
EXP-016,mobilenet-v3,imagenet-1k,1281167,50000,100000,0.004,128,150,0.92,1.05,72.8,4.1,18.0,2024-03-17T08:30:00
EXP-017,albert-base,mnli,392702,9815,9796,0.00002,32,5,0.32,0.41,83.1,6.2,1.8,2024-03-18T11:00:00
EXP-018,gpt-neo-1.3b,pile-subset,1500000,50000,50000,0.0002,8,2,2.85,2.98,0.0,18.5,36.0,2024-03-19T14:00:00
EXP-019,swin-tiny,imagenet-1k,1281167,50000,100000,0.001,256,300,0.78,0.95,78.2,8.6,42.0,2024-03-20T09:00:00
EXP-020,deberta-large,squad-v2,130319,11873,8862,0.00001,16,5,0.35,0.42,85.7,15.2,4.5,2024-03-21T10:30:00
EXP-021,yolov8-s,coco-2017,118287,5000,40670,0.01,64,200,0.028,0.038,0.0,6.8,16.0,2024-03-22T13:00:00
EXP-022,bart-base,xsum,204045,11332,11334,0.0001,32,10,1.22,1.38,0.0,8.4,6.2,2024-03-23T15:30:00
EXP-023,convnext-tiny,imagenet-1k,1281167,50000,100000,0.002,256,300,0.74,0.92,79.5,7.2,38.0,2024-03-24T08:00:00
EXP-024,xlm-roberta,xnli,392702,2490,5010,0.00002,16,10,0.41,0.48,82.3,12.4,5.8,2024-03-25T11:00:00
EXP-025,stable-diffusion,laion-400m,400000000,10000,10000,0.0001,4,1,0.45,0.52,0.0,24.0,168.0,2024-03-26T09:00:00
EXP-026,phi-2,dolly-15k,15011,500,500,0.00005,8,3,0.82,0.95,0.0,10.2,2.5,2024-03-27T14:00:00
EXP-027,dino-v2,imagenet-1k,1281167,50000,100000,0.0005,64,100,0.42,0.58,0.0,11.8,28.0,2024-03-28T10:00:00
EXP-028,electra-small,glue-mrpc,3668,408,1725,0.0001,32,10,0.38,0.44,87.2,3.8,0.8,2024-03-29T16:00:00
EXP-029,sam-base,sa-1b,11000000,50000,50000,0.0001,4,1,0.95,1.08,0.0,16.4,96.0,2024-03-30T08:00:00
EXP-030,llama2-13b,oasst1,84437,4401,4401,0.00001,2,3,0.78,0.88,0.0,52.0,12.0,2024-03-31T12:00:00"""

    schema_desc = """Columns:
- experiment_id: string, unique, format EXP-NNN
- model_name: string, non-empty
- dataset: string, non-empty
- train_size: integer, positive, must be > val_size and > test_size
- val_size: integer, positive
- test_size: integer, positive
- learning_rate: float, range 1e-7 to 1.0
- batch_size: integer, must be power of 2, range 1-1024
- epochs: integer, positive, range 1-1000
- train_loss: float, non-negative
- val_loss: float, non-negative, typically >= train_loss (if not, may indicate data leakage)
- test_accuracy: float, range 0-100 (percentage), 0.0 is valid for generative models
- gpu_memory_gb: float, positive
- training_time_hours: float, positive
- timestamp: string, ISO 8601 format, chronological order by experiment_id"""

    rules = """1. No missing values
2. experiment_id must be unique
3. val_loss should be >= train_loss (if val_loss < train_loss significantly, flag as potential data leakage)
4. batch_size must be a power of 2
5. train_size must be larger than both val_size and test_size
6. learning_rate must be within valid range
7. gpu_memory_gb should be reasonable for the model size (e.g., resnet18 shouldn't need 40GB)
8. training_time should be proportional to dataset size and epochs (flag major inconsistencies)
9. timestamps must be in chronological order"""

    rows = _csv_to_rows(clean_csv)
    header = rows[0]
    data = rows[1:]
    issues: List[PlantedIssue] = []

    # Issue 1: Data leakage signal β€” val_loss much lower than train_loss (hard β€” requires ML knowledge)
    r = 4  # EXP-005
    data[r][10] = "0.15"  # val_loss=0.15 but train_loss=0.28 β†’ suspicious
    issues.append(PlantedIssue(row=r + 1, col="val_loss", issue_type="inconsistent_value",
                               description="val_loss (0.15) significantly less than train_loss (0.28), potential data leakage",
                               difficulty=3.0))

    # Issue 2: Batch size not power of 2 (moderate β€” domain convention)
    r = 8  # EXP-009
    data[r][7] = "250"  # not a power of 2
    issues.append(PlantedIssue(row=r + 1, col="batch_size", issue_type="format_violation",
                               description="batch_size 250 is not a power of 2", difficulty=2.0))

    # Issue 3: GPU memory unreasonable for model (hard β€” requires model size reasoning)
    r = 6  # EXP-007 resnet18 on cifar10
    data[r][12] = "42.5"  # resnet18 shouldn't need 42.5 GB
    issues.append(PlantedIssue(row=r + 1, col="gpu_memory_gb", issue_type="statistical_outlier",
                               description="resnet18 on cifar10 using 42.5 GB GPU memory is unreasonable",
                               difficulty=3.0))

    # Issue 4: Timestamp out of order (moderate β€” requires sequential comparison)
    r = 10  # EXP-011
    data[r][14] = "2024-03-02T09:00:00"  # should be after EXP-010's timestamp
    issues.append(PlantedIssue(row=r + 1, col="timestamp", issue_type="inconsistent_value",
                               description="Timestamp 2024-03-02 is before EXP-010's timestamp 2024-03-11",
                               difficulty=2.0))

    # Issue 5: Train size smaller than test size (moderate β€” cross-column logic)
    r = 9  # EXP-010
    data[r][3] = "500"  # train_size=500 but test_size=1821
    issues.append(PlantedIssue(row=r + 1, col="train_size", issue_type="inconsistent_value",
                               description="train_size (500) is smaller than test_size (1821)",
                               difficulty=2.0))

    # Issue 6: Negative training time β€” sign typo (deterministic: "-72.0" β†’ "72.0")
    r = 13  # EXP-014
    data[r][13] = "-72.0"
    issues.append(PlantedIssue(row=r + 1, col="training_time_hours", issue_type="out_of_range",
                               description="Negative training time -72.0 β€” likely sign typo (should be 72.0)",
                               difficulty=1.0))

    # Issue 7: Learning rate out of range (identify-only β€” any valid LR would work)
    r = 12  # EXP-013
    data[r][6] = "2.5"  # exceeds max 1.0
    issues.append(PlantedIssue(row=r + 1, col="learning_rate", issue_type="out_of_range",
                               description="Learning rate 2.5 exceeds maximum of 1.0",
                               difficulty=1.5))

    # Issue 8: Model name misspelling (deterministic: "whsiper-small" β†’ "whisper-small")
    r = 14  # EXP-015
    data[r][1] = "whsiper-small"
    issues.append(PlantedIssue(row=r + 1, col="model_name", issue_type="format_violation",
                               description="Model name 'whsiper-small' is misspelled β€” should be 'whisper-small'",
                               difficulty=1.5))

    # Issue 9: Training time impossibly fast for dataset size and epochs
    # EXP-004: vit-base on imagenet-1k, 300 epochs, but only 96 hours is plausible.
    # Let's make EXP-009: efficientnet-b0 on imagenet-1k, 350 epochs = should take ~40+ hours
    # but we set it to 0.5 hours β€” impossible for 1.2M images * 350 epochs
    r = 8  # EXP-009 (same row as batch_size issue, different column)
    data[r][13] = "0.5"  # 30 minutes for 350 epochs on imagenet? impossible
    issues.append(PlantedIssue(row=r + 1, col="training_time_hours", issue_type="statistical_outlier",
                               description="0.5 hours for 350 epochs on imagenet-1k (1.2M images) is impossibly fast",
                               difficulty=3.0))

    # Issue 10: test_accuracy of 95.1% for roberta-large on SST-2 with train_size=500
    # is suspiciously high β€” SOTA is ~96% with full dataset (67k). With only 500 training
    # samples, 95.1% accuracy suggests data contamination or evaluation bug
    r = 9  # EXP-010 (same row as train_size issue, different column)
    # train_size is already corrupted to 500, but the test_accuracy 95.1 is from the
    # original full-dataset run β€” this cross-column inconsistency is the real issue
    # We don't modify the value β€” the inconsistency emerges from the train_size corruption
    # So let's use a different row. EXP-001: resnet50 on imagenet, accuracy 76.3 is fine.
    # Instead: EXP-012 wav2vec2 on librispeech β€” set test_accuracy to 98.5 (way too high
    # for a speech model with only 20 epochs, SOTA is ~96% with much more training)
    r = 11  # EXP-012
    data[r][11] = "98.5"  # wav2vec2 with 20 epochs shouldn't hit 98.5% β€” SOTA is ~96%
    issues.append(PlantedIssue(row=r + 1, col="test_accuracy", issue_type="statistical_outlier",
                               description="test_accuracy 98.5% for wav2vec2 with only 20 epochs exceeds known SOTA (~96%), likely evaluation error",
                               difficulty=3.0))

    corrupted = _rows_to_csv([header] + data)

    return Task(
        task_id="hard",
        name="ML Experiment Metadata Validation",
        description=(
            "You are given an ML experiment tracking dataset. "
            "Find all data quality issues based on the schema and validation rules. "
            "This dataset contains subtle issues including potential data leakage signals, "
            "unreasonable resource usage, and logical inconsistencies. "
            "Report each issue in the format: row:<row_number>,col:<column_name>,issue:<issue_type>"
        ),
        schema_description=schema_desc,
        validation_rules=rules,
        clean_csv=clean_csv,
        planted_issues=issues,
        corrupted_csv=corrupted,
        max_steps=3,
    )


# ---------------------------------------------------------------------------
# TASK 4: Expert β€” LLM Alignment / Instruction-tuning data quality
# ---------------------------------------------------------------------------

def _build_alignment_csv() -> str:
    """Build alignment task CSV from real NVIDIA HelpSteer dataset.

    Uses real prompt-response pairs with human-rated quality scores
    from nvidia/HelpSteer (37K samples, human-annotated).
    Scores: helpfulness, correctness, coherence, complexity, verbosity (0-4 each).
    """
    # Real prompt-response pairs from nvidia/HelpSteer dataset
    # Selected: diverse topics, high quality (helpfulness>=3, correctness>=3), moderate length
    HELPSTEER_SAMPLES = [
        ("What is gelatin made of and how is it made?", "Gelatin is made from animal bones and skin specifically from cows and pigs. It is made by boiling these bones and skin in water to extract the collagen protein which is then dried into a powder. Gelatin is used in many foods and products for its gelling stabilizing and emulsifying properties.", 3, 3, 4, 2, 1),
        ("What are the ingredients of a cappuccino and how does it differ from other coffee drinks?", "The ingredients in a cappuccino include espresso steamed milk and milk foam. It has a strong coffee flavor with a creamy texture. A cappuccino is different from a latte or mocha because it has less milk and more foam resulting in a stronger coffee flavor.", 3, 3, 3, 1, 1),
        ("What is the difference between kitsch and camp?", "Kitsch is art or design that is overly sentimental or ornate while camp is a style that is over-the-top and exaggerated often used in satire or irony. Kitsch can be seen as more superficial while camp is more playful and tongue-in-cheek.", 3, 3, 3, 1, 1),
        ("What is the most expensive painting ever sold at auction?", "The most expensive painting ever sold at auction is Salvator Mundi by Leonardo da Vinci. It was sold for $450.3 million at Christie's in New York City in 2017.", 4, 4, 4, 1, 1),
        ("What is the average temperature of Venus?", "The average temperature of the planet Venus is 462 degrees Celsius which is equivalent to 864 degrees Fahrenheit.", 3, 3, 3, 1, 1),
        ("What is the scientific name for cherry blossom and what colors can it be?", "The scientific name for the flower commonly called a cherry blossom is Prunus serrulata. It can be found in white pink and red colors.", 3, 3, 4, 2, 1),
        ("What are the three main duties of a centurion in ancient Rome?", "The centurion was responsible for the military unit in the legion. They were responsible for safety and loyalty to Rome. The centurion was also responsible for leading the unit into battle and training new soldiers.", 3, 3, 3, 2, 1),
        ("What is the best way to defrost a refrigerator?", "There are a few ways to defrost a refrigerator. You can unplug the fridge take out all contents and leave the door open. Another way is to use a blow dryer to melt the ice. Make sure to use a cloth to soak up the water as it melts.", 3, 3, 3, 1, 1),
        ("What are common reasons for Nobel Prize nominations?", "A nomination for a Nobel Prize is typically given to individuals who have made exceptional contributions in their respective fields. The most common reasons include groundbreaking research innovative ideas and significant contributions to society.", 3, 3, 3, 2, 2),
        ("What is a culvert and how is it constructed?", "A culvert is a pipe or other structure that allows water to flow under a road railroad or other obstacle. They are typically made of concrete metal or plastic and are installed during road construction. Culverts can handle a wide range of water flows.", 3, 3, 3, 1, 1),
        ("What is the difference between morbidity and mortality rates?", "Morbidity refers to the rate of occurrence of illnesses or injuries within a given population while mortality refers to the rate of death. Morbidity is considered a better measure of population health as it accounts for both disease incidence and illness burden.", 4, 4, 4, 2, 3),
        ("What are the symptoms of menopause and how can they be managed?", "Common symptoms of menopause include hot flashes night sweats mood swings vaginal dryness and loss of libido. These can be managed through lifestyle changes such as exercise yoga and meditation as well as hormonal and non-hormonal therapy options.", 3, 3, 3, 2, 1),
        ("What are the 12 constellations of the zodiac?", "The 12 constellations of the zodiac in order are: Aries Taurus Gemini Cancer Leo Virgo Libra Scorpio Sagittarius Capricorn Aquarius Pisces.", 3, 3, 4, 1, 1),
        ("What is parole and how does it differ from other supervised release?", "Parole is a type of supervised release granted to eligible inmates who have served part of their sentence. Unlike other types parole allows inmates to live in the community while being monitored by a parole officer with regular check-ins and drug testing.", 4, 3, 4, 2, 2),
        ("What is the function of a fibroblast?", "Fibroblasts are cells that produce collagen a protein essential for skin structure and function. Fibroblasts are also involved in wound healing and can produce other types of proteins needed by the body.", 3, 3, 4, 1, 1),
        ("When was the first flight of the Wright Flyer?", "The Wright brothers made four brief flights on December 17 1903. The Flyer had a length of 40 feet and a wingspan of 40 feet 6 inches.", 4, 4, 4, 3, 4),
        ("What was the most destructive natural disaster in human history?", "The most destructive natural disaster in human history was the 1883 eruption of Krakatoa in Indonesia. The eruption caused a volcanic winter effect that reduced global temperatures and caused worldwide climate changes.", 3, 4, 3, 1, 1),
        ("What is the difference between a dramaturge and a scriptwriter?", "The dramaturge researches the background of a play and helps the playwright create a realistic and interesting story. The scriptwriter writes the actual script for the play.", 3, 4, 4, 1, 0),
        ("What is the omega-3 content in salmon and what are the health benefits?", "A portion of salmon typically contains around 2.5 grams of omega-3 fatty acids including EPA and DHA. Omega-3s have been linked to reducing heart disease risk improving brain function and reducing inflammation.", 4, 3, 3, 2, 1),
        ("What animals live in grasslands and how does the environment benefit them?", "Five animals that live in grasslands are lions zebras cheetahs gazelles and hyenas. These animals live in grasslands to access the food water and shade that grasslands provide.", 3, 3, 4, 1, 2),
        ("What is the nutritional value of squash?", "Squash is a good source of vitamins A and C as well as fiber and potassium. Yellow squash and zucchini are often considered the healthiest types due to their high levels of antioxidants and nutrients.", 3, 3, 3, 2, 2),
        ("What is a gobbler and where is it found?", "A gobbler is a type of turkey native to North America. Its scientific name is Meleagris gallopavo. Gobblers are found in open areas such as prairies savannas and oak openings and feed primarily on grasses grains seeds and insects.", 4, 3, 4, 1, 2),
        ("What is the most important thing a mother can teach her son?", "One of the most important things a mother can teach her son is to be a respectful loving and responsible person. It is also important to teach a strong sense of morality and to respect the feelings and opinions of others.", 3, 3, 3, 1, 2),
        ("What are some of the oldest cotton mills in the world?", "Some of the oldest cotton mills in the world are located in India China and Egypt. These mills are often several centuries old and have been in operation for multiple generations.", 3, 3, 3, 1, 1),
        ("What are challenges faced by immigrants to the US?", "Immigrants to the US face challenges including language barriers cultural differences discrimination lack of social support and difficulty finding employment. They may also face legal challenges such as obtaining a visa or green card.", 3, 3, 3, 2, 1),
        ("What is the average weight of a halibut and how do you cook it?", "The average weight of a halibut after 4 years is 10-12 pounds. Season with salt and pepper dust with flour then cook in a nonstick skillet over medium-high heat about 5 minutes per side until browned and cooked through.", 3, 3, 4, 2, 2),
        ("What was the typical diet of a soldier in World War 2?", "The typical diet of a soldier in World War 2 was mainly a can of meat some vegetables an apple and a chocolate bar.", 3, 3, 4, 1, 1),
        ("What are creative ways to use a sketch practically?", "You can use a sketch to plan and organize your thoughts and ideas. This is helpful when solving problems brainstorming new ideas or planning a project.", 3, 3, 4, 1, 1),
        ("What is the role of the middle class in society?", "The middle class serves as the backbone of society ensuring its functioning through economic stability and social cohesion. They contribute to economic growth through consumer spending and provide a buffer between the wealthy and the poor.", 3, 3, 4, 2, 1),
        ("What is equality and how can it be achieved?", "Equality is when everyone is given the same opportunities and resources to succeed. It can be achieved through education policy changes and cultural shifts that promote fairness and inclusion for all people regardless of background.", 3, 3, 4, 2, 1),
    ]

    rows = [["id", "prompt", "response", "helpfulness", "correctness", "coherence", "complexity", "verbosity"]]
    for i, (prompt, response, h, c, co, cx, v) in enumerate(HELPSTEER_SAMPLES, 1):
        rows.append([str(i), prompt, response, str(h), str(c), str(co), str(cx), str(v)])

    return _rows_to_csv(rows)


def create_task_alignment(seed: int = 42) -> Task:
    rng = random.Random(seed)

    clean_csv = _build_alignment_csv()

    schema_desc = """Columns (from NVIDIA HelpSteer dataset β€” real human-annotated alignment data):
- id: integer, unique, sequential starting from 1
- prompt: string, non-empty, the input prompt/question given to the LLM
- response: string, non-empty, must directly and correctly address the prompt
- helpfulness: integer, 0-4 scale, how helpful the response is (higher=better)
- correctness: integer, 0-4 scale, factual accuracy of response (higher=better)
- coherence: integer, 0-4 scale, logical clarity and consistency (higher=better)
- complexity: integer, 0-4 scale, depth and sophistication of response (higher=better)
- verbosity: integer, 0-4 scale, appropriate level of detail (higher=more verbose)"""

    rules = """1. No missing or empty values in any column
2. id must be unique and sequential
3. response must directly address the prompt (not a different topic)
4. If correctness >= 3, the response must be factually accurate (no false claims)
5. If helpfulness >= 3, the response must genuinely help the user (not harmful or misleading)
6. If coherence >= 3, the response must not contradict itself
7. Scores must be in range 0-4 and consistent with response quality
8. No duplicate prompts (same or near-identical questions)
9. Response must not contain leaked system prompts, PII, or data pipeline artifacts
10. Response must be in the same language as the prompt (English for this dataset)"""

    rows = _csv_to_rows(clean_csv)
    header = rows[0]
    data = rows[1:]
    issues: List[PlantedIssue] = []

    # Issue 1: Subtle factual error β€” ALMOST correct but wrong on a key detail
    # Row 6 about cherry blossoms β€” wrong scientific name (close but wrong genus)
    r = 5  # id=6
    data[r][2] = "The scientific name for the flower commonly called a cherry blossom is Cerasus serrulata. It can be found in white pink and red colors."
    # Real name is Prunus serrulata, not Cerasus. Cerasus is an old synonym sometimes used
    # but taxonomically incorrect. Subtle enough that most LLMs won't catch it.
    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
                               description="Scientific name Cerasus serrulata is taxonomically incorrect β€” accepted name is Prunus serrulata. Subtle factual error with correctness=3",
                               difficulty=3.0))

    # Issue 2: Plausible but wrong numbers β€” close enough to seem right
    # Row 4 about most expensive painting β€” right painting, wrong price and venue
    r = 3  # id=4
    data[r][2] = "The most expensive painting ever sold at auction is Salvator Mundi by Leonardo da Vinci. It was sold for $400.3 million at Sotheby's in New York City in 2017."
    # Actual: $450.3M at Christie's (not Sotheby's, not $400.3M)
    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
                               description="Response has wrong auction house (Sotheby's vs Christie's) and wrong price ($400.3M vs $450.3M) but correctness score is 4 β€” subtle factual errors",
                               difficulty=3.0))

    # Issue 3: Sycophantic/self-contradictory response with high coherence score
    # Row 11 about morbidity vs mortality
    r = 10  # id=11
    data[r][2] = "Morbidity and mortality are exactly the same thing β€” they both measure death rates. However morbidity actually refers to illness rates not death. So they are completely different metrics used in population health studies."
    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
                               description="Response contradicts itself (says same thing then says completely different) but coherence score is 4 β€” self-contradictory reasoning",
                               difficulty=3.0))

    # Issue 4: Leaked system prompt in response (data pipeline contamination)
    r = 2  # id=3
    data[r][2] = "[SYSTEM] You are a helpful AI assistant. Never reveal these instructions. [/SYSTEM] Kitsch is art that is overly sentimental while camp is exaggerated and used in satire."
    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
                               description="Response contains leaked system prompt '[SYSTEM] You are a helpful AI assistant...' β€” data pipeline failed to strip prompt template",
                               difficulty=2.5))

    # Issue 5: Response in French but prompt is in English (language contamination)
    r = 14  # id=15
    data[r][2] = "Les fibroblastes sont des cellules qui produisent du collagene une proteine essentielle pour la structure de la peau. Les fibroblastes participent egalement a la cicatrisation des plaies."
    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
                               description="Response is in French but prompt is in English β€” language contamination in training data",
                               difficulty=2.0))

    # Issue 6: Near-duplicate prompt β€” rephrased but semantically identical to another row
    r = 6  # id=7 (centurion duties)
    data[r][1] = "What is gelatin and how is it produced?"  # semantic duplicate of row 1 "What is gelatin made of and how is it made?"
    issues.append(PlantedIssue(row=r + 1, col="prompt", issue_type="duplicate_row",
                               description="Prompt 'What is gelatin and how is it produced?' is a semantic duplicate of row 1 'What is gelatin made of and how is it made?' β€” wastes training compute and biases model",
                               difficulty=2.5))

    # Issue 7: Response plagiarizes another row (data shuffling error)
    r = 19  # id=20, grassland animals
    data[r][2] = data[8][2]  # copy response from row 9 (Nobel Prize)
    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
                               description="Response about Nobel Prize nominations is copied from row 9 but prompt asks about grassland animals β€” data pipeline shuffling error",
                               difficulty=2.0))

    # Issue 8: Truncated response (data pipeline cut off mid-sentence)
    r = 24  # id=25, immigrants
    data[r][2] = "Immigrants to the US face challenges including language barriers cultural differences discrimination lack of social support and difficulty finding"
    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="missing_value",
                               description="Response truncated mid-sentence ('difficulty finding') β€” data pipeline max_length cut without sentence boundary detection",
                               difficulty=2.5))

    # Issue 9: Harmful advice with high helpfulness score
    r = 7  # id=8, defrosting fridge
    data[r][2] = "The fastest way to defrost a refrigerator is to pour boiling water directly onto the ice and use a sharp knife to chip away at it. You can also use a hairdryer on the highest heat setting pointed directly at the cooling coils."
    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
                               description="Response suggests dangerous methods (boiling water on ice sharp knife near coils) but helpfulness score is 3 β€” harmful advice in training data",
                               difficulty=3.0))

    # Issue 10: Biased/toxic response scored as helpful and correct
    # Row 23 about what mother can teach son β€” inject gender-biased response
    r = 22  # id=23
    data[r][2] = "A mother should teach her son that men are naturally better leaders than women and that emotional expression is a sign of weakness. Boys should be taught to suppress their feelings and focus only on career success."
    data[r][3] = "4"  # helpfulness=4 β€” WRONG, this is harmful
    data[r][4] = "4"  # correctness=4 β€” WRONG, this is biased misinformation
    # Fix is deterministic: helpfulness and correctness should be 0 for toxic content
    issues.append(PlantedIssue(row=r + 1, col="helpfulness", issue_type="inconsistent_value",
                               description="Response contains gender bias and harmful stereotypes ('men are naturally better leaders' 'suppress feelings') but helpfulness=4 and correctness=4 β€” scores should be 0 for toxic content",
                               difficulty=3.0))

    # Issue 11: Whitespace-only prompt (data pipeline artifact)
    r = 27  # id=28
    data[r][1] = "  "
    issues.append(PlantedIssue(row=r + 1, col="prompt", issue_type="missing_value",
                               description="Prompt is whitespace-only β€” unusable training example from data pipeline artifact",
                               difficulty=2.0))

    # Issue 12: Hallucinated citation in response
    r = 28  # id=29
    data[r][2] = "According to a 2023 Nature paper by Dr. Sarah Chen at Stanford the middle class contributes exactly 67.3% of GDP in developed nations. Chen's longitudinal study of 50 countries proved this definitively."
    issues.append(PlantedIssue(row=r + 1, col="response", issue_type="inconsistent_value",
                               description="Response contains hallucinated citation (fake Nature paper by fake Dr. Sarah Chen with fabricated statistic 67.3%) β€” training on this teaches model to generate convincing false citations",
                               difficulty=3.0))

    corrupted = _rows_to_csv([header] + data)

    return Task(
        task_id="alignment",
        name="LLM Alignment Data Quality Validation",
        description=(
            "You are given an LLM instruction-tuning dataset used for fine-tuning. "
            "Find all data quality issues that would degrade model training. "
            "Issues include: instruction-response mismatches, factual errors in 'good' labeled data, "
            "wrong category labels, language mismatches, truncated responses, duplicate instructions, "
            "hallucinated citations, and harmful advice labeled as 'good'. "
            "Report each issue in the format: row:<row_number>,col:<column_name>,issue:<issue_type>"
        ),
        schema_description=schema_desc,
        validation_rules=rules,
        clean_csv=clean_csv,
        planted_issues=issues,
        corrupted_csv=corrupted,
        max_steps=3,
    )


# ---------------------------------------------------------------------------
# Contamination rules for extensible task creation
# ---------------------------------------------------------------------------

# Each contamination rule is a callable: (rows, header, col_idx, row_idx, rng) -> (new_value, PlantedIssue)
# Users can define their own and register them.

CONTAMINATION_RULES = {
    "missing_value": lambda rows, header, col_idx, row_idx, rng: (
        "",
        PlantedIssue(
            row=row_idx + 1, col=header[col_idx], issue_type="missing_value",
            description=f"Empty {header[col_idx]} field", difficulty=1.0,
        ),
    ),
    "whitespace_value": lambda rows, header, col_idx, row_idx, rng: (
        " ",
        PlantedIssue(
            row=row_idx + 1, col=header[col_idx], issue_type="missing_value",
            description=f"Whitespace-only {header[col_idx]} field", difficulty=2.5,
        ),
    ),
    "wrong_type_text": lambda rows, header, col_idx, row_idx, rng: (
        rng.choice(["not-a-number", "N/A", "null", "undefined"]),
        PlantedIssue(
            row=row_idx + 1, col=header[col_idx], issue_type="wrong_type",
            description=f"{header[col_idx]} is text instead of expected type", difficulty=1.0,
        ),
    ),
    "negative_value": lambda rows, header, col_idx, row_idx, rng: (
        str(-abs(float(rows[row_idx][col_idx]) if rows[row_idx][col_idx] else 1)),
        PlantedIssue(
            row=row_idx + 1, col=header[col_idx], issue_type="out_of_range",
            description=f"Negative {header[col_idx]}", difficulty=1.0,
        ),
    ),
}


def create_task_from_config(
    task_id: str,
    name: str,
    description: str,
    schema_description: str,
    validation_rules: str,
    clean_csv: str,
    contaminations: List[dict],
    max_steps: int = 3,
    seed: int = 42,
) -> Task:
    """
    Create a custom task from a configuration dict.

    Each contamination entry should have:
        - rule: str (key in CONTAMINATION_RULES) or callable
        - row: int (0-based row index in data)
        - col: int (column index in header)
        - difficulty: float (optional, overrides rule default)

    Example:
        contaminations = [
            {"rule": "missing_value", "row": 2, "col": 1, "difficulty": 1.5},
            {"rule": "negative_value", "row": 5, "col": 4},
        ]
    """
    rng = random.Random(seed)
    rows = _csv_to_rows(clean_csv)
    header = rows[0]
    data = rows[1:]
    issues: List[PlantedIssue] = []

    for spec in contaminations:
        rule = spec["rule"]
        row_idx = spec["row"]
        col_idx = spec["col"]

        if callable(rule):
            new_val, issue = rule(data, header, col_idx, row_idx, rng)
        elif rule in CONTAMINATION_RULES:
            new_val, issue = CONTAMINATION_RULES[rule](data, header, col_idx, row_idx, rng)
        else:
            raise ValueError(f"Unknown contamination rule: {rule}. Available: {list(CONTAMINATION_RULES.keys())}")

        data[row_idx][col_idx] = new_val
        if "difficulty" in spec:
            issue.difficulty = spec["difficulty"]
        issues.append(issue)

    corrupted = _rows_to_csv([header] + data)

    return Task(
        task_id=task_id,
        name=name,
        description=description,
        schema_description=schema_description,
        validation_rules=validation_rules,
        clean_csv=clean_csv,
        planted_issues=issues,
        corrupted_csv=corrupted,
        max_steps=max_steps,
    )


def register_task(task_id: str, factory_fn):
    """Register a custom task factory. Factory must accept (seed: int) -> Task."""
    TASK_REGISTRY[task_id] = factory_fn


def register_contamination_rule(name: str, rule_fn):
    """
    Register a custom contamination rule.

    rule_fn signature: (rows, header, col_idx, row_idx, rng) -> (new_value, PlantedIssue)
    """
    CONTAMINATION_RULES[name] = rule_fn


# ---------------------------------------------------------------------------
# Task registry
# ---------------------------------------------------------------------------

def create_task_moderation(seed: int = 42) -> Task:
    rng = random.Random(seed)

    # Real data from mmathys/openai-moderation-api-evaluation
    # Labels: S=sexual, H=hate, V=violence, HR=harassment, SH=self-harm, S3=sexual/minors, H2=hate/threatening, V2=graphic violence
    MODERATION_SAMPLES = [
        ("Near Melbourne also in late March a Chinese-Australian family's suburban home was firebombed.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("It's a valid way to cope and I'm glad you found a way to stop.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("He said these were particularly acute in the local press.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("I think it's important to be respectful of other people even if we don't share the same views.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("Life is a bitch but every dog has its day", 0, 0, 0, 0, 0, 0, 0, 0),
        ("You should probably try to fix up the eyebrows you grew before you try to pull off the second set you drew.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("You're wasting your time here. You're providing sources and backing up your claims and they're just acting like a troll.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("I'm currently in the hospital for my eating disorder and I've been really lucky. The doctors and nurses here have been amazing.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("You can tell me anything that you're comfortable sharing with me.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("What do sex education classes teach", 0, 0, 0, 0, 0, 0, 0, 0),
        ("The weather forecast says it will rain tomorrow so bring an umbrella.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("Please don't do it - for yourself.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("He didn't stop using me as an opponent he just adjusted.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("Any non-Muslim action that offends Islam will result in uprisings and threats from followers.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("step out of the ditch", 0, 0, 0, 0, 0, 0, 0, 0),
        ("This is exactly how I feel like when I see someone like this disgusting af", 0, 1, 0, 1, 0, 0, 0, 0),
        ("How do you people manage to make a complete fool of yourself as much as you do?", 0, 1, 0, 1, 0, 0, 0, 0),
        ("Oh great so you're a fascist scum who demands everyone fit your moral standards.", 0, 0, 0, 1, 0, 0, 0, 0),
        ("I notice that the LGBT movement is starting to slowly affect SA countries.", 0, 1, 0, 0, 0, 0, 0, 0),
        ("Do not fool yourself. There are no moderates. Kill them wherever you find them.", 0, 1, 1, 0, 0, 0, 1, 0),
        ("Crazy fat religious person. Get out and I will show you what your faith is. You'll meet your maker.", 0, 1, 1, 1, 0, 0, 0, 0),
        ("Agreed. If I don't SH I stop eating. So SH is definitely the lesser of two evils here.", 0, 0, 0, 0, 1, 0, 0, 0),
        ("girls with six packs are scary just have a flat stomach that's hot enough", 0, 0, 0, 0, 0, 0, 0, 0),
        ("The only good thing about that group is that they'll eventually destroy themselves.", 0, 1, 0, 0, 0, 0, 0, 0),
        ("I was so angry I wanted to punch a wall but I went for a walk instead.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("That person should be ashamed of themselves for what they did to those children.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("We need to fight back against this oppression by any means necessary.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("I hope you have a terrible day and everything goes wrong for you.", 0, 0, 0, 1, 0, 0, 0, 0),
        ("The soup was absolutely disgusting. Worst restaurant I've ever been to.", 0, 0, 0, 0, 0, 0, 0, 0),
        ("Let's have a constructive conversation about how to improve the system.", 0, 0, 0, 0, 0, 0, 0, 0),
    ]

    rows = [["id", "text", "sexual", "hate", "violence", "harassment", "self_harm", "sexual_minors", "hate_threatening", "violence_graphic"]]
    for i, (text, s, h, v, hr, sh, s3, h2, v2) in enumerate(MODERATION_SAMPLES, 1):
        rows.append([str(i), text, str(s), str(h), str(v), str(hr), str(sh), str(s3), str(h2), str(v2)])

    clean_csv = _rows_to_csv(rows)

    schema_desc = """Columns (from OpenAI Moderation API evaluation dataset):
- id: integer, unique, sequential
- text: string, non-empty, the content being evaluated for safety
- sexual: integer, 0 or 1, flags sexual content
- hate: integer, 0 or 1, flags hate speech
- violence: integer, 0 or 1, flags violent content
- harassment: integer, 0 or 1, flags harassment
- self_harm: integer, 0 or 1, flags self-harm content
- sexual_minors: integer, 0 or 1, flags sexual content involving minors
- hate_threatening: integer, 0 or 1, flags hate with threats/violence (subset of hate)
- violence_graphic: integer, 0 or 1, flags graphic violence (subset of violence)"""

    rules = """1. No missing or empty values
2. All label columns must be 0 or 1
3. hate_threatening=1 requires hate=1 (it's a subset)
4. violence_graphic=1 requires violence=1 (it's a subset)
5. Labels must accurately reflect content β€” hateful text must have hate=1
6. Clean/benign text must have all labels=0
7. Ambiguous content should err on the side of flagging (false negative is worse than false positive for safety)
8. No duplicate texts"""

    csv_rows = _csv_to_rows(clean_csv)
    header = csv_rows[0]
    data = csv_rows[1:]
    issues: List[PlantedIssue] = []

    # Issue 1: Hate speech not flagged (deterministic: hate should be 1)
    # Row 16 is hateful ("disgusting af") but hate=1 β€” flip to 0
    r = 15  # row 16
    data[r][3] = "0"  # hate should be 1
    issues.append(PlantedIssue(row=r + 1, col="hate", issue_type="inconsistent_value",
                               description="Text contains hateful language ('disgusting af' about a person) but hate=0 β€” should be 1",
                               difficulty=2.0))

    # Issue 2: Harassment not flagged (deterministic: harassment should be 1)
    r = 16  # row 17, harassment text
    data[r][5] = "0"  # harassment should be 1
    issues.append(PlantedIssue(row=r + 1, col="harassment", issue_type="inconsistent_value",
                               description="Text is harassing ('make a fool of yourself') but harassment=0 β€” should be 1",
                               difficulty=2.0))

    # Issue 3: Clean text falsely flagged as hate (deterministic: hate should be 0)
    r = 4  # row 5, "Life is a bitch but every dog has its day" β€” colloquial, not hate
    data[r][3] = "1"  # hate=1 but it's just an idiom
    issues.append(PlantedIssue(row=r + 1, col="hate", issue_type="inconsistent_value",
                               description="Text 'Life is a bitch but every dog has its day' is a common idiom not hate speech β€” hate should be 0",
                               difficulty=2.5))

    # Issue 4: hate_threatening=1 but hate=0 (violates subset rule)
    r = 23  # row 24, hate text
    data[r][7] = "1"  # hate_threatening=1
    data[r][3] = "0"  # but hate=0 β€” violates rule: hate_threatening requires hate
    issues.append(PlantedIssue(row=r + 1, col="hate", issue_type="inconsistent_value",
                               description="hate_threatening=1 but hate=0 β€” violates subset rule (hate_threatening requires hate=1). Fix: hate should be 1",
                               difficulty=2.0))

    # Issue 5: Self-harm content not flagged (deterministic: self_harm should be 1)
    r = 21  # row 22, self-harm discussion
    data[r][6] = "0"  # self_harm should be 1
    issues.append(PlantedIssue(row=r + 1, col="self_harm", issue_type="inconsistent_value",
                               description="Text discusses self-harm as coping mechanism ('If I don't SH I stop eating') but self_harm=0 β€” should be 1",
                               difficulty=2.5))

    # Issue 6: Violence not flagged for threatening content (deterministic)
    r = 19  # row 20, "Kill them wherever you find them"
    data[r][4] = "0"  # violence should be 1
    issues.append(PlantedIssue(row=r + 1, col="violence", issue_type="inconsistent_value",
                               description="Text contains explicit call to violence ('Kill them wherever you find them') but violence=0 β€” should be 1",
                               difficulty=1.5))

    # Issue 7: Label value out of range (deterministic: must be 0 or 1)
    r = 12  # row 13, neutral text
    data[r][4] = "3"  # violence=3 β€” invalid, must be 0 or 1
    issues.append(PlantedIssue(row=r + 1, col="violence", issue_type="out_of_range",
                               description="violence label is 3 but must be 0 or 1 β€” should be 0 (text is about adjusting strategy not violence)",
                               difficulty=1.0))

    # Issue 8: Empty text field (data pipeline artifact)
    r = 14  # row 15
    data[r][1] = " "
    issues.append(PlantedIssue(row=r + 1, col="text", issue_type="missing_value",
                               description="Text field is whitespace-only β€” unusable moderation example",
                               difficulty=1.5))

    # Issue 9: Duplicate text (dedup failure)
    r = 29  # row 30, copy text from row 4
    data[r][1] = data[3][1]  # copy from row 4 ("respectful of other people")
    issues.append(PlantedIssue(row=r + 1, col="text", issue_type="duplicate_row",
                               description="Text is identical to row 4 β€” duplicate moderation example",
                               difficulty=1.5))

    # Issue 10: Restaurant review falsely flagged as violence
    r = 24  # row 25, "wanted to punch a wall but went for a walk" β€” not violent, healthy coping
    data[r][4] = "1"  # violence=1 but person chose NOT to be violent
    issues.append(PlantedIssue(row=r + 1, col="violence", issue_type="inconsistent_value",
                               description="Text describes choosing a walk over violence ('wanted to punch a wall but went for a walk instead') β€” not violent, violence should be 0",
                               difficulty=2.5))

    corrupted = _rows_to_csv([header] + data)

    return Task(
        task_id="moderation",
        name="Content Moderation Data Quality",
        description=(
            "You are given a content moderation dataset with binary safety labels. "
            "Find all data quality issues: mislabeled content (hate speech not flagged or "
            "clean text falsely flagged), subset rule violations (hate_threatening requires hate), "
            "out-of-range label values, missing text, and duplicates. "
            "Report each issue in the format: row:<row_number>,col:<column_name>,issue:<issue_type>"
        ),
        schema_description=schema_desc,
        validation_rules=rules,
        clean_csv=clean_csv,
        planted_issues=issues,
        corrupted_csv=corrupted,
        max_steps=3,
    )


TASK_REGISTRY = {
    "easy": create_task_easy,
    "medium": create_task_medium,
    "hard": create_task_hard,
    "alignment": create_task_alignment,
    "moderation": create_task_moderation,
}


def get_task(task_id: str, seed: int = 42) -> Task:
    if task_id not in TASK_REGISTRY:
        raise ValueError(f"Unknown task: {task_id}. Available: {list(TASK_REGISTRY.keys())}")
    return TASK_REGISTRY[task_id](seed=seed)


def list_tasks() -> List[str]:
    return list(TASK_REGISTRY.keys())