File size: 2,679 Bytes
d2d30e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40fcf49
 
 
 
 
d2d30e9
40fcf49
 
 
 
 
 
 
 
d2d30e9
 
 
40fcf49
d2d30e9
 
 
 
 
 
 
 
 
 
3d6f059
d2d30e9
 
 
40fcf49
d2d30e9
 
 
 
40fcf49
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""
Task 2 — Medium: Fix Formats + Remove Duplicates
Objective: Standardise phone & date formats and drop duplicate rows.
Score: weighted average of format_score (0.7) + dupe_score (0.3)
"""

import re
import pandas as pd
from server.data_generator import generate_task2_datasets

TASK_ID = 2
MAX_STEPS = 30
DESCRIPTION = (
    "Task 2 (Medium) — Fix Formats and Remove Duplicates\n"
    "You have a product catalog with:\n"
    "  • Phone numbers in mixed formats (need: NNN-NNN-NNNN)\n"
    "  • Dates in mixed formats (need: YYYY-MM-DD)\n"
    "  • Duplicate rows (~15)\n\n"
    "Available operations:\n"
    "  fix_format  — column: 'phone' | 'listed_date'\n"
    "  drop_duplicates — no column needed\n\n"
    "Example actions:\n"
    '  {"operation": "fix_format", "column": "phone"}\n'
    '  {"operation": "fix_format", "column": "listed_date"}\n'
    '  {"operation": "drop_duplicates"}'
)

PHONE_RE = re.compile(r"^\d{3}-\d{3}-\d{4}$")
DATE_RE  = re.compile(r"^\d{4}-\d{2}-\d{2}$")


# Cache at module load — seed=42 makes output identical every time
_DIRTY_TEMPLATE, _CLEAN_DF = generate_task2_datasets()
_META_TEMPLATE = {
    "orig_phone": int((~_DIRTY_TEMPLATE["phone"].str.match(PHONE_RE, na=False)).sum()),
    "orig_date":  int((~_DIRTY_TEMPLATE["listed_date"].apply(
        lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
    )).sum()),
    "orig_dupes": len(_DIRTY_TEMPLATE) - len(_DIRTY_TEMPLATE.drop_duplicates()),
}


def load():
    """Return (dirty_df, clean_df, meta) — uses cached template."""
    return _DIRTY_TEMPLATE.copy(), _CLEAN_DF, dict(_META_TEMPLATE)


def score(current_df, meta: dict) -> float:
    phone_issues = int((~current_df["phone"].str.match(PHONE_RE, na=False)).sum())
    date_issues  = int((~current_df["listed_date"].apply(
        lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
    )).sum())
    dupes        = len(current_df) - len(current_df.drop_duplicates())

    phone_score = 1.0 - phone_issues / max(meta["orig_phone"], 1)
    date_score  = 1.0 - date_issues  / max(meta["orig_date"],  1)
    dupe_score  = 1.0 - dupes        / max(meta["orig_dupes"], 1)

    combined = 0.35 * phone_score + 0.35 * date_score + 0.30 * dupe_score
    return round(max(0.01, min(0.99, combined)), 4)


def count_errors(current_df, meta: dict) -> int:
    phone_issues = int((~current_df["phone"].str.match(PHONE_RE, na=False)).sum())
    date_issues  = int((~current_df["listed_date"].apply(
        lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
    )).sum())
    dupes = len(current_df) - len(current_df.drop_duplicates())
    return phone_issues + date_issues + dupes