Spaces:
Running
Running
File size: 2,679 Bytes
d2d30e9 40fcf49 d2d30e9 40fcf49 d2d30e9 40fcf49 d2d30e9 3d6f059 d2d30e9 40fcf49 d2d30e9 40fcf49 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | """
Task 2 — Medium: Fix Formats + Remove Duplicates
Objective: Standardise phone & date formats and drop duplicate rows.
Score: weighted average of format_score (0.7) + dupe_score (0.3)
"""
import re
import pandas as pd
from server.data_generator import generate_task2_datasets
TASK_ID = 2
MAX_STEPS = 30
DESCRIPTION = (
"Task 2 (Medium) — Fix Formats and Remove Duplicates\n"
"You have a product catalog with:\n"
" • Phone numbers in mixed formats (need: NNN-NNN-NNNN)\n"
" • Dates in mixed formats (need: YYYY-MM-DD)\n"
" • Duplicate rows (~15)\n\n"
"Available operations:\n"
" fix_format — column: 'phone' | 'listed_date'\n"
" drop_duplicates — no column needed\n\n"
"Example actions:\n"
' {"operation": "fix_format", "column": "phone"}\n'
' {"operation": "fix_format", "column": "listed_date"}\n'
' {"operation": "drop_duplicates"}'
)
PHONE_RE = re.compile(r"^\d{3}-\d{3}-\d{4}$")
DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")
# Cache at module load — seed=42 makes output identical every time
_DIRTY_TEMPLATE, _CLEAN_DF = generate_task2_datasets()
_META_TEMPLATE = {
"orig_phone": int((~_DIRTY_TEMPLATE["phone"].str.match(PHONE_RE, na=False)).sum()),
"orig_date": int((~_DIRTY_TEMPLATE["listed_date"].apply(
lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
)).sum()),
"orig_dupes": len(_DIRTY_TEMPLATE) - len(_DIRTY_TEMPLATE.drop_duplicates()),
}
def load():
"""Return (dirty_df, clean_df, meta) — uses cached template."""
return _DIRTY_TEMPLATE.copy(), _CLEAN_DF, dict(_META_TEMPLATE)
def score(current_df, meta: dict) -> float:
phone_issues = int((~current_df["phone"].str.match(PHONE_RE, na=False)).sum())
date_issues = int((~current_df["listed_date"].apply(
lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
)).sum())
dupes = len(current_df) - len(current_df.drop_duplicates())
phone_score = 1.0 - phone_issues / max(meta["orig_phone"], 1)
date_score = 1.0 - date_issues / max(meta["orig_date"], 1)
dupe_score = 1.0 - dupes / max(meta["orig_dupes"], 1)
combined = 0.35 * phone_score + 0.35 * date_score + 0.30 * dupe_score
return round(max(0.01, min(0.99, combined)), 4)
def count_errors(current_df, meta: dict) -> int:
phone_issues = int((~current_df["phone"].str.match(PHONE_RE, na=False)).sum())
date_issues = int((~current_df["listed_date"].apply(
lambda x: bool(DATE_RE.match(str(x))) if pd.notna(x) else False
)).sum())
dupes = len(current_df) - len(current_df.drop_duplicates())
return phone_issues + date_issues + dupes |