File size: 1,529 Bytes
d2d30e9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40fcf49
 
 
 
d2d30e9
 
40fcf49
 
d2d30e9
 
 
 
 
3d6f059
d2d30e9
3d6f059
d2d30e9
 
 
40fcf49
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
"""
Task 1 — Easy: Fill Missing Values
Objective: Fill all NaN values in the employee records DataFrame.
Score: 1.0 - (remaining_nulls / original_nulls), clamped to [0.01, 0.99] and rounded to 4 decimals
"""

from server.data_generator import generate_task1_datasets

# Numeric identifier of this task.
TASK_ID = 1
# Step budget associated with this task.
MAX_STEPS = 20
# Human-readable task prompt, assembled one output line per list entry.
DESCRIPTION = "\n".join(
    [
        "Task 1 (Easy) — Fill Missing Values",
        (
            "You have an employee records dataset with missing values (NaN) in "
            "'age', 'salary', and 'department' columns. "
            "Your goal is to fill all missing values so the dataset is complete."
        ),
        "",  # blank line separating the goal from the operation reference
        "Available operation: fill_missing",
        "  params.strategy: 'median' | 'mean' | 'mode' | 'constant'",
        "  params.value: (required when strategy='constant') the fill value",
        (
            'Example action: {"operation": "fill_missing", "column": "age", '
            '"params": {"strategy": "median"}}'
        ),
    ]
)

# Build the task datasets once, at import time, and keep them as module-level
# templates so load() does not have to regenerate them on every call.
# NOTE(review): the original comment states that seed=42 makes the generator
# output identical every time; the seed is not visible in this file — confirm
# in server.data_generator.generate_task1_datasets.
_DIRTY_TEMPLATE, _CLEAN_DF = generate_task1_datasets()
# Total number of null cells (summed over all columns) in the dirty template;
# load() returns it as the baseline null count.
_ORIGINAL_NULLS = int(_DIRTY_TEMPLATE.isnull().sum().sum())


def load():
    """Return ``(dirty_df, clean_df, original_null_count)`` from the cached datasets.

    Returns:
        A 3-tuple of:
        - a fresh copy of the cached dirty template,
        - a fresh copy of the cached clean dataset,
        - the number of null cells counted in the dirty template at import time.

    Both frames are copied so that a caller mutating either one in place
    cannot alter the module-level cache that later ``load()`` calls rely on.
    (Previously only the dirty frame was copied; the clean frame was handed
    out by reference.)
    """
    # presumably _CLEAN_DF is a pandas DataFrame like _DIRTY_TEMPLATE — it
    # comes from the same generator call; verify if the generator changes.
    return _DIRTY_TEMPLATE.copy(), _CLEAN_DF.copy(), _ORIGINAL_NULLS


def score(current_df, original_nulls: int) -> float:
    """Return the fraction of originally-missing cells that are now filled.

    The raw value ``1.0 - remaining / original_nulls`` is clamped to the
    range [0.01, 0.99] and rounded to 4 decimal places, so the result is
    never exactly 0 or 1.

    Args:
        current_df: Object exposing ``isnull().sum().sum()`` (e.g. a pandas
            DataFrame) whose remaining null cells are counted.
        original_nulls: Number of null cells present before any cleaning.

    Returns:
        The clamped, rounded score; ``0.99`` when ``original_nulls`` is 0.
    """
    # Nothing was missing to begin with: report the top score and avoid
    # dividing by zero below.
    if original_nulls == 0:
        return 0.99

    nulls_left = int(current_df.isnull().sum().sum())
    filled_fraction = 1.0 - nulls_left / original_nulls

    # Clamp into [0.01, 0.99]: cap the top first, then raise the floor.
    capped = min(0.99, filled_fraction)
    clamped = max(0.01, capped)
    return round(clamped, 4)


def count_errors(current_df) -> int:
    """Return the total number of null cells in ``current_df``.

    Args:
        current_df: Object exposing ``isnull().sum().sum()`` (e.g. a pandas
            DataFrame).

    Returns:
        The null count summed over every column, as a plain ``int``.
    """
    null_mask = current_df.isnull()
    nulls_per_column = null_mask.sum()
    total_nulls = nulls_per_column.sum()
    return int(total_nulls)