"""
Task 1 — Easy: Fill Missing Values
Objective: Fill all NaN values in the employee records DataFrame.
Score: 1.0 - (remaining_nulls / original_nulls), clamped to [0.01, 0.99]
"""
from server.data_generator import generate_task1_datasets
# Numeric identifier of this task.
TASK_ID = 1
# Step budget associated with this task (presumably enforced by the caller).
MAX_STEPS = 20
# Human-readable task description: one list entry per output line, joined
# with newlines. The empty entry produces the blank line after the intro.
DESCRIPTION = "\n".join(
    [
        "Task 1 (Easy) — Fill Missing Values",
        (
            "You have an employee records dataset with missing values (NaN) in "
            "'age', 'salary', and 'department' columns. "
            "Your goal is to fill all missing values so the dataset is complete."
        ),
        "",
        "Available operation: fill_missing",
        " params.strategy: 'median' | 'mean' | 'mode' | 'constant'",
        " params.value: (required when strategy='constant') the fill value",
        (
            'Example action: {"operation": "fill_missing", "column": "age", '
            '"params": {"strategy": "median"}}'
        ),
    ]
)
# Build the dirty/clean dataset pair once at import time so that load() can
# reuse it instead of regenerating the data on every call.
# NOTE(review): the previous comment stated that the generator uses seed=42
# and therefore returns identical output every time; that is not visible from
# this file — confirm in server.data_generator.
_DIRTY_TEMPLATE, _CLEAN_DF = generate_task1_datasets()
# Null count of the dirty template (isnull() totals summed), computed once and
# handed out by load() as the scoring baseline.
_ORIGINAL_NULLS = int(_DIRTY_TEMPLATE.isnull().sum().sum())
def load():
    """Return ``(dirty_df, clean_df, original_null_count)`` from the module cache.

    Only the dirty frame is copied, so each caller gets its own object to
    modify. The clean frame and the null count are the cached module-level
    objects themselves and are returned as-is.
    """
    working_copy = _DIRTY_TEMPLATE.copy()
    return working_copy, _CLEAN_DF, _ORIGINAL_NULLS
def score(current_df, original_nulls: int) -> float:
    """Return the fraction of originally-missing values that are now filled.

    The raw value ``1.0 - remaining / original_nulls`` is clamped to the
    range [0.01, 0.99] and rounded to 4 decimal places, so the result is
    never exactly 0 or 1.

    Args:
        current_df: Object exposing ``isnull().sum().sum()`` (assumed to be a
            pandas DataFrame) whose remaining nulls are counted.
        original_nulls: Number of nulls present before any filling.

    Returns:
        The clamped, rounded score; 0.99 when ``original_nulls`` is 0.
    """
    floor, ceiling = 0.01, 0.99

    # Nothing was missing to begin with: report the best possible score and
    # avoid dividing by zero.
    if original_nulls == 0:
        return ceiling

    still_missing = int(current_df.isnull().sum().sum())
    filled_fraction = 1.0 - still_missing / original_nulls
    bounded = max(floor, min(ceiling, filled_fraction))
    return round(bounded, 4)
def count_errors(current_df) -> int:
    """Return the number of null cells still present in ``current_df``.

    Args:
        current_df: Object exposing ``isnull().sum().sum()`` (assumed to be a
            pandas DataFrame).

    Returns:
        The total null count as a plain ``int``.
    """
    # The stray trailing "|" that followed this expression was a dangling
    # binary operator (a syntax error) and has been removed.
    return int(current_df.isnull().sum().sum())