Spaces:
Runtime error
Runtime error
| import pandas as pd | |
# Step 1 output (length-normalized snippets) that this step reads.
INPUT_PATH = "dataset/processed/dataset_step1_length_normalized.csv"

# Destination for the deduplicated, sanity-checked dataset this step writes.
OUTPUT_PATH = "dataset/processed/dataset_step2_cleaned.csv"
def is_non_empty_code(code):
    """Return True when *code* is a string with at least one non-whitespace character.

    Any non-string value (for example ``None`` or a float NaN read from a
    CSV) is reported as not valid.
    """
    if not isinstance(code, str):
        return False
    return code.strip() != ""
def main(input_path=None, output_path=None) -> None:
    """Run Step 2 of the dataset pipeline: normalize, deduplicate, sanity-check.

    The CSV at *input_path* is loaded, the optional ``Language`` column is
    stripped and lower-cased, rows with a duplicate ``normalized_code`` are
    dropped (first occurrence kept), rows whose ``normalized_code`` is not a
    non-empty string are dropped, and the result is written to *output_path*
    without the index. Progress is reported with ``print``.

    Args:
        input_path: CSV to read. Defaults to the module-level ``INPUT_PATH``.
        output_path: CSV to write. Defaults to the module-level ``OUTPUT_PATH``.

    Raises:
        KeyError: If the input has no ``normalized_code`` column.
    """
    # Resolve defaults at call time so calling main() with no arguments
    # behaves exactly as before.
    if input_path is None:
        input_path = INPUT_PATH
    if output_path is None:
        output_path = OUTPUT_PATH

    print("Loading Step 1 dataset...")
    df = pd.read_csv(input_path)
    print(f"Initial dataset size: {len(df)} rows")

    if "normalized_code" not in df.columns:
        raise KeyError(
            f"Required column 'normalized_code' not found in {input_path!r}"
        )

    # Normalize language column
    if "Language" in df.columns:
        language = df["Language"]
        normalized = language.astype(str).str.strip().str.lower()
        # astype(str) would turn missing values into the literal text "nan";
        # keep them missing instead of inventing a fake language label.
        df["Language"] = normalized.where(language.notna())

    # Deduplication
    print("Removing duplicate code snippets...")
    before = len(df)
    df = df.drop_duplicates(subset=["normalized_code"]).reset_index(drop=True)
    print(f"Removed {before - len(df)} duplicate rows")

    # Minimal sanity check ONLY
    print("Applying minimal sanity check...")
    # Use a standalone boolean mask rather than a temporary "is_valid" column,
    # so a real column with that name in the dataset is never overwritten or
    # dropped. astype(bool) keeps the mask boolean even for an empty frame.
    valid_mask = df["normalized_code"].apply(is_non_empty_code).astype(bool)
    before = len(df)
    df = df[valid_mask].reset_index(drop=True)
    print(f"Removed {before - len(df)} empty rows")

    print(f"Final dataset size after Step 2: {len(df)} rows")
    print("Saving cleaned dataset...")
    df.to_csv(output_path, index=False)
    print("Step 2 completed successfully ✅")
# Run the cleaning step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()