"""Step 2 of the dataset pipeline: deduplicate and sanity-check code snippets.

Reads the CSV at ``INPUT_PATH``, lower-cases the optional ``Language`` column,
drops rows whose ``normalized_code`` duplicates an earlier row, drops rows
whose ``normalized_code`` is not a non-blank string, and writes the result to
``OUTPUT_PATH``.
"""
import pandas as pd

# Input produced by the previous step and output consumed by the next one.
INPUT_PATH = "dataset/processed/dataset_step1_length_normalized.csv"
OUTPUT_PATH = "dataset/processed/dataset_step2_cleaned.csv"


def is_non_empty_code(code: object) -> bool:
    """Return True when *code* is a string with at least one non-whitespace character.

    Non-string values (for example the float NaN pandas uses for missing
    cells) are reported as invalid rather than raising.
    """
    return isinstance(code, str) and bool(code.strip())


def main() -> None:
    """Run the Step 2 cleaning pass from ``INPUT_PATH`` to ``OUTPUT_PATH``.

    Raises:
        KeyError: if the input has no ``normalized_code`` column.
    """
    print("Loading Step 1 dataset...")
    df = pd.read_csv(INPUT_PATH)
    print(f"Initial dataset size: {len(df)} rows")

    # Fail early with a readable message instead of an opaque KeyError
    # raised from inside drop_duplicates.
    if "normalized_code" not in df.columns:
        raise KeyError(
            f"Required column 'normalized_code' not found in {INPUT_PATH}"
        )

    # Normalize language column
    if "Language" in df.columns:
        language = df["Language"]
        normalized = language.astype(str).str.strip().str.lower()
        # astype(str) turns missing values into the literal text "nan";
        # restore them as missing so they are not saved as a fake language.
        df["Language"] = normalized.where(language.notna())

    # Deduplication (keeps the first occurrence of each snippet)
    print("Removing duplicate code snippets...")
    rows_before = len(df)
    df = df.drop_duplicates(subset=["normalized_code"]).reset_index(drop=True)
    print(f"Removed {rows_before - len(df)} duplicate rows")

    # Minimal sanity check ONLY
    print("Applying minimal sanity check...")
    # The explicit bool cast keeps the mask a boolean indexer even when the
    # frame is empty and apply() returns a non-boolean dtype.
    valid_mask = df["normalized_code"].apply(is_non_empty_code).astype(bool)
    rows_before = len(df)
    df = df.loc[valid_mask].reset_index(drop=True)
    print(f"Removed {rows_before - len(df)} empty rows")

    print(f"Final dataset size after Step 2: {len(df)} rows")

    print("Saving cleaned dataset...")
    df.to_csv(OUTPUT_PATH, index=False)
    print("Step 2 completed successfully ✅")


if __name__ == "__main__":
    main()