File size: 1,293 Bytes
b144cb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import pandas as pd

# CSV that main() loads as the input of this step. The path is relative, so it
# is resolved against the current working directory.
INPUT_PATH = "dataset/processed/dataset_step1_length_normalized.csv"
# CSV that main() writes once duplicate and empty snippets have been removed.
OUTPUT_PATH = "dataset/processed/dataset_step2_cleaned.csv"

def is_non_empty_code(code):
    """Tell whether *code* is a string holding at least one non-whitespace character.

    Any non-string value (for example ``None`` or a float NaN) is reported as
    ``False``.
    """
    if not isinstance(code, str):
        return False
    # An all-whitespace string strips down to "", which is falsy.
    return bool(code.strip())

def main():
    """Run the Step 2 cleaning pass and write the result to ``OUTPUT_PATH``.

    The CSV at ``INPUT_PATH`` is loaded, the optional ``Language`` column is
    stripped and lower-cased, rows whose ``normalized_code`` repeats an
    earlier row are dropped, rows whose ``normalized_code`` fails
    ``is_non_empty_code`` are dropped, and the remaining rows are saved
    without the index. Progress is printed to stdout.

    Raises:
        KeyError: If the input has no ``normalized_code`` column.
    """
    print("Loading Step 1 dataset...")
    df = pd.read_csv(INPUT_PATH)
    print(f"Initial dataset size: {len(df)} rows")

    # Fail early with a readable message instead of the bare KeyError that
    # drop_duplicates would raise further down.
    if "normalized_code" not in df.columns:
        raise KeyError(
            f"'normalized_code' column not found in {INPUT_PATH}; "
            f"available columns: {list(df.columns)}"
        )

    # Normalize language column
    if "Language" in df.columns:
        language = df["Language"]
        cleaned = language.astype(str).str.strip().str.lower()
        # astype(str) renders missing values as the literal text "nan"; mask
        # them back to missing so they are not mistaken for a language name.
        df["Language"] = cleaned.where(language.notna())

    # Deduplication
    print("Removing duplicate code snippets...")
    before = len(df)
    df = df.drop_duplicates(subset=["normalized_code"]).reset_index(drop=True)
    print(f"Removed {before - len(df)} duplicate rows")

    # Minimal sanity check ONLY
    print("Applying minimal sanity check...")
    # Keep the mask outside the frame so an input column that happens to be
    # called "is_valid" is neither overwritten nor dropped. astype(bool)
    # guarantees a boolean indexer even when the frame is empty.
    valid_mask = df["normalized_code"].apply(is_non_empty_code).astype(bool)

    before = len(df)
    df = df[valid_mask].reset_index(drop=True)
    print(f"Removed {before - len(df)} empty rows")

    print(f"Final dataset size after Step 2: {len(df)} rows")

    print("Saving cleaned dataset...")
    df.to_csv(OUTPUT_PATH, index=False)

    print("Step 2 completed successfully ✅")

# Run the cleaning step only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()