ai-code-detection / preprocessing /step2_duplicates_syntax.py
joshnavip's picture
Initial commit: AI code detection project (without binary files)
b144cb7
import pandas as pd
# CSV read by this step; the path is relative to the current working directory.
INPUT_PATH = "dataset/processed/dataset_step1_length_normalized.csv"
# CSV written by this step once duplicates and empty snippets are removed.
OUTPUT_PATH = "dataset/processed/dataset_step2_cleaned.csv"
def is_non_empty_code(code):
    """Report whether ``code`` is a string with non-whitespace content.

    Args:
        code: Any value; typically one cell of a code column.

    Returns:
        ``True`` only when ``code`` is a ``str`` that still has characters
        after stripping surrounding whitespace. Non-string values (such as
        ``None`` or a float NaN) yield ``False``.
    """
    # Guard first: only real strings can be stripped and inspected.
    if not isinstance(code, str):
        return False
    return code.strip() != ""
def main():
    """Run Step 2 of preprocessing: deduplicate and drop empty snippets.

    Reads the CSV at ``INPUT_PATH``, strips and lower-cases the optional
    ``Language`` column, removes rows whose ``normalized_code`` repeats an
    earlier row, removes rows whose ``normalized_code`` is not a non-empty
    string, and writes the result to ``OUTPUT_PATH`` without the index.
    Progress is reported with ``print``.

    Raises:
        KeyError: If the input has no ``normalized_code`` column.
    """
    print("Loading Step 1 dataset...")
    df = pd.read_csv(INPUT_PATH)
    print(f"Initial dataset size: {len(df)} rows")

    # Fail early with a readable message instead of an opaque KeyError
    # raised from inside drop_duplicates.
    if "normalized_code" not in df.columns:
        raise KeyError(
            f"Required column 'normalized_code' not found in {INPUT_PATH}"
        )

    # Normalize language column
    if "Language" in df.columns:
        language = df["Language"]
        cleaned = language.astype(str).str.strip().str.lower()
        # astype(str) turns missing values into the literal string "nan";
        # keep those cells missing instead of inventing a fake language.
        df["Language"] = cleaned.where(language.notna())

    # Deduplication (keeps the first occurrence of each normalized snippet)
    print("Removing duplicate code snippets...")
    rows_before = len(df)
    df = df.drop_duplicates(subset=["normalized_code"]).reset_index(drop=True)
    print(f"Removed {rows_before - len(df)} duplicate rows")

    # Minimal sanity check ONLY
    print("Applying minimal sanity check...")
    rows_before = len(df)
    # Use a standalone boolean mask rather than a temporary "is_valid"
    # column, so a real dataset column with that name is never overwritten
    # and dropped. The bool cast keeps the mask a row filter even when the
    # frame is empty (apply() then returns a non-boolean empty Series).
    valid_mask = df["normalized_code"].apply(is_non_empty_code).astype(bool)
    df = df[valid_mask].reset_index(drop=True)
    print(f"Removed {rows_before - len(df)} empty rows")

    print(f"Final dataset size after Step 2: {len(df)} rows")
    print("Saving cleaned dataset...")
    df.to_csv(OUTPUT_PATH, index=False)
    print("Step 2 completed successfully ✅")
# Run the cleaning step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()