# ai-code-detection/preprocessing/step4_sanity_check.py
# Author (from repository page): joshnavip
# Initial commit: AI code detection project (without binary files)
# Commit: b144cb7
import pandas as pd
# CSV file that main() loads for the sanity check; being a relative path, it is
# resolved against the current working directory.
DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"
def main(data_path=None):
    """Print a sanity-check report for a processed dataset CSV.

    The report covers: sample/column totals, null counts in the critical
    columns, empty and very short ``normalized_code`` rows, the set of
    labels, line-count statistics, and a per-language summary.

    Sections whose required column is absent are reported as such and
    skipped instead of aborting the whole report.

    Args:
        data_path: Path of the CSV to inspect. When ``None`` (the default)
            the module-level ``DATA_PATH`` is used.

    Raises:
        Any error raised by ``pd.read_csv`` (for example a missing file)
        propagates to the caller.
    """
    df = pd.read_csv(DATA_PATH if data_path is None else data_path)

    # -------------------------------
    # Handle label column safely
    # -------------------------------
    if "Label" not in df.columns:
        if "Label (0- HUMAN, 1-AI)" in df.columns:
            df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})
        else:
            print("⚠️ Label column not found (labels may be added later).")

    print("\n🧪 SANITY CHECK REPORT\n")
    print(f"Total samples: {len(df)}")
    print(f"Total columns: {len(df.columns)}\n")

    # -------------------------------
    # Critical column null checks
    # -------------------------------
    print("🔍 Null value check (critical columns):")
    critical_cols = ["normalized_code", "Language"]
    if "Label" in df.columns:
        critical_cols.append("Label")
    for col in critical_cols:
        if col in df.columns:
            null_count = df[col].isnull().sum()
            print(f" {col}: {null_count} null values")
        else:
            print(f" {col}: COLUMN NOT FOUND")

    # -------------------------------
    # Code quality checks
    # -------------------------------
    print("\n🧾 Code quality check:")
    line_counts = None
    if "normalized_code" in df.columns:
        code = df["normalized_code"].astype(str)
        empty_code = code.str.strip().eq("").sum()
        # Lines = newlines + 1 (same as len(split("\n"))).  Counting instead
        # of splitting keeps null rows from raising; they count as one line.
        line_counts = code.str.count("\n").fillna(0).astype(int) + 1
        print(f" Empty normalized_code rows: {empty_code}")
        print(f" Very short code (<3 lines): {(line_counts < 3).sum()}")
    else:
        print(" normalized_code column not found.")

    # -------------------------------
    # Label sanity
    # -------------------------------
    if "Label" in df.columns:
        # tolist() yields plain Python scalars so the printed list stays
        # readable regardless of the NumPy scalar repr.
        labels = df["Label"].unique().tolist()
        try:
            unique_labels = sorted(labels)
        except TypeError:
            # Mixed label types (e.g. int and str) are not orderable.
            unique_labels = sorted(labels, key=str)
        print("\n🏷️ Label check:")
        print(f" Unique labels found: {unique_labels}")

    # -------------------------------
    # Line count statistics
    # -------------------------------
    print("\n📏 Line count statistics:")
    if "original_line_count" in df.columns and "normalized_line_count" in df.columns:
        print(df[["original_line_count", "normalized_line_count"]].describe())
    else:
        print(" Line count columns not found.")

    # -------------------------------
    # Language-wise sanity summary
    # -------------------------------
    print("\n🌐 Language-wise summary:")
    if "Language" in df.columns:
        languages = df["Language"]
        for lang in languages.unique():
            # NaN never compares equal to itself, so a missing language needs
            # an explicit isna() mask to select its rows.
            mask = languages.isna() if pd.isna(lang) else languages == lang
            print(f"\nLanguage: {lang}")
            print(f" Samples: {int(mask.sum())}")
            if line_counts is not None:
                # Reuse the null-safe global line counts instead of
                # re-splitting the code for every language.
                short = (line_counts[mask] < 3).sum()
                print(f" Very short code (<3 lines): {short}")
    else:
        print(" Language column not found.")

    print("\nSanity check completed successfully ✅")
# Run the report only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()