ai-code-detection / preprocessing /step5_metadata_consistency_check.py
joshnavip's picture
Initial commit: AI code detection project (without binary files)
b144cb7
import pandas as pd
DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"
def main(data_path=None) -> None:
    """Print a metadata consistency report for the dataset CSV.

    The report contains:

    * the distribution of the ``Language`` column after trimming whitespace
      and lower-casing (missing values are listed as their own row),
    * a cross-tabulation of ``Label`` against the raw ``Source_Type`` values,
    * the number of rows whose ``Source_Type`` contradicts the expected
      mapping (label 0 -> 'human', label 1 -> 'ai').

    Args:
        data_path: Path of the CSV file to inspect. When omitted, the
            module-level ``DATA_PATH`` is used.

    Raises:
        FileNotFoundError: Propagated from ``pandas.read_csv`` when the file
            does not exist.
    """
    df = pd.read_csv(DATA_PATH if data_path is None else data_path)

    # -------------------------------
    # Handle Label column safely
    # -------------------------------
    # Some exports carry the descriptive header instead of plain "Label".
    legacy_label = "Label (0- HUMAN, 1-AI)"
    if "Label" not in df.columns:
        if legacy_label in df.columns:
            df = df.rename(columns={legacy_label: "Label"})
        else:
            print("⚠️ Label column not found. Skipping label-based checks.")

    # -------------------------------
    # Normalize Language column
    # -------------------------------
    has_language = "Language" in df.columns
    if has_language:
        raw_language = df["Language"]
        normalized = raw_language.astype(str).str.strip().str.lower()
        # Keep missing values missing: stringifying them would report the
        # placeholder text as if it were a real language.
        df["Language"] = normalized.where(raw_language.notna())

    print("\n🧩 METADATA CONSISTENCY CHECK\n")

    # -------------------------------
    # Language consistency
    # -------------------------------
    print("🌐 Language distribution:")
    if has_language:
        # dropna=False so rows without a language stay visible in the report.
        print(df["Language"].value_counts(dropna=False))
    else:
        print(" Language column not found")

    # -------------------------------
    # Label vs Source_Type consistency
    # -------------------------------
    print("\n🏷️ Label vs Source_Type consistency:")
    if "Label" in df.columns and "Source_Type" in df.columns:
        # The table uses the raw values so spelling variants remain visible.
        print(pd.crosstab(df["Label"], df["Source_Type"]))
        print("\nExpected behavior:")
        print(" Label 0 (Human) → Source_Type should be 'human'")
        print(" Label 1 (AI) → Source_Type should be 'ai'")

        # Actually verify the mapping instead of leaving it to the reader.
        # Source_Type is compared case- and whitespace-insensitively; labels
        # other than 0/1 (or non-numeric) cannot be verified and are skipped.
        label = pd.to_numeric(df["Label"], errors="coerce")
        source_type = df["Source_Type"].astype(str).str.strip().str.lower()
        violates_human = (label == 0) & ~(source_type == "human")
        violates_ai = (label == 1) & ~(source_type == "ai")
        violations = int((violates_human | violates_ai).sum())
        print(
            "\nRows violating the expected Label/Source_Type mapping: "
            f"{violations}"
        )
    else:
        print(" Required columns not found for consistency check")

    print("\nMetadata consistency check completed ✅")
# Script entry point: produce the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()