import pandas as pd

DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"

# Header some exports use for the label column; it is renamed to "Label".
_RAW_LABEL_COLUMN = "Label (0- HUMAN, 1-AI)"


def _prepare_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with the label column renamed and Language normalised.

    * If there is no ``Label`` column but the long-form header
      ``"Label (0- HUMAN, 1-AI)"`` exists, it is renamed to ``Label``.
      If neither exists, a warning is printed and the frame is left as is.
    * If a ``Language`` column exists, its values are converted to text,
      stripped of surrounding whitespace and lower-cased. Missing values
      stay missing.
    """
    if "Label" not in df.columns:
        if _RAW_LABEL_COLUMN in df.columns:
            df = df.rename(columns={_RAW_LABEL_COLUMN: "Label"})
        else:
            print("⚠️ Label column not found. Skipping label-based checks.")

    if "Language" in df.columns:
        raw_language = df["Language"]
        normalised = raw_language.astype(str).str.strip().str.lower()
        # astype(str) turns missing values into the literal text "nan";
        # put the missing marker back so they are not counted as a language.
        df["Language"] = normalised.where(raw_language.notna())

    return df


def main(data_path: str = DATA_PATH) -> None:
    """Print a metadata consistency report for the dataset CSV.

    Args:
        data_path: Path of the CSV file to inspect. Defaults to ``DATA_PATH``.

    The report contains the distribution of the normalised ``Language``
    column (missing values are listed as their own row) and a cross
    tabulation of ``Label`` against ``Source_Type``. A section whose
    columns are absent prints a short notice instead of failing.

    Raises:
        Whatever ``pandas.read_csv`` raises for an unreadable *data_path*
        (for example ``FileNotFoundError``).
    """
    df = _prepare_frame(pd.read_csv(data_path))

    print("\n🧩 METADATA CONSISTENCY CHECK\n")

    # Language distribution
    print("🌐 Language distribution:")
    if "Language" in df.columns:
        # dropna=False keeps rows without a language visible in the report.
        print(df["Language"].value_counts(dropna=False))
    else:
        print("  Language column not found")

    # Label vs Source_Type cross tabulation
    print("\n🏷️ Label vs Source_Type consistency:")
    if "Label" in df.columns and "Source_Type" in df.columns:
        print(pd.crosstab(df["Label"], df["Source_Type"]))
        print("\nExpected behavior:")
        print("  Label 0 (Human) → Source_Type should be 'human'")
        print("  Label 1 (AI)    → Source_Type should be 'ai'")
    else:
        print("  Required columns not found for consistency check")

    print("\nMetadata consistency check completed ✅")


if __name__ == "__main__":
    main()