"""Print a sanity-check report for the cleaned code dataset.

The report covers null counts in the critical columns, empty / very short
code samples, the set of labels, line-count statistics and a per-language
summary. Everything is written to stdout; nothing is modified on disk.
"""

import pandas as pd

DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"

# Alternate header under which the label column may appear.
_RAW_LABEL_COLUMN = "Label (0- HUMAN, 1-AI)"

# Samples with fewer lines than this are reported as "very short".
_SHORT_CODE_LINES = 3


def _code_as_text(code):
    """Return *code* as strings, with missing values replaced by ""."""
    # astype(str) alone renders NaN as the literal "nan" on some pandas
    # versions and keeps it missing on others; masking afterwards makes the
    # result the same everywhere.
    return code.astype(str).where(code.notna(), "")


def _sorted_labels(labels):
    """Return the distinct non-null values of *labels* as a sorted list."""
    # tolist() yields plain Python scalars so the printed list is readable.
    values = labels.dropna().unique().tolist()
    try:
        return sorted(values)
    except TypeError:
        # Mixed types (e.g. ints and strings) cannot be compared directly;
        # fall back to a deterministic textual order.
        return sorted(values, key=str)


def main(data_path=None) -> None:
    """Load the dataset and print the sanity-check report.

    Args:
        data_path: CSV file to inspect. Defaults to ``DATA_PATH`` when None.
    """
    df = pd.read_csv(DATA_PATH if data_path is None else data_path)

    # -------------------------------
    # Handle label column safely
    # -------------------------------
    if "Label" not in df.columns:
        if _RAW_LABEL_COLUMN in df.columns:
            df = df.rename(columns={_RAW_LABEL_COLUMN: "Label"})
        else:
            print("⚠️ Label column not found (labels may be added later).")

    print("\n🧪 SANITY CHECK REPORT\n")
    print(f"Total samples: {len(df)}")
    print(f"Total columns: {len(df.columns)}\n")

    # -------------------------------
    # Critical column null checks
    # -------------------------------
    print("🔍 Null value check (critical columns):")
    critical_cols = ["normalized_code", "Language"]
    if "Label" in df.columns:
        critical_cols.append("Label")

    for col in critical_cols:
        if col in df.columns:
            null_count = int(df[col].isna().sum())
            print(f" {col}: {null_count} null values")
        else:
            print(f" {col}: COLUMN NOT FOUND")

    # -------------------------------
    # Code quality checks
    # -------------------------------
    print("\n🧾 Code quality check:")
    is_short = None
    if "normalized_code" in df.columns:
        code = df["normalized_code"]
        text = _code_as_text(code)
        # Null rows are already reported by the null check above, so only
        # rows that actually hold blank text count as "empty" here.
        empty_code = int((code.notna() & text.str.strip().eq("")).sum())
        # Number of lines == number of newline characters + 1.
        line_counts = text.str.count("\n") + 1
        is_short = line_counts < _SHORT_CODE_LINES
        print(f" Empty normalized_code rows: {empty_code}")
        print(f" Very short code (<3 lines): {int(is_short.sum())}")
    else:
        print(" normalized_code column not found.")

    # -------------------------------
    # Label sanity
    # -------------------------------
    if "Label" in df.columns:
        unique_labels = _sorted_labels(df["Label"])
        print("\n🏷️ Label check:")
        print(f" Unique labels found: {unique_labels}")

    # -------------------------------
    # Line count statistics
    # -------------------------------
    print("\n📏 Line count statistics:")
    if "original_line_count" in df.columns and "normalized_line_count" in df.columns:
        print(df[["original_line_count", "normalized_line_count"]].describe())
    else:
        print(" Line count columns not found.")

    # -------------------------------
    # Language-wise sanity summary
    # -------------------------------
    print("\n🌍 Language-wise summary:")
    if "Language" in df.columns:
        languages = df["Language"]
        for lang in languages.unique():
            # NaN never compares equal to itself, so rows with a missing
            # language have to be selected with isna() instead of ==.
            in_lang = languages.isna() if pd.isna(lang) else languages == lang
            print(f"\nLanguage: {lang}")
            print(f" Samples: {int(in_lang.sum())}")
            if is_short is not None:
                short = int((in_lang & is_short).sum())
                print(f" Very short code (<3 lines): {short}")
    else:
        print(" Language column not found.")

    print("\nSanity check completed successfully ✅")


if __name__ == "__main__":
    main()