import pandas as pd DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv" def main(): df = pd.read_csv(DATA_PATH) # ------------------------------- # Handle label column safely # ------------------------------- if "Label" not in df.columns: if "Label (0- HUMAN, 1-AI)" in df.columns: df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"}) else: raise ValueError("Label column not found in dataset.") # ------------------------------- # Column validation # ------------------------------- required_cols = ["Label", "Language"] missing = [c for c in required_cols if c not in df.columns] if missing: raise ValueError(f"Missing required columns: {missing}") total = len(df) print("\nšŸ“Š OVERALL CLASS BALANCE\n") class_counts = df["Label"].value_counts().sort_index() for label, count in class_counts.items(): percent = (count / total) * 100 class_name = "Human" if label == 0 else "AI" print(f"{class_name} ({label}): {count} samples ({percent:.2f}%)") print("\nTotal samples:", total) # ------------------------------- # Language-wise breakdown # ------------------------------- print("\n🌐 LANGUAGE-WISE CLASS DISTRIBUTION\n") for lang in df["Language"].unique(): print(f"Language: {lang}") lang_df = df[df["Language"] == lang] lang_total = len(lang_df) lang_counts = lang_df["Label"].value_counts().sort_index() for label, count in lang_counts.items(): percent = (count / lang_total) * 100 class_name = "Human" if label == 0 else "AI" print(f" {class_name} ({label}): {count} samples ({percent:.2f}%)") print(f" Total {lang} samples: {lang_total}\n") if __name__ == "__main__": main()