Spaces:
Runtime error
Runtime error
| import pandas as pd | |
# Default CSV that main() loads with pandas.read_csv; the path is relative,
# so it is resolved against the current working directory.
DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"
| def main(): | |
| df = pd.read_csv(DATA_PATH) | |
| # ------------------------------- | |
| # Handle label column safely | |
| # ------------------------------- | |
| if "Label" not in df.columns: | |
| if "Label (0- HUMAN, 1-AI)" in df.columns: | |
| df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"}) | |
| else: | |
| raise ValueError("Label column not found in dataset.") | |
| # ------------------------------- | |
| # Column validation | |
| # ------------------------------- | |
| required_cols = ["Label", "Language"] | |
| missing = [c for c in required_cols if c not in df.columns] | |
| if missing: | |
| raise ValueError(f"Missing required columns: {missing}") | |
| total = len(df) | |
| print("\n📊 OVERALL CLASS BALANCE\n") | |
| class_counts = df["Label"].value_counts().sort_index() | |
| for label, count in class_counts.items(): | |
| percent = (count / total) * 100 | |
| class_name = "Human" if label == 0 else "AI" | |
| print(f"{class_name} ({label}): {count} samples ({percent:.2f}%)") | |
| print("\nTotal samples:", total) | |
| # ------------------------------- | |
| # Language-wise breakdown | |
| # ------------------------------- | |
| print("\n🌐 LANGUAGE-WISE CLASS DISTRIBUTION\n") | |
| for lang in df["Language"].unique(): | |
| print(f"Language: {lang}") | |
| lang_df = df[df["Language"] == lang] | |
| lang_total = len(lang_df) | |
| lang_counts = lang_df["Label"].value_counts().sort_index() | |
| for label, count in lang_counts.items(): | |
| percent = (count / lang_total) * 100 | |
| class_name = "Human" if label == 0 else "AI" | |
| print(f" {class_name} ({label}): {count} samples ({percent:.2f}%)") | |
| print(f" Total {lang} samples: {lang_total}\n") | |
# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()