# ai-code-detection/preprocessing/step3_class_balance_check.py
# joshnavip's picture
# Initial commit: AI code detection project (without binary files)
# b144cb7
import pandas as pd
# Path of the CSV file that main() loads to build the class-balance report.
# It is a relative path, so it is resolved against the current working directory.
DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"
def _print_class_counts(labels, prefix=""):
    """Print one line per distinct label with its count and percentage.

    Args:
        labels: pandas Series holding the label of every row to summarise.
        prefix: Text prepended to each printed line (used for indentation).

    Missing labels are counted as well (``dropna=False``), so the printed
    percentages always add up to 100% of the rows passed in.
    """
    class_names = {0: "Human", 1: "AI"}
    total = len(labels)
    counts = labels.value_counts(dropna=False).sort_index()
    for label, count in counts.items():
        percent = (count / total) * 100
        # Anything that is not 0 or 1 (including a missing label) is reported
        # as "Unknown" instead of being silently presented as AI.
        class_name = class_names.get(label, "Unknown")
        print(f"{prefix}{class_name} ({label}): {count} samples ({percent:.2f}%)")


def main():
    """Print the overall and per-language class balance of the dataset.

    Reads the CSV at ``DATA_PATH``, accepts either a ``Label`` column or the
    longer ``Label (0- HUMAN, 1-AI)`` header (renamed to ``Label``), and
    prints sample counts and percentages for each label value, first for the
    whole dataset and then for every distinct value of ``Language``.

    Raises:
        ValueError: If no label column is found, or if a required column
            (``Label``, ``Language``) is missing.
    """
    df = pd.read_csv(DATA_PATH)

    # -------------------------------
    # Handle label column safely
    # -------------------------------
    if "Label" not in df.columns:
        if "Label (0- HUMAN, 1-AI)" in df.columns:
            df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})
        else:
            raise ValueError("Label column not found in dataset.")

    # -------------------------------
    # Column validation
    # -------------------------------
    required_cols = ["Label", "Language"]
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    print("\n📊 OVERALL CLASS BALANCE\n")
    _print_class_counts(df["Label"])
    print("\nTotal samples:", len(df))

    # -------------------------------
    # Language-wise breakdown
    # -------------------------------
    print("\n🌐 LANGUAGE-WISE CLASS DISTRIBUTION\n")
    languages = df["Language"]
    for lang in languages.unique():
        print(f"Language: {lang}")
        # NaN never compares equal to itself, so rows with a missing language
        # must be selected with isna(); `== lang` would match nothing and
        # report zero samples for them.
        mask = languages.isna() if pd.isna(lang) else languages == lang
        lang_df = df[mask]
        _print_class_counts(lang_df["Label"], prefix=" ")
        print(f" Total {lang} samples: {len(lang_df)}\n")
# Script entry point: build the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()