"""Split the cleaned dataset into train / validation / test CSV files.

AI-labelled rows (Label == 1) are split with ``GroupShuffleSplit`` keyed on
``Generation_Prompt`` so that one prompt never appears in more than one
split.  Human-labelled rows (Label == 0) are split with a plain shuffled
``train_test_split``.  The per-class pieces are then concatenated, shuffled
and written to ``OUTPUT_DIR``.
"""
import os

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, train_test_split

DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"
OUTPUT_DIR = "dataset/processed"

TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15
RANDOM_STATE = 42


def _load_dataset():
    """Read DATA_PATH, standardize column names and validate the schema.

    Raises:
        KeyError: if ``Label`` or ``Generation_Prompt`` is absent.
    """
    df = pd.read_csv(DATA_PATH)

    # Standardize Label column
    if "Label (0- HUMAN, 1-AI)" in df.columns:
        df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})

    # Fail early with an actionable message instead of a bare KeyError
    # raised from somewhere in the middle of the split logic.
    missing = [
        column
        for column in ("Label", "Generation_Prompt")
        if column not in df.columns
    ]
    if missing:
        raise KeyError(
            f"{DATA_PATH} is missing required column(s): {missing}"
        )

    # Normalize Language column (for reporting)
    if "Language" in df.columns:
        df["Language"] = df["Language"].astype(str).str.strip().str.lower()

    return df


def _split_ai(ai_df, val_fraction):
    """Group-aware train/val/test split of the AI rows by Generation_Prompt."""
    gss = GroupShuffleSplit(
        n_splits=1,
        train_size=TRAIN_SIZE,
        random_state=RANDOM_STATE,
    )
    train_idx, temp_idx = next(
        gss.split(ai_df, groups=ai_df["Generation_Prompt"])
    )
    ai_train = ai_df.iloc[train_idx]
    ai_temp = ai_df.iloc[temp_idx]

    # Second pass divides the held-out remainder into validation and test.
    gss_val = GroupShuffleSplit(
        n_splits=1,
        train_size=val_fraction,
        random_state=RANDOM_STATE,
    )
    val_idx, test_idx = next(
        gss_val.split(ai_temp, groups=ai_temp["Generation_Prompt"])
    )
    return ai_train, ai_temp.iloc[val_idx], ai_temp.iloc[test_idx]


def _split_human(human_df, val_fraction):
    """Shuffled train/val/test split of the human rows (no grouping)."""
    human_train, human_temp = train_test_split(
        human_df,
        train_size=TRAIN_SIZE,
        random_state=RANDOM_STATE,
        shuffle=True,
    )
    human_val, human_test = train_test_split(
        human_temp,
        train_size=val_fraction,
        random_state=RANDOM_STATE,
        shuffle=True,
    )
    return human_train, human_val, human_test


def _combine(ai_part, human_part):
    """Concatenate the two class pieces and shuffle the rows reproducibly."""
    return pd.concat([ai_part, human_part]).sample(
        frac=1, random_state=RANDOM_STATE
    )


def main():
    """Build the train/val/test splits, save them as CSV and print a report.

    Raises:
        KeyError: if the input file lacks a required column.
        ValueError: if the input has no human rows or no AI rows.
        RuntimeError: if any resulting split does not contain both classes.
    """
    df = _load_dataset()

    # Split by class
    human_df = df[df["Label"] == 0].copy()
    ai_df = df[df["Label"] == 1].copy()
    for class_name, class_df in (("human (Label == 0)", human_df),
                                 ("AI (Label == 1)", ai_df)):
        if class_df.empty:
            raise ValueError(
                f"No {class_name} rows found in {DATA_PATH}; cannot split."
            )

    # Share of the non-training remainder that goes to validation.
    val_fraction = VAL_SIZE / (VAL_SIZE + TEST_SIZE)

    # -----------------------------
    # AI split (group-aware)
    # -----------------------------
    # Rows without a prompt all share one placeholder group.
    ai_df["Generation_Prompt"] = ai_df["Generation_Prompt"].fillna(
        "UNKNOWN_AI_PROMPT"
    )
    ai_train, ai_val, ai_test = _split_ai(ai_df, val_fraction)

    # -----------------------------
    # Human split (no grouping)
    # -----------------------------
    human_train, human_val, human_test = _split_human(human_df, val_fraction)

    # -----------------------------
    # Combine & shuffle
    # -----------------------------
    train_df = _combine(ai_train, human_train)
    val_df = _combine(ai_val, human_val)
    test_df = _combine(ai_test, human_test)

    # -----------------------------
    # Safety checks
    # -----------------------------
    for name, split in [("Train", train_df), ("Val", val_df), ("Test", test_df)]:
        labels = set(split["Label"].unique())
        if labels != {0, 1}:
            raise RuntimeError(f"{name} split missing a class: {labels}")

    # -----------------------------
    # Save
    # -----------------------------
    # Make sure the destination exists so a fresh checkout does not fail here.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    train_df.to_csv(f"{OUTPUT_DIR}/dataset_train.csv", index=False)
    val_df.to_csv(f"{OUTPUT_DIR}/dataset_val.csv", index=False)
    test_df.to_csv(f"{OUTPUT_DIR}/dataset_test.csv", index=False)

    # -----------------------------
    # Reporting
    # -----------------------------
    print("\n📊 Final Class Distribution")
    print("Train:\n", train_df["Label"].value_counts())
    print("Val:\n", val_df["Label"].value_counts())
    print("Test:\n", test_df["Label"].value_counts())

    if "Language" in df.columns:
        print("\n🌐 Language Distribution (Train)")
        print(train_df["Language"].value_counts())


if __name__ == "__main__":
    main()