Spaces:
Runtime error
Runtime error
from pathlib import Path

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, train_test_split
# Input CSV that main() loads and splits.
DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"
# Directory that receives dataset_train.csv, dataset_val.csv and
# dataset_test.csv.
OUTPUT_DIR = "dataset/processed"

# Target split fractions. TRAIN_SIZE is applied first; the remainder is then
# divided between validation and test in the ratio VAL_SIZE : TEST_SIZE.
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15

# Single seed shared by every split and shuffle so reruns are reproducible.
RANDOM_STATE = 42
def _split_ai_by_prompt(ai_df: pd.DataFrame, val_fraction: float):
    """Split AI rows into (train, val, test) without sharing a prompt.

    ``ai_df`` must have a fully populated ``Generation_Prompt`` column, which
    is used as the grouping key. ``val_fraction`` is the share of the
    non-train remainder that goes to validation; the rest becomes test.
    """
    # NOTE: GroupShuffleSplit interprets a float train_size as a proportion of
    # *groups*, not rows, so row-level ratios only approximate the targets
    # when prompts have uneven numbers of samples.
    train_splitter = GroupShuffleSplit(
        n_splits=1,
        train_size=TRAIN_SIZE,
        random_state=RANDOM_STATE,
    )
    train_idx, rest_idx = next(
        train_splitter.split(ai_df, groups=ai_df["Generation_Prompt"])
    )
    ai_train = ai_df.iloc[train_idx]
    ai_rest = ai_df.iloc[rest_idx]

    val_splitter = GroupShuffleSplit(
        n_splits=1,
        train_size=val_fraction,
        random_state=RANDOM_STATE,
    )
    val_idx, test_idx = next(
        val_splitter.split(ai_rest, groups=ai_rest["Generation_Prompt"])
    )
    return ai_train, ai_rest.iloc[val_idx], ai_rest.iloc[test_idx]


def _split_human_rows(human_df: pd.DataFrame, val_fraction: float):
    """Randomly split human rows into (train, val, test); no grouping."""
    human_train, human_rest = train_test_split(
        human_df,
        train_size=TRAIN_SIZE,
        random_state=RANDOM_STATE,
        shuffle=True,
    )
    human_val, human_test = train_test_split(
        human_rest,
        train_size=val_fraction,
        random_state=RANDOM_STATE,
        shuffle=True,
    )
    return human_train, human_val, human_test


def main() -> None:
    """Split the cleaned dataset into train/val/test CSVs.

    Reads ``DATA_PATH``, splits AI rows (Label == 1) group-wise by
    ``Generation_Prompt`` and human rows (Label == 0) randomly, then writes
    ``dataset_train.csv``, ``dataset_val.csv`` and ``dataset_test.csv`` into
    ``OUTPUT_DIR`` and prints the class (and, if present, language)
    distribution.

    Raises:
        KeyError: if the ``Label`` or ``Generation_Prompt`` column is missing.
        ValueError: if either class has no rows (also raised by scikit-learn
            when a split cannot be formed, e.g. too few prompt groups).
        RuntimeError: if any resulting split lacks one of the two classes.
    """
    df = pd.read_csv(DATA_PATH)

    # Standardize Label column
    if "Label (0- HUMAN, 1-AI)" in df.columns:
        df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})

    # Fail early with a readable message instead of a bare KeyError later on.
    missing = [
        col for col in ("Label", "Generation_Prompt") if col not in df.columns
    ]
    if missing:
        raise KeyError(f"{DATA_PATH} is missing required column(s): {missing}")

    # Normalize Language column (for reporting)
    if "Language" in df.columns:
        df["Language"] = df["Language"].astype(str).str.strip().str.lower()

    # Split by class
    human_df = df[df["Label"] == 0].copy()
    ai_df = df[df["Label"] == 1].copy()

    # Rows with any other label value end up in neither frame; make that
    # visible rather than dropping them silently.
    dropped = len(df) - len(human_df) - len(ai_df)
    if dropped:
        print(f"Warning: ignoring {dropped} row(s) whose Label is neither 0 nor 1")

    if human_df.empty or ai_df.empty:
        raise ValueError(
            "Both classes are required to split: "
            f"{len(human_df)} human row(s), {len(ai_df)} AI row(s)"
        )

    # Share of the non-train remainder that goes to validation.
    val_fraction = VAL_SIZE / (VAL_SIZE + TEST_SIZE)

    # -----------------------------
    # AI split (group-aware)
    # -----------------------------
    # AI rows without a recorded prompt are pooled into one placeholder group
    # so they still travel together into a single split.
    ai_df["Generation_Prompt"] = ai_df["Generation_Prompt"].fillna(
        "UNKNOWN_AI_PROMPT"
    )
    ai_train, ai_val, ai_test = _split_ai_by_prompt(ai_df, val_fraction)

    # -----------------------------
    # Human split (no grouping)
    # -----------------------------
    human_train, human_val, human_test = _split_human_rows(human_df, val_fraction)

    # -----------------------------
    # Combine & shuffle
    # -----------------------------
    train_df = pd.concat([ai_train, human_train]).sample(
        frac=1, random_state=RANDOM_STATE
    )
    val_df = pd.concat([ai_val, human_val]).sample(
        frac=1, random_state=RANDOM_STATE
    )
    test_df = pd.concat([ai_test, human_test]).sample(
        frac=1, random_state=RANDOM_STATE
    )

    # -----------------------------
    # Safety checks
    # -----------------------------
    for name, split in [("Train", train_df), ("Val", val_df), ("Test", test_df)]:
        labels = set(split["Label"].unique())
        if labels != {0, 1}:
            raise RuntimeError(f"{name} split missing a class: {labels}")

    # -----------------------------
    # Save
    # -----------------------------
    # Ensure the target directory exists so to_csv cannot fail on a fresh
    # checkout or a changed OUTPUT_DIR.
    out_dir = Path(OUTPUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)
    train_df.to_csv(out_dir / "dataset_train.csv", index=False)
    val_df.to_csv(out_dir / "dataset_val.csv", index=False)
    test_df.to_csv(out_dir / "dataset_test.csv", index=False)

    # -----------------------------
    # Reporting
    # -----------------------------
    print("\n📊 Final Class Distribution")
    print("Train:\n", train_df["Label"].value_counts())
    print("Val:\n", val_df["Label"].value_counts())
    print("Test:\n", test_df["Label"].value_counts())

    if "Language" in df.columns:
        print("\n🌐 Language Distribution (Train)")
        print(train_df["Language"].value_counts())
# Run the split only when executed as a script, not when imported.
if __name__ == "__main__":
    main()