File size: 3,432 Bytes
b144cb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Input CSV: output of the step-2 cleaning stage of the pipeline.
DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"
# Directory where the train/val/test CSVs are written.
OUTPUT_DIR = "dataset/processed"

# Split fractions — must sum to 1.0 (70% train, 15% val, 15% test).
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15
# Seed shared by every split/shuffle below so the run is reproducible.
RANDOM_STATE = 42

def _group_three_way_split(frame, group_col):
    """70/15/15 split of *frame* that keeps all rows sharing *group_col* together.

    NOTE(review): GroupShuffleSplit's ``train_size`` is a fraction of
    *groups*, not of rows, so row proportions are only approximate when
    group sizes are uneven — confirm this is acceptable for the dataset.

    Returns:
        (train, val, test) DataFrames (views via ``.iloc`` of *frame*).
    """
    gss = GroupShuffleSplit(
        n_splits=1,
        train_size=TRAIN_SIZE,
        random_state=RANDOM_STATE,
    )
    train_idx, temp_idx = next(gss.split(frame, groups=frame[group_col]))
    train = frame.iloc[train_idx]
    temp = frame.iloc[temp_idx]

    # Second stage: split the held-out 30% of groups evenly into val/test
    # (0.15 / (0.15 + 0.15) == 0.5 of the remaining groups go to val).
    gss_val = GroupShuffleSplit(
        n_splits=1,
        train_size=VAL_SIZE / (VAL_SIZE + TEST_SIZE),
        random_state=RANDOM_STATE,
    )
    val_idx, test_idx = next(gss_val.split(temp, groups=temp[group_col]))
    return train, temp.iloc[val_idx], temp.iloc[test_idx]


def _random_three_way_split(frame):
    """Plain shuffled 70/15/15 row-level split (no grouping).

    Returns:
        (train, val, test) DataFrames.
    """
    train, temp = train_test_split(
        frame,
        train_size=TRAIN_SIZE,
        random_state=RANDOM_STATE,
        shuffle=True,
    )
    val, test = train_test_split(
        temp,
        train_size=VAL_SIZE / (VAL_SIZE + TEST_SIZE),
        random_state=RANDOM_STATE,
        shuffle=True,
    )
    return train, val, test


def main():
    """Split the cleaned dataset into train/val/test CSVs.

    AI-generated rows (Label == 1) are split group-aware on
    ``Generation_Prompt`` so that all completions of one prompt land in the
    same split (prevents prompt leakage across splits); human rows
    (Label == 0) get a plain shuffled split. The combined splits are
    shuffled, sanity-checked, saved under ``OUTPUT_DIR``, and class /
    language distributions are printed.

    Raises:
        RuntimeError: if any resulting split is missing one of the two classes.
    """
    df = pd.read_csv(DATA_PATH)

    # Standardize the verbose label column name coming from the raw sheet.
    if "Label (0- HUMAN, 1-AI)" in df.columns:
        df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})

    # Normalize Language for consistent reporting (case/whitespace variants).
    if "Language" in df.columns:
        df["Language"] = df["Language"].astype(str).str.strip().str.lower()

    # Split by class so each class can use its own splitting strategy.
    human_df = df[df["Label"] == 0].copy()
    ai_df = df[df["Label"] == 1].copy()

    # -----------------------------
    # AI split (group-aware)
    # -----------------------------
    # NOTE: every row with a missing prompt collapses into ONE group and
    # will therefore land together in a single split — acceptable if rare.
    ai_df["Generation_Prompt"] = ai_df["Generation_Prompt"].fillna("UNKNOWN_AI_PROMPT")
    ai_train, ai_val, ai_test = _group_three_way_split(ai_df, "Generation_Prompt")

    # -----------------------------
    # Human split (no grouping)
    # -----------------------------
    human_train, human_val, human_test = _random_three_way_split(human_df)

    # -----------------------------
    # Combine & shuffle
    # -----------------------------
    train_df = pd.concat([ai_train, human_train]).sample(frac=1, random_state=RANDOM_STATE)
    val_df = pd.concat([ai_val, human_val]).sample(frac=1, random_state=RANDOM_STATE)
    test_df = pd.concat([ai_test, human_test]).sample(frac=1, random_state=RANDOM_STATE)

    # -----------------------------
    # Safety checks
    # -----------------------------
    for name, split in [("Train", train_df), ("Val", val_df), ("Test", test_df)]:
        labels = set(split["Label"].unique())
        if labels != {0, 1}:
            raise RuntimeError(f"{name} split missing a class: {labels}")

    # -----------------------------
    # Save
    # -----------------------------
    # Ensure the output directory exists — to_csv does not create parents.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    train_df.to_csv(f"{OUTPUT_DIR}/dataset_train.csv", index=False)
    val_df.to_csv(f"{OUTPUT_DIR}/dataset_val.csv", index=False)
    test_df.to_csv(f"{OUTPUT_DIR}/dataset_test.csv", index=False)

    # -----------------------------
    # Reporting
    # -----------------------------
    print("\n📊 Final Class Distribution")
    print("Train:\n", train_df["Label"].value_counts())
    print("Val:\n", val_df["Label"].value_counts())
    print("Test:\n", test_df["Label"].value_counts())

    if "Language" in df.columns:
        print("\n🌐 Language Distribution (Train)")
        print(train_df["Language"].value_counts())

# Run the splitting pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()