# ai-code-detection / preprocessing / step6_traintestsplit.py
# Step 6: split the cleaned dataset into train / val / test CSVs.
# (Initial commit: AI code detection project, without binary files — b144cb7)
import os

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, train_test_split
# Input: cleaned dataset produced by step 2 (one code sample per row).
DATA_PATH = "dataset/processed/dataset_step2_cleaned.csv"
# Directory the three split CSVs are written into.
OUTPUT_DIR = "dataset/processed"
# Split fractions — must sum to 1.0 (70 / 15 / 15).
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15
# Fixed seed so the splits are reproducible across runs.
RANDOM_STATE = 42
def _split_grouped(frame, group_col, train_size, val_size, test_size, random_state):
    """Group-aware train/val/test split: no value of *group_col* leaks across splits.

    Returns a (train, val, test) tuple of DataFrames.
    """
    outer = GroupShuffleSplit(
        n_splits=1,
        train_size=train_size,
        random_state=random_state,
    )
    train_idx, temp_idx = next(outer.split(frame, groups=frame[group_col]))
    train = frame.iloc[train_idx]
    temp = frame.iloc[temp_idx]

    # Split the held-out remainder into val/test, again group-aware.
    inner = GroupShuffleSplit(
        n_splits=1,
        train_size=val_size / (val_size + test_size),
        random_state=random_state,
    )
    val_idx, test_idx = next(inner.split(temp, groups=temp[group_col]))
    return train, temp.iloc[val_idx], temp.iloc[test_idx]


def _split_plain(frame, train_size, val_size, test_size, random_state):
    """Plain shuffled train/val/test split (no grouping).

    Returns a (train, val, test) tuple of DataFrames.
    """
    train, temp = train_test_split(
        frame,
        train_size=train_size,
        random_state=random_state,
        shuffle=True,
    )
    val, test = train_test_split(
        temp,
        train_size=val_size / (val_size + test_size),
        random_state=random_state,
        shuffle=True,
    )
    return train, val, test


def main(
    data_path: str = DATA_PATH,
    output_dir: str = OUTPUT_DIR,
    train_size: float = TRAIN_SIZE,
    val_size: float = VAL_SIZE,
    test_size: float = TEST_SIZE,
    random_state: int = RANDOM_STATE,
) -> None:
    """Split the cleaned dataset into train/val/test CSVs and report stats.

    AI-labelled rows (Label == 1) are split group-aware on
    ``Generation_Prompt`` so that all generations from one prompt land in a
    single split; human rows (Label == 0) are split by plain random shuffle.
    The three resulting CSVs are written to *output_dir*.

    All parameters default to the module-level constants, so existing
    ``main()`` callers are unaffected.

    Raises:
        RuntimeError: if any resulting split is missing one of the two classes.
    """
    df = pd.read_csv(data_path)

    # Standardize the verbose label column name coming from the raw sheet.
    if "Label (0- HUMAN, 1-AI)" in df.columns:
        df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})

    # Normalize Language column (used for reporting only).
    if "Language" in df.columns:
        df["Language"] = df["Language"].astype(str).str.strip().str.lower()

    # Split by class so each side can use its own splitting strategy.
    human_df = df[df["Label"] == 0].copy()
    ai_df = df[df["Label"] == 1].copy()

    # AI split (group-aware): missing prompts get a sentinel group so they
    # still split deterministically rather than dropping out.
    ai_df["Generation_Prompt"] = ai_df["Generation_Prompt"].fillna("UNKNOWN_AI_PROMPT")
    ai_train, ai_val, ai_test = _split_grouped(
        ai_df, "Generation_Prompt", train_size, val_size, test_size, random_state
    )

    # Human split (no grouping).
    human_train, human_val, human_test = _split_plain(
        human_df, train_size, val_size, test_size, random_state
    )

    # Combine classes and shuffle rows within each split.
    train_df = pd.concat([ai_train, human_train]).sample(frac=1, random_state=random_state)
    val_df = pd.concat([ai_val, human_val]).sample(frac=1, random_state=random_state)
    test_df = pd.concat([ai_test, human_test]).sample(frac=1, random_state=random_state)

    # Safety check: every split must contain both classes.
    for name, split in [("Train", train_df), ("Val", val_df), ("Test", test_df)]:
        labels = set(split["Label"].unique())
        if labels != {0, 1}:
            raise RuntimeError(f"{name} split missing a class: {labels}")

    # Save — create the output directory first so a fresh checkout works.
    os.makedirs(output_dir, exist_ok=True)
    train_df.to_csv(f"{output_dir}/dataset_train.csv", index=False)
    val_df.to_csv(f"{output_dir}/dataset_val.csv", index=False)
    test_df.to_csv(f"{output_dir}/dataset_test.csv", index=False)

    # Reporting.
    print("\n📊 Final Class Distribution")
    print("Train:\n", train_df["Label"].value_counts())
    print("Val:\n", val_df["Label"].value_counts())
    print("Test:\n", test_df["Label"].value_counts())
    if "Language" in df.columns:
        print("\n🌐 Language Distribution (Train)")
        print(train_df["Language"].value_counts())


if __name__ == "__main__":
    main()