semiconductor / src /split_data.py
Scribbler310
Production deployment with LFS models
a985b94
import os
import shutil
from sklearn.model_selection import train_test_split
# Input folders read by setup_yolo_folders(): the '.jpg' images are listed
# from IMAGE_DIR, and each image's same-named '.txt' label is looked up in
# LABEL_DIR.
IMAGE_DIR = 'data/processed/images'
LABEL_DIR = 'data/processed/labels'
# Output root. setup_yolo_folders() creates images/{train,val} and
# labels/{train,val} beneath it and copies the split files there.
YOLO_DIR = 'data/yolo_dataset'
def setup_yolo_folders():
    """Split the processed images and labels into a YOLO train/val layout.

    Creates ``images/<split>`` and ``labels/<split>`` under ``YOLO_DIR`` for
    the ``train`` and ``val`` splits, partitions the ``.jpg`` files found in
    ``IMAGE_DIR`` 80/20 with a fixed seed, and copies every image together
    with its same-named ``.txt`` label from ``LABEL_DIR`` when that label
    exists. An image without a label file is still copied, just without a
    label next to it.

    Returns early (after creating the folders) when ``IMAGE_DIR`` contains
    no ``.jpg`` files, because ``train_test_split`` rejects an empty input.
    """
    image_ext = '.jpg'

    print("Creating YOLO folder structure...")
    for split in ('train', 'val'):
        os.makedirs(os.path.join(YOLO_DIR, 'images', split), exist_ok=True)
        os.makedirs(os.path.join(YOLO_DIR, 'labels', split), exist_ok=True)

    # os.listdir() returns entries in arbitrary order. Sort them so the
    # fixed random_state below yields the same split on every machine.
    images = sorted(f for f in os.listdir(IMAGE_DIR) if f.endswith(image_ext))
    print(f"Found {len(images)} total images. Splitting 80/20...")

    if not images:
        # train_test_split raises ValueError on an empty list; bail out
        # with a readable message instead.
        print("No .jpg images found; nothing to split.")
        return

    # Classic 80/20 split using scikit-learn
    train_imgs, val_imgs = train_test_split(images, test_size=0.2, random_state=42)

    def copy_files(file_list, split_name):
        """Copy each image in file_list, plus its label if present, into split_name."""
        print(f"Copying {len(file_list)} files to {split_name} set...")
        for img_name in file_list:
            # Copy Image
            shutil.copy(
                os.path.join(IMAGE_DIR, img_name),
                os.path.join(YOLO_DIR, 'images', split_name, img_name)
            )

            # Copy matching Label. Only the trailing extension is swapped:
            # str.replace() would also rewrite a '.jpg' in the middle of
            # the name and point at a label file that does not exist.
            label_name = img_name[:-len(image_ext)] + '.txt'
            label_path = os.path.join(LABEL_DIR, label_name)
            if os.path.exists(label_path):
                shutil.copy(
                    label_path,
                    os.path.join(YOLO_DIR, 'labels', split_name, label_name)
                )

    copy_files(train_imgs, 'train')
    copy_files(val_imgs, 'val')
    print("Done! Dataset is formatted and ready for YOLOv8.")
# Script entry point: build the YOLO dataset only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    setup_yolo_folders()