"""Convert the WM-811K wafer-map pickle into YOLO-style images and labels.

For every wafer whose failure class is listed in ``CLASS_MAP`` the script
writes one grayscale image to ``IMAGE_DIR`` and one single-line YOLO label
file to ``LABEL_DIR``. The label describes one bounding box that encloses
all cells of the wafer map whose value is 2.
"""
import os
import pickle
import sys

import cv2  # OpenCV for generating the images
import numpy as np
import pandas as pd

# --- 1. THE FIX for old Pandas architecture ---
# Alias the legacy module path ``pandas.indexes`` to ``pandas.core.indexes``.
# NOTE(review): presumably the pickle was written by an older pandas that
# still used the legacy path -- confirm against the dataset.
import pandas.core.indexes

sys.modules['pandas.indexes'] = pandas.core.indexes

# --- 2. CONFIGURATION ---
# The path where Kaggle downloaded your dataset
RAW_DATA_PATH = os.path.expanduser(
    '~/.cache/kagglehub/datasets/qingyi/wm811k-wafer-map/versions/1/LSWMD.pkl'
)

# Our output folders (created at import time, as before)
IMAGE_DIR = 'data/processed/images'
LABEL_DIR = 'data/processed/labels'
os.makedirs(IMAGE_DIR, exist_ok=True)
os.makedirs(LABEL_DIR, exist_ok=True)

# YOLO needs integers (0-7), not text words for classes
CLASS_MAP = {
    'Center': 0,
    'Donut': 1,
    'Edge-Loc': 2,
    'Edge-Ring': 3,
    'Loc': 4,
    'Random': 5,
    'Scratch': 6,
    'Near-full': 7,
}


def _defect_box_yolo(wafer_map):
    """Return the normalized YOLO box around all cells equal to 2.

    Args:
        wafer_map: 2-D array of cell codes; cells equal to 2 are the defect.

    Returns:
        ``(x_center, y_center, width, height)``, each normalized by the map
        dimensions, or ``None`` when the map contains no cell equal to 2.
    """
    defect_y, defect_x = np.where(wafer_map == 2)
    if defect_x.size == 0:
        return None

    img_height, img_width = wafer_map.shape
    xmin, xmax = defect_x.min(), defect_x.max()
    ymin, ymax = defect_y.min(), defect_y.max()

    # Pixel indices are inclusive: columns xmin..xmax cover the continuous
    # interval [xmin, xmax + 1). Without the +1 the box is one pixel short
    # and a defect confined to one row or column gets a zero-size box.
    box_w = xmax - xmin + 1
    box_h = ymax - ymin + 1

    x_center = (xmin + box_w / 2.0) / img_width
    y_center = (ymin + box_h / 2.0) / img_height
    return x_center, y_center, box_w / img_width, box_h / img_height


def _render_wafer_image(wafer_map):
    """Render a wafer map as uint8 grayscale: 1 -> 127, 2 -> 255, else 0."""
    img_array = np.zeros(wafer_map.shape, dtype=np.uint8)
    img_array[wafer_map == 1] = 127
    img_array[wafer_map == 2] = 255
    return img_array


def process_data() -> None:
    """Build the YOLO image/label dataset from the pickle at RAW_DATA_PATH.

    Loads the pickled DataFrame, derives a ``failure_class`` column from
    ``failureType``, and for every wafer whose class is in ``CLASS_MAP``
    and whose map contains at least one cell equal to 2, writes
    ``wafer_<index>.jpg`` and ``wafer_<index>.txt``. Progress is printed
    every 100 written wafers.

    Raises:
        OSError: if the pickle cannot be opened or a label file cannot be
            written. A failed image write is reported on stderr and that
            wafer is skipped instead.
    """
    print("Loading dataset (this might take a minute)...")
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only run this against a dataset file obtained from a trusted source.
    with open(RAW_DATA_PATH, 'rb') as f:
        df = pickle.load(f, encoding='latin1')

    print("Cleaning data...")
    # failureType is indexed as x[0][0]; empty entries become 'None'.
    df['failure_class'] = df['failureType'].apply(
        lambda x: x[0][0] if len(x) > 0 else 'None'
    )
    defective_wafers = df[
        (df['failure_class'] != 'None') & (df['failure_class'] != 'none')
    ]
    print(f"Found {len(defective_wafers)} defective wafers.")

    # The whole defective set is processed, so the message says so.
    print(f"Starting conversion of all {len(defective_wafers)} defective wafers...")

    count = 0
    rows = zip(
        defective_wafers.index,
        defective_wafers['waferMap'],
        defective_wafers['failure_class'],
    )
    for index, wafer_map, defect_class_text in rows:
        # Skip if the class isn't in our dictionary (safety check)
        class_id = CLASS_MAP.get(defect_class_text)
        if class_id is None:
            continue

        # --- 3. BOUNDING BOX MATH ---
        box = _defect_box_yolo(wafer_map)
        if box is None:
            # No cell equal to 2 in this map: nothing to label.
            continue
        x_center, y_center, w_yolo, h_yolo = box

        # --- 4. GENERATE IMAGE ---
        # Map 0 -> Black (0), 1 -> Gray (127), 2 -> White (255)
        # NOTE(review): JPEG is lossy and may blur the three gray levels;
        # the .jpg extension is kept so existing consumers keep working.
        img_filename = os.path.join(IMAGE_DIR, f"wafer_{index}.jpg")
        if not cv2.imwrite(img_filename, _render_wafer_image(wafer_map)):
            # imwrite signals failure through its return value; skip the
            # label too so no label is left without an image.
            print(
                f"Warning: could not write {img_filename}; skipping wafer.",
                file=sys.stderr,
            )
            continue

        # --- 5. GENERATE LABEL FILE ---
        label_filename = os.path.join(LABEL_DIR, f"wafer_{index}.txt")
        with open(label_filename, 'w') as label_file:
            # YOLO strictly requires: class_id x_center y_center width height
            label_file.write(
                f"{class_id} {x_center:.6f} {y_center:.6f} "
                f"{w_yolo:.6f} {h_yolo:.6f}\n"
            )

        count += 1
        if count % 100 == 0:
            print(f"Processed {count} wafers...")

    print("Pipeline complete! Check your data/processed/ folders.")


if __name__ == "__main__":
    process_data()