File size: 3,619 Bytes
a985b94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os
import sys
import pickle
import numpy as np
import pandas as pd
import cv2  # OpenCV for generating the images

# --- 1. Compatibility shim for the legacy pandas module layout ---
# Register the current `pandas.core.indexes` package under the name
# 'pandas.indexes' (presumably the module path recorded inside the pickle
# by an older pandas release -- confirm against the dataset).
import pandas.core.indexes
sys.modules.update({'pandas.indexes': pandas.core.indexes})

# --- 2. CONFIGURATION ---
# Location of the raw pickle inside the kagglehub download cache.
RAW_DATA_PATH = os.path.expanduser(
    '~/.cache/kagglehub/datasets/qingyi/wm811k-wafer-map/versions/1/LSWMD.pkl'
)

# Destination folders for the rendered images and their label files.
IMAGE_DIR = 'data/processed/images'
LABEL_DIR = 'data/processed/labels'

for _output_dir in (IMAGE_DIR, LABEL_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# YOLO labels use integer class ids (0-7) rather than class names; the id
# of each defect type is its position in this tuple.
CLASS_MAP = {
    defect_name: class_id
    for class_id, defect_name in enumerate((
        'Center',
        'Donut',
        'Edge-Loc',
        'Edge-Ring',
        'Loc',
        'Random',
        'Scratch',
        'Near-full',
    ))
}

def _yolo_bbox(wafer_map, defect_value=2):
    """Return the normalized YOLO box enclosing all defect cells of a wafer map.

    Args:
        wafer_map: 2-D NumPy array of cell codes.
        defect_value: Cell code that marks a defective die (2 in this script).

    Returns:
        ``(x_center, y_center, width, height)``, each normalized by the map
        dimensions, or ``None`` when the map has no defect cells.
    """
    defect_rows, defect_cols = np.where(wafer_map == defect_value)
    if defect_cols.size == 0:
        return None

    img_height, img_width = wafer_map.shape
    xmin, xmax = defect_cols.min(), defect_cols.max()
    ymin, ymax = defect_rows.min(), defect_rows.max()

    # Cell i occupies the interval [i, i + 1), so an inclusive index range
    # xmin..xmax spans (xmax - xmin + 1) cells. Without the +1 a defect that
    # is one cell wide or tall produces a zero-size box and every box is
    # shifted half a cell towards the origin.
    box_width = xmax - xmin + 1
    box_height = ymax - ymin + 1

    x_center = (xmin + box_width / 2.0) / img_width
    y_center = (ymin + box_height / 2.0) / img_height
    return x_center, y_center, box_width / img_width, box_height / img_height


def _render_wafer_image(wafer_map):
    """Render a wafer map as a grayscale uint8 image (0 -> 0, 1 -> 127, 2 -> 255)."""
    image = np.zeros(wafer_map.shape, dtype=np.uint8)
    image[wafer_map == 1] = 127
    image[wafer_map == 2] = 255
    return image


def process_data():
    """Convert the labelled defective wafers of the raw dataset into YOLO samples.

    Loads the pickled DataFrame at ``RAW_DATA_PATH``, derives a
    ``failure_class`` column from ``failureType``, and for every wafer whose
    class is in ``CLASS_MAP`` and whose map contains at least one defect cell
    (value 2) writes:

    * ``IMAGE_DIR/wafer_<index>.jpg`` -- grayscale rendering of the map;
    * ``LABEL_DIR/wafer_<index>.txt`` -- one line
      ``class_id x_center y_center width height`` with the box normalized to
      the map size.

    Progress is printed to stdout. Returns ``None``.
    """
    print("Loading dataset (this might take a minute)...")
    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only run this on a dataset file obtained from a trusted source.
    with open(RAW_DATA_PATH, 'rb') as f:
        df = pickle.load(f, encoding='latin1')

    print("Cleaning data...")
    # failureType holds a nested sequence; an empty one means "no label".
    df['failure_class'] = df['failureType'].apply(lambda x: x[0][0] if len(x) > 0 else 'None')
    defective_wafers = df[(df['failure_class'] != 'None') & (df['failure_class'] != 'none')]

    print(f"Found {len(defective_wafers)} defective wafers.")
    # The whole set is processed; the old message still announced a test batch.
    print("Starting conversion of all defective wafers...")

    count = 0
    for index, row in defective_wafers.iterrows():
        wafer_map = row['waferMap']
        defect_class_text = row['failure_class']

        # Skip labels that have no YOLO class id (safety check).
        if defect_class_text not in CLASS_MAP:
            continue
        class_id = CLASS_MAP[defect_class_text]

        # --- BOUNDING BOX ---
        bbox = _yolo_bbox(wafer_map)
        if bbox is None:
            # No defect cells in the map: there is nothing to annotate.
            continue
        x_center, y_center, w_yolo, h_yolo = bbox

        # --- IMAGE ---
        # NOTE(review): JPEG is lossy, so the three gray levels are not
        # stored exactly; the .jpg name is kept so existing consumers of the
        # output folder keep working.
        img_filename = os.path.join(IMAGE_DIR, f"wafer_{index}.jpg")
        if not cv2.imwrite(img_filename, _render_wafer_image(wafer_map)):
            # imwrite reports failure through its return value; skip the
            # label too so no label file is left without an image.
            print(f"Warning: could not write {img_filename}, skipping wafer {index}.")
            continue

        # --- LABEL ---
        label_filename = os.path.join(LABEL_DIR, f"wafer_{index}.txt")
        with open(label_filename, 'w') as f:
            # YOLO line format: class_id x_center y_center width height
            f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {w_yolo:.6f} {h_yolo:.6f}\n")

        count += 1
        if count % 100 == 0:
            print(f"Processed {count} wafers...")

    print(f"Processed {count} wafers in total.")
    print("Pipeline complete! Check your data/processed/ folders.")

# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    process_data()