Spaces:

Scribbler310
/

semiconductor

Running

semiconductor / middleware /material_predictor.py

Scribbler310

Production deployment with LFS models

a985b94 20 days ago

8.66 kB

	"""
	Material Predictor — Phase 5
	Trains a Random Forest model on historical wafer scan data to predict
	material waste percentage for future production batches.
	"""

	import os
	import pickle
	import sqlite3
	import numpy as np
	import pandas as pd
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import mean_absolute_error, r2_score

	# --- CONFIGURATION ---
	# The SQLite log database and the pickled model both sit next to this module.
	_MODULE_DIR = os.path.dirname(__file__)
	DB_PATH = os.path.join(_MODULE_DIR, 'wafer_control.db')
	MODEL_PATH = os.path.join(_MODULE_DIR, 'material_model.pkl')

	# Defect labels that get one-hot encoded during feature engineering
	DEFECT_TYPES = [
	    'Center', 'Donut', 'Edge-Loc', 'Edge-Ring', 'Loc',
	    'Random', 'Scratch', 'Near-full', 'None', 'Undetected',
	]


	def load_data(db_path=None):
	    """Load wafer logs from the SQLite database into a DataFrame.

	    Args:
	        db_path: Optional path to the SQLite file. When omitted, the
	            module-level ``DB_PATH`` is used.

	    Returns:
	        A DataFrame with every row of the ``wafer_logs`` table; the
	        ``scan_time`` column is converted with ``pd.to_datetime``.
	    """
	    if db_path is None:
	        db_path = DB_PATH
	    conn = sqlite3.connect(db_path)
	    try:
	        df = pd.read_sql_query("SELECT * FROM wafer_logs", conn)
	    finally:
	        # Close even when the query raises (e.g. missing table) so the
	        # connection is never leaked.
	        conn.close()
	    df['scan_time'] = pd.to_datetime(df['scan_time'])
	    return df


	def engineer_features(df):
	    """
	    Build daily-aggregated features from raw scan logs.

	    Each output row is one calendar day of production with aggregated
	    metrics. The input frame is left untouched: all helper columns are
	    built on a copy.

	    Args:
	        df: Raw scan log frame. Columns read here: ``scan_time``
	            (datetime-like, accessed through ``.dt``), ``status``,
	            ``action``, ``defect_type``, ``id``, ``confidence``,
	            ``defect_area_px`` and ``material_wasted_pct``.

	    Returns:
	        DataFrame with one row per scan date holding scan/fail/scrap counts,
	        confidence, defect-area and waste aggregates, per-defect-type
	        counts, FAIL-only averages (0 on days without any FAIL row),
	        ``fail_rate``, ``scrap_rate``, ``day_of_week`` and ``day_index``
	        (days elapsed since the earliest scan date).
	    """
	    # assign() returns a new frame, so none of the helper columns added
	    # below leak into the DataFrame owned by the caller.
	    df = df.assign(
	        scan_date=df['scan_time'].dt.date,
	        is_fail=(df['status'] == 'FAIL').astype(int),
	        is_scrap=(df['action'] == 'ROUTE_TO_SCRAP').astype(int),
	    )

	    # One-hot encode defect types per scan
	    for defect in DEFECT_TYPES:
	        col_name = f'is_{defect.lower().replace("-", "_")}'
	        df[col_name] = (df['defect_type'] == defect).astype(int)

	    # --- Aggregate by day ---
	    daily = df.groupby('scan_date').agg(
	        total_scans=('id', 'count'),
	        fail_count=('is_fail', 'sum'),
	        scrap_count=('is_scrap', 'sum'),
	        avg_confidence=('confidence', 'mean'),
	        avg_defect_area=('defect_area_px', 'mean'),
	        max_defect_area=('defect_area_px', 'max'),
	        total_waste_pct=('material_wasted_pct', 'sum'),
	        avg_waste_pct=('material_wasted_pct', 'mean'),
	        # Defect type counts per day
	        center_count=('is_center', 'sum'),
	        donut_count=('is_donut', 'sum'),
	        edge_loc_count=('is_edge_loc', 'sum'),
	        edge_ring_count=('is_edge_ring', 'sum'),
	        loc_count=('is_loc', 'sum'),
	        random_count=('is_random', 'sum'),
	        scratch_count=('is_scratch', 'sum'),
	        near_full_count=('is_near_full', 'sum'),
	        # Scans whose defect_type is the literal 'None' label
	        pass_count=('is_none', 'sum'),
	    ).reset_index()

	    # --- Compute waste among defective wafers only ---
	    defective_daily = df[df['status'] == 'FAIL'].groupby('scan_date').agg(
	        avg_waste_defective=('material_wasted_pct', 'mean'),
	        avg_defect_area_fail=('defect_area_px', 'mean'),
	        avg_confidence_fail=('confidence', 'mean'),
	    ).reset_index()

	    # Left merge keeps days that had no FAIL rows; their FAIL-only
	    # averages come back as NaN and are replaced by 0.
	    daily = daily.merge(defective_daily, on='scan_date', how='left')
	    for col in ('avg_waste_defective', 'avg_defect_area_fail', 'avg_confidence_fail'):
	        daily[col] = daily[col].fillna(0)

	    # Derived ratios
	    daily['fail_rate'] = daily['fail_count'] / daily['total_scans']
	    daily['scrap_rate'] = daily['scrap_count'] / daily['total_scans']

	    # Time features
	    daily['scan_date'] = pd.to_datetime(daily['scan_date'])
	    daily['day_of_week'] = daily['scan_date'].dt.dayofweek
	    daily['day_index'] = (daily['scan_date'] - daily['scan_date'].min()).dt.days

	    return daily


	def train_model(daily):
	    """Fit a Random Forest that predicts the average waste among defective wafers.

	    Args:
	        daily: Daily feature table containing every column listed in
	            ``feature_cols`` below plus the ``avg_waste_defective`` target.

	    Returns:
	        A ``(model, feature_cols, metrics)`` tuple, where ``metrics`` holds
	        the hold-out ``mae`` and ``r2`` and the per-feature ``importances``.
	    """
	    target = 'avg_waste_defective'
	    feature_cols = [
	        'total_scans', 'fail_count', 'scrap_count', 'avg_confidence',
	        'avg_defect_area', 'max_defect_area', 'fail_rate', 'scrap_rate',
	        'avg_defect_area_fail', 'avg_confidence_fail',
	        'center_count', 'donut_count', 'edge_loc_count', 'edge_ring_count',
	        'loc_count', 'random_count', 'scratch_count', 'near_full_count',
	        'pass_count', 'day_of_week', 'day_index'
	    ]

	    # 80/20 split with a fixed seed so repeated runs give the same partition
	    inputs, labels = daily[feature_cols], daily[target]
	    X_train, X_test, y_train, y_test = train_test_split(
	        inputs, labels, test_size=0.2, random_state=42
	    )

	    forest_params = {
	        'n_estimators': 100,
	        'max_depth': 10,
	        'random_state': 42,
	        'n_jobs': -1,
	    }
	    model = RandomForestRegressor(**forest_params)
	    model.fit(X_train, y_train)

	    # Score on the held-out rows
	    held_out_pred = model.predict(X_test)
	    mae = mean_absolute_error(y_test, held_out_pred)
	    r2 = r2_score(y_test, held_out_pred)

	    print(f"\n{'=' * 50}")
	    print(f" MODEL EVALUATION (target: avg waste % per wafer)")
	    print(f" Mean Absolute Error: {mae:.2f}%")
	    print(f" R² Score: {r2:.4f}")
	    print(f"{'=' * 50}")

	    # Rank features from most to least important
	    ranked = pd.Series(model.feature_importances_, index=feature_cols)
	    ranked = ranked.sort_values(ascending=False)
	    print(f"\n Top 5 Feature Importances:")
	    for feat, imp in ranked.iloc[:5].items():
	        print(f" {feat:25s} {imp:.4f}")

	    metrics = {'mae': mae, 'r2': r2, 'importances': ranked.to_dict()}
	    return model, feature_cols, metrics


	def predict_material_needs(model, feature_cols, total_scans, fail_rate, defect_distribution):
	    """
	    Predict material waste for a hypothetical future production day.

	    Args:
	        model: Fitted regressor exposing ``predict``.
	        feature_cols: Feature names, in the order the model expects them.
	        total_scans: Number of wafers scanned on the hypothetical day.
	        fail_rate: Fraction of scans assumed to fail.
	        defect_distribution: Mapping of defect type name to its share of
	            the failed wafers; missing types count as 0.

	    Returns:
	        dict with ``avg_waste_per_wafer`` (model output, rounded to 2
	        places), ``total_daily_waste`` (that percentage applied to the
	        failed wafers, rounded to 1 place), and the ``total_scans`` and
	        ``fail_rate`` inputs echoed back.
	    """
	    fail_count = int(total_scans * fail_rate)
	    pass_count = total_scans - fail_count

	    def share(defect):
	        # Share of failed wafers carrying this defect type.
	        return defect_distribution.get(defect, 0)

	    def count(defect):
	        return int(fail_count * share(defect))

	    # Center and Near-full failures are the ones counted as scrap here.
	    scrap_count = int(fail_count * share('Center') + fail_count * share('Near-full'))
	    # Training computes scrap_rate as scrap_count / total_scans, so the same
	    # definition is used here. The defect shares are relative to failed
	    # wafers only and would overstate the rate whenever fail_rate < 1.
	    scrap_rate = scrap_count / total_scans if total_scans else 0.0

	    features = {
	        'total_scans': total_scans,
	        'fail_count': fail_count,
	        'scrap_count': scrap_count,
	        # Fixed placeholder values for inputs the caller does not supply.
	        # NOTE(review): presumably typical historical values; confirm.
	        'avg_confidence': 0.95,
	        'avg_defect_area': 1500,
	        'max_defect_area': 2704,
	        'fail_rate': fail_rate,
	        'scrap_rate': scrap_rate,
	        'avg_defect_area_fail': 1500,
	        'avg_confidence_fail': 0.85,
	        'center_count': count('Center'),
	        'donut_count': count('Donut'),
	        'edge_loc_count': count('Edge-Loc'),
	        'edge_ring_count': count('Edge-Ring'),
	        'loc_count': count('Loc'),
	        'random_count': count('Random'),
	        'scratch_count': count('Scratch'),
	        'near_full_count': count('Near-full'),
	        'pass_count': pass_count,
	        'day_of_week': 2,
	        'day_index': 30,
	    }

	    # Single-row frame, columns ordered as feature_cols.
	    X = pd.DataFrame([features])[feature_cols]
	    avg_waste_per_wafer = model.predict(X)[0]
	    total_waste_wafers = (avg_waste_per_wafer / 100.0) * fail_count
	    return {
	        'avg_waste_per_wafer': round(avg_waste_per_wafer, 2),
	        'total_daily_waste': round(total_waste_wafers, 1),
	        'total_scans': total_scans,
	        'fail_rate': fail_rate,
	    }


	if __name__ == '__main__':
	    # Script entry point: train on the logged scans, persist the model,
	    # then print one demo forecast.
	    print("=" * 50)
	    print(" MATERIAL WASTE PREDICTOR — Training")
	    print("=" * 50)

	    # 1. Load the raw scans and roll them up into daily features
	    print("\nLoading scan data...")
	    scans = load_data()
	    status = scans['status']
	    print(f" Total records: {len(scans)}")
	    print(f" PASS: {int((status == 'PASS').sum())}")
	    print(f" FAIL: {int((status == 'FAIL').sum())}")

	    print("Engineering daily features...")
	    daily_features = engineer_features(scans)
	    print(f" Training days: {len(daily_features)}")

	    # 2. Fit the regressor
	    print("\nTraining Random Forest model...")
	    forest, columns, scores = train_model(daily_features)

	    # 3. Persist model, column order and metrics together in one pickle
	    with open(MODEL_PATH, 'wb') as fh:
	        pickle.dump(
	            {'model': forest, 'feature_cols': columns, 'metrics': scores},
	            fh,
	        )
	    print(f"\nModel saved to: {MODEL_PATH}")

	    # 4. Demo forecast for one hypothetical production day
	    print(f"\n{'=' * 50}")
	    print(" DEMO PREDICTION")
	    print(f"{'=' * 50}")

	    demo_mix = {
	        'Center': 0.15,
	        'Edge-Ring': 0.37,
	        'Edge-Loc': 0.06,
	        'Donut': 0.23,
	        'Random': 0.03,
	        'Scratch': 0.03,
	        'Loc': 0.10,
	        'Near-full': 0.01,
	    }

	    forecast = predict_material_needs(
	        forest,
	        columns,
	        total_scans=1300,
	        fail_rate=0.97,
	        defect_distribution=demo_mix,
	    )
	    print(f" Scenario: 1,300 wafers/day, 97% defect rate")
	    print(f" Predicted avg waste per wafer: {forecast['avg_waste_per_wafer']:.2f}%")
	    print(f" Predicted total daily waste: {forecast['total_daily_waste']:.1f} equivalent wafers")
	    print(f"{'=' * 50}")