# anomalyOS/mlops/optuna_tuner.py
# Uploaded via huggingface_hub by CaffeinatedCoding (commit e72f783, verified)
# mlops/optuna_tuner.py
# Optuna hyperparameter search for EfficientNet-B0 fine-tuning
# 10 trials: lr, dropout, batch_size
# All trials logged to MLflow on DagsHub
# Run on Kaggle T4 — not locally
import os
import optuna
import mlflow
import dagshub
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import numpy as np
MVTEC_PATH = os.environ.get("MVTEC_PATH", "/kaggle/input/datasets/ipythonx/mvtec-ad")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
N_TRIALS = 10
class MVTecBinaryDataset(Dataset):
    """
    Binary classification dataset over MVTec-AD: normal=0, defective=1.

    Normal samples come from each category's ``train/good`` folder;
    defective samples from every non-"good" subfolder of ``test/``,
    across all 15 MVTec-AD categories.
    Used only for EfficientNet fine-tuning (GradCAM++ quality).
    NOT used for PatchCore training.
    """

    def __init__(self, mvtec_path: str, transform=None):
        """
        Args:
            mvtec_path: root directory of the MVTec-AD dataset.
            transform: optional torchvision transform applied per image.
        """
        # List of (image_path, label) pairs; 0 = normal, 1 = defective.
        self.samples: list[tuple[str, int]] = []
        self.transform = transform
        categories = [
            'bottle', 'cable', 'capsule', 'carpet', 'grid', 'hazelnut',
            'leather', 'metal_nut', 'pill', 'screw', 'tile', 'toothbrush',
            'transistor', 'wood', 'zipper'
        ]
        # NOTE: os.listdir order is platform/filesystem-dependent, so all
        # listings are sorted to make the sample order deterministic —
        # this keeps the seeded random_split in train_one_trial
        # reproducible across machines.
        for cat in categories:
            # Normal: every image in train/good gets label 0.
            train_dir = os.path.join(mvtec_path, cat, "train", "good")
            for f in sorted(os.listdir(train_dir)):
                if f.endswith((".png", ".jpg")):
                    self.samples.append(
                        (os.path.join(train_dir, f), 0)
                    )
            # Defective: every non-"good" defect-type folder gets label 1.
            test_dir = os.path.join(mvtec_path, cat, "test")
            for defect_type in sorted(os.listdir(test_dir)):
                if defect_type == "good":
                    continue
                d_dir = os.path.join(test_dir, defect_type)
                for f in sorted(os.listdir(d_dir)):
                    if f.endswith((".png", ".jpg")):
                        self.samples.append(
                            (os.path.join(d_dir, f), 1)
                        )

    def __len__(self):
        """Total number of samples (normal + defective)."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label)``; image is RGB, transformed if a transform was given."""
        path, label = self.samples[idx]
        img = Image.open(path).convert("RGB")
        if self.transform:
            img = self.transform(img)
        return img, label
def build_model(dropout: float, pretrained: bool = True) -> nn.Module:
    """
    Build EfficientNet-B0 with a fresh 2-class head and move it to DEVICE.

    Args:
        dropout: dropout probability for the new classifier head.
        pretrained: load ImageNet weights (default True — the original
            behavior; requires a network/cache). Pass False for
            offline or unit-test use.

    Returns:
        The model on DEVICE, with classifier replaced by
        Dropout -> Linear(1280, 2).
    """
    model = models.efficientnet_b0(pretrained=pretrained)
    # Replace the stock 1000-class ImageNet head with a binary head;
    # 1280 is EfficientNet-B0's feature dimension.
    model.classifier = nn.Sequential(
        nn.Dropout(p=dropout),
        nn.Linear(1280, 2)
    )
    return model.to(DEVICE)
def train_one_trial(trial):
    """Single Optuna trial — returns validation AUC.

    Samples (lr, dropout, batch_size), fine-tunes EfficientNet-B0 for
    3 epochs on the binary MVTec dataset, evaluates ROC-AUC on a fixed
    20% validation split, and logs everything to MLflow as a nested run.

    Args:
        trial: optuna Trial used to sample hyperparameters.

    Returns:
        Validation ROC-AUC (float) — the study objective (maximized).
    """
    # Search space: log-uniform lr, uniform dropout, categorical batch size.
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    dropout = trial.suggest_float("dropout", 0.2, 0.5)
    batch_size = trial.suggest_categorical("batch_size", [16, 32])
    # ImageNet mean/std normalization to match the pretrained backbone.
    transform = T.Compose([
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    dataset = MVTecBinaryDataset(MVTEC_PATH, transform=transform)
    # 80/20 train/val split with a fixed seed so every trial is
    # evaluated on the same validation set.
    n_val = int(0.2 * len(dataset))
    n_train = len(dataset) - n_val
    train_set, val_set = torch.utils.data.random_split(
        dataset, [n_train, n_val],
        generator=torch.Generator().manual_seed(42)
    )
    train_loader = DataLoader(train_set, batch_size=batch_size,
                              shuffle=True, num_workers=2)
    val_loader = DataLoader(val_set, batch_size=batch_size,
                            shuffle=False, num_workers=2)
    model = build_model(dropout)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    # Train 3 epochs per trial — deliberately short: this ranks
    # hyperparameters, it does not produce the final model.
    for epoch in range(3):
        model.train()
        for imgs, labels in train_loader:
            imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            loss = criterion(model(imgs), labels)
            loss.backward()
            optimizer.step()
    # Validate: score each image with P(defective) = softmax column 1.
    model.eval()
    all_scores = []
    all_labels = []
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs = imgs.to(DEVICE)
            logits = model(imgs)
            probs = torch.softmax(logits, dim=1)[:, 1]
            all_scores.extend(probs.cpu().numpy().tolist())
            all_labels.extend(labels.numpy().tolist())
    # Local import: sklearn is only needed once a trial actually runs.
    from sklearn.metrics import roc_auc_score
    auc = roc_auc_score(all_labels, all_scores)
    # Log trial to MLflow as a nested run under the parent search run
    # (run_optuna_search opens the parent).
    with mlflow.start_run(run_name=f"efficientnet_trial_{trial.number}",
                          nested=True):
        mlflow.log_param("lr", lr)
        mlflow.log_param("dropout", dropout)
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_metric("val_auc", auc)
    return auc
def run_optuna_search():
    """
    Run the full hyperparameter search and return the winning params.

    Initializes DagsHub-backed MLflow tracking, opens a parent MLflow
    run, runs N_TRIALS Optuna trials (each logged as a nested run by
    train_one_trial), prints the best trial, and logs its metric and
    parameters to the parent run.

    Returns:
        dict: best hyperparameters found by the study.
    """
    # Point MLflow tracking at the DagsHub-hosted server for this repo.
    dagshub.init(repo_owner="devangmishra1424",
                 repo_name="AnomalyOS", mlflow=True)
    with mlflow.start_run(run_name="efficientnet_optuna_search"):
        study = optuna.create_study(direction="maximize")
        study.optimize(train_one_trial, n_trials=N_TRIALS)
        winner = study.best_trial
        # Report the winner to stdout and to the parent MLflow run.
        print(f"\nBest trial: AUC={winner.value:.4f}")
        print(f" lr={winner.params['lr']:.6f}")
        print(f" dropout={winner.params['dropout']:.3f}")
        print(f" batch_size={winner.params['batch_size']}")
        mlflow.log_metric("best_val_auc", winner.value)
        mlflow.log_params(winner.params)
    return winner.params
if __name__ == "__main__":
run_optuna_search()