uncleMehrzad
/

polyp-segmentation

+import os
+import glob
+import json
+import numpy as np
+import pandas as pd
+from PIL import Image
+from tqdm import tqdm
+from datetime import datetime
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from scipy.ndimage import morphology
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.utils.data import Dataset, DataLoader
+from torch.optim import AdamW
+from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, ReduceLROnPlateau
+from transformers import AutoModel
+import albumentations as A
+from albumentations.pytorch import ToTensorV2
+import cv2
+import warnings
+import math
+warnings.filterwarnings('ignore')
+# Set seeds for reproducibility
def set_seed(seed=42):
    """Seed every random number generator this script touches.

    Seeds Python's built-in ``random`` module, NumPy and PyTorch (CPU plus
    all CUDA devices), then puts cuDNN into deterministic mode with
    autotuning disabled.

    Args:
        seed: Integer seed shared by all generators.
    """
    # Local import keeps the function self-contained; the module-level
    # import block does not pull in `random`.
    import random

    # The built-in generator was previously left unseeded, so anything
    # drawing from it differed between otherwise identical runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Prefer repeatable kernel selection over cuDNN's autotuned speed.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)
+# ============================================================================
+# CONFIGURATION
+# ============================================================================
class Config:
    """Static settings for the run, exposed as class attributes."""

    # --- Backbone -----------------------------------------------------------
    # Hub identifier, plus a local checkpoint directory passed to the encoder.
    model_name: str = "facebook/dinov3-vitl16-pretrain-lvd1689m"
    local_model_path: str = "/data/F/VoiceNegar/models/pe_models/dino7b/checkpoints/initial_dinov3-vitl16-pretrain-lvd1689m_backbone"

    # --- Data ---------------------------------------------------------------
    dataset_path: str = "/home/PeBigModelForVilab/dinov3/toy-project/Kvasir-SEG/"
    image_size: int = 256  # side length images and masks are resized to
    patch_size: int = 16

    # --- Optimisation -------------------------------------------------------
    batch_size: int = 96
    num_epochs: int = 150
    learning_rate: float = 1e-4
    min_lr: float = 1e-6  # floor handed to the cosine scheduler as eta_min
    weight_decay: float = 1e-4

    # --- CosineAnnealingWarmRestarts schedule -------------------------------
    T_0: int = 10  # length of the first cycle, in epochs
    T_mult: int = 2  # factor each following cycle is stretched by

    # --- Hold-out fractions -------------------------------------------------
    val_split: float = 0.1
    test_split: float = 0.05

    # --- Runtime ------------------------------------------------------------
    # Resolved once, when the class body is executed.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

    # --- Output -------------------------------------------------------------
    save_dir: str = "./checkpoints"
    log_interval: int = 10

    # --- Input normalisation (ImageNet channel statistics) -------------------
    mean: list = [0.485, 0.456, 0.406]
    std: list = [0.229, 0.224, 0.225]

    resume_from = None

    # --- Encoder hidden-state indices concatenated as multi-scale features ---
    multi_scale_layers: list = [5, 10, 16, 18, 20, 22, 23]

    # --- Weights of the focal, Dice and boundary loss terms ------------------
    focal_weight: float = 0.69
    dice_weight: float = 0.3
    boundary_weight: float = 0.01

    # --- HD95 metric ----------------------------------------------------------
    hd95_threshold: float = 0.5


config = Config()
# Create the checkpoint directory up front; exist_ok keeps reruns harmless.
os.makedirs(config.save_dir, exist_ok=True)

# Echo the run setup so a wrong device or a missing local checkpoint
# directory is visible before any heavy work starts.
print(
    f"Using device: {config.device}",
    f"Model: {config.model_name}",
    f"Local model path: {config.local_model_path}",
    f"Exists: {os.path.exists(config.local_model_path)}",
    sep="\n",
)
+# ============================================================================
+# DATASET CLASS
+# ============================================================================
class PolypDataset(Dataset):
    """Kvasir-SEG image/mask pairs with manual tensor conversion.

    Each item is an ``(image, mask)`` pair: the image is scaled to ``[0, 1]``
    and normalised with the mean/std from the module-level ``config``; the
    mask is binarised and returned with a leading channel dimension.

    Args:
        image_paths: Sequence of image file paths.
        mask_paths: Sequence of mask file paths, aligned index-by-index with
            ``image_paths``.
        transform: Optional callable invoked as
            ``transform(image=..., mask=...)`` and returning a mapping with
            ``'image'`` and ``'mask'`` entries.
        target_size: Size handed to ``cv2.resize`` when no transform is given.
            NOTE: OpenCV interprets this tuple as ``(width, height)``.

    Raises:
        ValueError: If the two path sequences differ in length, or (on item
            access) if an image or mask file cannot be read.
    """

    def __init__(self, image_paths, mask_paths, transform=None, target_size=(256, 256)):
        # Fail at construction time instead of with an IndexError deep inside
        # a DataLoader worker (or by silently ignoring surplus masks).
        if len(image_paths) != len(mask_paths):
            raise ValueError(
                f"Mismatch: {len(image_paths)} images vs {len(mask_paths)} masks"
            )
        self.image_paths = image_paths
        self.mask_paths = mask_paths
        self.transform = transform
        self.target_size = target_size
        # Channel statistics shaped (3, 1, 1) so they broadcast over C x H x W.
        self.mean = torch.tensor(config.mean).view(3, 1, 1)
        self.std = torch.tensor(config.std).view(3, 1, 1)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        mask_path = self.mask_paths[idx]

        # OpenCV decodes to BGR; convert to RGB.
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Could not load image: {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        if mask is None:
            raise ValueError(f"Could not load mask: {mask_path}")
        # Threshold at mid-grey to obtain a {0, 1} float mask.
        mask = (mask > 127).astype(np.float32)

        if self.transform:
            augmented = self.transform(image=image, mask=mask)
            image, mask = augmented['image'], augmented['mask']
        else:
            image = cv2.resize(image, self.target_size)
            # Nearest-neighbour keeps the mask strictly binary.
            mask = cv2.resize(mask, self.target_size, interpolation=cv2.INTER_NEAREST)

        # The transform may or may not have produced tensors already:
        # arrays are treated as H x W x C, tensors as already channel-first.
        if isinstance(image, np.ndarray):
            image = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
        elif isinstance(image, torch.Tensor):
            image = image.float() / 255.0
        image = (image - self.mean) / self.std

        if isinstance(mask, np.ndarray):
            mask = torch.from_numpy(mask).float()
        return image, mask.unsqueeze(0)
+# ============================================================================
+# FIXED DINOv3 ENCODER
+# ============================================================================
class DINOv3Encoder(nn.Module):
    """Frozen DINOv3 encoder that can return concatenated multi‑scale features.

    The wrapped model emits a token sequence per image. This class drops the
    non-spatial prefix tokens (CLS plus any register tokens declared in the
    model config) and folds the remaining patch tokens back into a
    ``(B, C, H_grid, W_grid)`` feature map.

    Args:
        model_name: Hub identifier used when no usable local path is given.
        local_path: Optional directory with a downloaded checkpoint; used only
            if it exists on disk.
        freeze: When true, gradients are disabled for all backbone parameters.
        layers: Optional hidden-state indices whose feature maps are
            concatenated along the channel axis. ``None`` or an empty
            sequence selects the final hidden state only.
    """

    def __init__(self, model_name="facebook/dinov3-vitl16-pretrain-lvd1689m",
                 local_path=None, freeze=True, layers=None):
        super().__init__()
        if local_path and os.path.exists(local_path):
            print(f"Loading DINOv3 model from local path: {local_path}")
            self.model = AutoModel.from_pretrained(local_path, local_files_only=True)
        else:
            print(f"Loading DINOv3 from HuggingFace hub: {model_name}")
            self.model = AutoModel.from_pretrained(model_name)
        self.embed_dim = self.model.config.hidden_size
        self.patch_size = self.model.config.patch_size
        # Per the Hugging Face DINOv3 layout the sequence is
        # [CLS, register tokens..., patch tokens...]. Configs without the
        # attribute are treated as having no register tokens.
        self.num_register_tokens = getattr(self.model.config, "num_register_tokens", 0) or 0
        # An empty selection would leave nothing to concatenate, so it is
        # treated like "no multi-scale" instead of failing in forward().
        self.layers = layers if layers else None
        if self.layers is not None:
            self.out_channels = self.embed_dim * len(self.layers)
        else:
            self.out_channels = self.embed_dim
        print(f"DINOv3 loaded - embed_dim: {self.embed_dim}, patch_size: {self.patch_size}")
        if self.layers:
            print(f"   Multi‑scale layers: {self.layers}, output channels: {self.out_channels}")
        if freeze:
            for param in self.model.parameters():
                param.requires_grad = False

    def _patch_grid(self, height, width):
        """Return the (rows, cols) patch grid for an input of the given size."""
        patch = self.patch_size
        if isinstance(patch, (tuple, list)):
            patch_h, patch_w = patch
        else:
            patch_h = patch_w = patch
        return height // patch_h, width // patch_w

    def _tokens_to_map(self, hidden, grid_hw):
        """Strip CLS/register tokens from ``hidden`` and fold the rest into 2D."""
        num_prefix = 1 + self.num_register_tokens
        return self._reshape_to_2d(hidden[:, num_prefix:, :], hidden.shape[0], grid_hw)

    def _reshape_to_2d(self, patch_tokens, B, grid_hw=None):
        """Fold ``(B, N, D)`` patch tokens into a ``(B, D, H_grid, W_grid)`` map.

        Args:
            patch_tokens: Tensor of shape ``(B, N, D)``.
            B: Batch size used for the reshape.
            grid_hw: Optional ``(rows, cols)`` known from the input size. It is
                used whenever ``rows * cols == N``.

        Returns:
            The reshaped feature map. Without a usable ``grid_hw`` the grid is
            the most square factorisation of ``N`` (rows <= cols); this is
            only a guess at the true layout.
        """
        N = patch_tokens.shape[1]
        D = patch_tokens.shape[2]
        if grid_hw is not None and grid_hw[0] * grid_hw[1] == N:
            H_grid, W_grid = grid_hw
        else:
            if grid_hw is not None:
                print(f"   Warning: expected a {grid_hw[0]}x{grid_hw[1]} patch grid "
                      f"but got {N} tokens; guessing the layout.")
            # Largest divisor not exceeding sqrt(N); h == 1 always divides N,
            # so a grid exists for every N >= 1.
            H_grid = next((h for h in range(math.isqrt(N), 0, -1) if N % h == 0), 0)
            W_grid = N // H_grid if H_grid else 0
        return patch_tokens.transpose(1, 2).reshape(B, D, H_grid, W_grid)

    def forward(self, pixel_values):
        """Encode images into a spatial feature map.

        Args:
            pixel_values: Image batch of shape ``(B, C, H, W)``.

        Returns:
            Tensor of shape ``(B, out_channels, H_grid, W_grid)``.
        """
        height, width = pixel_values.shape[-2:]
        grid_hw = self._patch_grid(height, width)
        if self.layers is None:
            outputs = self.model(pixel_values, output_hidden_states=False)
            return self._tokens_to_map(outputs.last_hidden_state, grid_hw)

        outputs = self.model(pixel_values, output_hidden_states=True)
        hidden_states = outputs.hidden_states
        feature_list = [self._tokens_to_map(hidden_states[idx], grid_hw) for idx in self.layers]
        # All layers normally share one grid; resizing is only a safeguard
        # for the guessed-layout fallback.
        target_hw = feature_list[0].shape[-2:]
        resized_features = [
            feat if feat.shape[-2:] == target_hw
            else F.interpolate(feat, size=target_hw, mode='bilinear', align_corners=False)
            for feat in feature_list
        ]
        return torch.cat(resized_features, dim=1)
+# ============================================================================
+# SHALLOW STEM FOR SKIP CONNECTIONS
+# ============================================================================
class ShallowStem(nn.Module):
    """Small convolutional stem that yields skip features at three scales.

    Four conv/BatchNorm/ReLU stages are chained. The first keeps the input
    resolution; each of the next three halves it and doubles the channel
    count.

    Args:
        in_channels: Channels of the input image.
        base_channels: Width of the first stage.
    """

    def __init__(self, in_channels=3, base_channels=64):
        super().__init__()
        widths = [base_channels * factor for factor in (1, 2, 4, 8)]
        self.conv1 = self._conv_bn_relu(in_channels, widths[0], stride=1)
        self.conv2 = self._conv_bn_relu(widths[0], widths[1], stride=2)
        self.conv3 = self._conv_bn_relu(widths[1], widths[2], stride=2)
        self.conv4 = self._conv_bn_relu(widths[2], widths[3], stride=2)

    @staticmethod
    def _conv_bn_relu(in_ch, out_ch, stride):
        """Build one 3x3 conv -> BatchNorm -> ReLU stage."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Return the stage-4, stage-3 and stage-2 outputs, coarsest first."""
        full_res = self.conv1(x)
        half = self.conv2(full_res)
        quarter = self.conv3(half)
        eighth = self.conv4(quarter)
        return [eighth, quarter, half]
+# ============================================================================
+# U‑Net DECODER WITH SKIP CONNECTIONS
+# ============================================================================
class UNetDecoder(nn.Module):
    """Decoder that progressively upsamples ViT features.

    Three stages each double the resolution, concatenate a skip feature map
    and refine with two convolutions; a final 2x bilinear upsample and a 1x1
    convolution produce the logits.

    Args:
        vit_channels: Channels of the encoder feature map.
        stem_channels: Channel counts of the three skip maps, in the order
            they are consumed (coarsest first).
        num_classes: Number of output channels.
    """

    # An immutable tuple replaces the former list default, which was shared
    # between every call of the constructor.
    def __init__(self, vit_channels=1024, stem_channels=(512, 256, 128), num_classes=1):
        super().__init__()
        self.up1 = self._up_block(vit_channels, 256)
        self.conv1 = self._conv_block(256 + stem_channels[0], 256)
        self.up2 = self._up_block(256, 128)
        self.conv2 = self._conv_block(128 + stem_channels[1], 128)
        self.up3 = self._up_block(128, 64)
        self.conv3 = self._conv_block(64 + stem_channels[2], 64)
        self.up4 = nn.UpsamplingBilinear2d(scale_factor=2)
        self.final = nn.Conv2d(64, num_classes, kernel_size=1)

    def _up_block(self, in_ch, out_ch):
        """2x bilinear upsample followed by conv/BatchNorm/ReLU."""
        return nn.Sequential(
            nn.UpsamplingBilinear2d(scale_factor=2),
            nn.Conv2d(in_ch, out_ch, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True)
        )

    def _conv_block(self, in_ch, out_ch):
        """Two consecutive 3x3 conv/BatchNorm/ReLU layers."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_ch, out_ch, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True)
        )

    @staticmethod
    def _merge_skip(x, skip):
        """Resize ``x`` to the skip map's spatial size if needed, then concatenate."""
        if x.shape[-2:] != skip.shape[-2:]:
            x = F.interpolate(x, size=skip.shape[-2:], mode='bilinear', align_corners=False)
        return torch.cat([x, skip], dim=1)

    def forward(self, vit_features, skip_features):
        """Decode encoder features into logits.

        Args:
            vit_features: Encoder feature map ``(B, vit_channels, h, w)``.
            skip_features: Sequence of at least three skip maps, coarsest
                first; only the first three are used.

        Returns:
            Logits with ``num_classes`` channels at twice the spatial size of
            the third skip map.
        """
        stages = ((self.up1, self.conv1), (self.up2, self.conv2), (self.up3, self.conv3))
        x = vit_features
        for level, (upsample, refine) in enumerate(stages):
            # Explicit indexing keeps the IndexError for too few skip maps.
            x = refine(self._merge_skip(upsample(x), skip_features[level]))
        return self.final(self.up4(x))
+# ============================================================================
+# LOSS FUNCTIONS
+# ============================================================================
class DiceLoss(nn.Module):
    """Soft Dice loss computed over all elements of the batch at once.

    Args:
        smooth: Constant added to numerator and denominator to avoid a
            division by zero.
    """

    def __init__(self, smooth=1e-6):
        super().__init__()
        self.smooth = smooth

    def forward(self, pred, target):
        """Return ``1 - dice`` for logits ``pred`` against ``target``.

        Args:
            pred: Raw logits; a sigmoid is applied here.
            target: Ground-truth tensor with the same number of elements.
        """
        # reshape() instead of view(): view() raises on non-contiguous
        # tensors (e.g. after a transpose/permute), reshape() copies if needed.
        pred_flat = torch.sigmoid(pred).reshape(-1)
        target_flat = target.reshape(-1)
        intersection = (pred_flat * target_flat).sum()
        dice = (2. * intersection + self.smooth) / (pred_flat.sum() + target_flat.sum() + self.smooth)
        return 1 - dice
class FocalLoss(nn.Module):
    """Binary focal loss on logits, averaged over every element.

    Args:
        alpha: Constant factor applied to every element (the same value is
            used for positive and negative targets).
        gamma: Exponent of the ``(1 - p_t)`` modulating term.
    """

    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, pred, target):
        """Return the mean focal loss of logits ``pred`` against ``target``."""
        per_element_bce = F.binary_cross_entropy_with_logits(pred, target, reduction='none')
        # exp(-BCE) recovers the probability assigned to the true class.
        prob_true = torch.exp(-per_element_bce)
        modulating = (1 - prob_true) ** self.gamma
        weighted = self.alpha * modulating * per_element_bce
        return weighted.mean()
class BoundaryLoss(nn.Module):
    """Boundary loss using Sobel edge detection for sharper edges.

    Sobel gradient magnitudes are computed for the sigmoid of the prediction
    and for the target; the loss is the mean squared error between the two
    edge maps.
    """

    def __init__(self):
        super().__init__()
        sobel_x = torch.tensor([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], dtype=torch.float32)
        sobel_y = torch.tensor([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], dtype=torch.float32)
        # Buffers follow the module through .to()/.cuda(); persistent=False
        # keeps them out of the state_dict, as before.
        self.register_buffer("sobel_x", sobel_x.view(1, 1, 3, 3), persistent=False)
        self.register_buffer("sobel_y", sobel_y.view(1, 1, 3, 3), persistent=False)

    @staticmethod
    def _edge_magnitude(x, kernel_x, kernel_y):
        """Return the Sobel gradient magnitude of a single-channel map."""
        grad_x = F.conv2d(x, kernel_x, padding=1)
        grad_y = F.conv2d(x, kernel_y, padding=1)
        # The epsilon keeps sqrt differentiable where the gradient is zero.
        return torch.sqrt(grad_x ** 2 + grad_y ** 2 + 1e-6)

    def forward(self, pred, target):
        """Return the MSE between predicted and target edge maps.

        Args:
            pred: Logits of shape ``(B, 1, H, W)``.
            target: Ground truth of the same shape.
        """
        # Match the kernels to the input per call instead of mutating module
        # state; this also supports inputs that are not float32.
        kernel_x = self.sobel_x.to(device=pred.device, dtype=pred.dtype)
        kernel_y = self.sobel_y.to(device=pred.device, dtype=pred.dtype)
        pred_prob = torch.sigmoid(pred)
        target = target.to(dtype=pred.dtype)
        pred_edges = self._edge_magnitude(pred_prob, kernel_x, kernel_y)
        target_edges = self._edge_magnitude(target, kernel_x, kernel_y)
        return F.mse_loss(pred_edges, target_edges)
class FocalDiceBoundaryLoss(nn.Module):
    """Weighted sum of the focal, Dice and boundary losses.

    Args:
        focal_weight: Multiplier of the focal term.
        dice_weight: Multiplier of the Dice term.
        boundary_weight: Multiplier of the boundary term.
    """

    def __init__(self, focal_weight=0.6, dice_weight=0.3, boundary_weight=0.1):
        super().__init__()
        self.focal, self.dice, self.boundary = FocalLoss(), DiceLoss(), BoundaryLoss()
        self.w_f, self.w_d, self.w_b = focal_weight, dice_weight, boundary_weight

    def forward(self, pred, target):
        """Return the combined loss for logits ``pred`` against ``target``."""
        focal_term = self.w_f * self.focal(pred, target)
        dice_term = self.w_d * self.dice(pred, target)
        boundary_term = self.w_b * self.boundary(pred, target)
        return focal_term + dice_term + boundary_term
+# ============================================================================
+# METRICS
+# ============================================================================
+def compute_dice(pred, target, threshold=0.5):
+    """Compute Dice score"""
+    pred_binary = (torch.sigmoid(pred) > threshold).float()
+    intersection = (pred_binary * target).sum()
+    dice = (2. * intersection) / (pred_binary.sum() + target.sum() + 1e-6)
+    return dice.item()
def compute_iou(pred, target, threshold=0.5):
    """Return the IoU (Jaccard index) of thresholded predictions as a float.

    ``pred`` holds logits; they pass through a sigmoid and are binarised at
    ``threshold``. The score is computed over all elements jointly.
    """
    hard_pred = torch.sigmoid(pred).gt(threshold).float()
    overlap = (hard_pred * target).sum()
    # |A ∪ B| = |A| + |B| - |A ∩ B|
    union = hard_pred.sum() + target.sum() - overlap
    return (overlap / (union + 1e-6)).item()
def compute_precision_recall(pred, target, threshold=0.5):
    """Return ``(precision, recall)`` of thresholded predictions as floats.

    ``pred`` holds logits; they pass through a sigmoid and are binarised at
    ``threshold``. Counts are accumulated over all elements jointly.
    """
    hard_pred = torch.sigmoid(pred).gt(threshold).float()
    true_pos = (hard_pred * target).sum()
    false_pos = (hard_pred * (1 - target)).sum()
    false_neg = ((1 - hard_pred) * target).sum()
    precision = true_pos / (true_pos + false_pos + 1e-6)
    recall = true_pos / (true_pos + false_neg + 1e-6)
    return precision.item(), recall.item()
def compute_hd95(pred, target, threshold=0.5, voxel_spacing=None):
    """Compute the mean 95th-percentile Hausdorff distance over a batch.

    Args:
        pred: Logits whose last two dimensions are ``H, W`` (for example
            ``[B, 1, H, W]``).
        target: Ground truth with the same number of ``H x W`` slices.
        threshold: Probability threshold applied after the sigmoid.
        voxel_spacing: Unused; kept for signature compatibility.

    Returns:
        The per-sample HD95 averaged over all samples, as a float.

    Raises:
        ValueError: If prediction and target do not hold the same number of
            equally sized 2D slices.
    """
    pred_binary = (torch.sigmoid(pred) > threshold).cpu().numpy()
    target_binary = target.detach().cpu().numpy()
    # Fold every leading dimension into one sample axis. Unlike squeeze(),
    # this cannot mistake a size-1 spatial dimension for a batch/channel one.
    height, width = pred_binary.shape[-2:]
    pred_samples = pred_binary.reshape(-1, height, width)
    target_samples = target_binary.reshape(-1, height, width)
    if pred_samples.shape != target_samples.shape:
        raise ValueError(
            f"Shape mismatch: pred {pred_binary.shape} vs target {target_binary.shape}"
        )
    hd95_values = [
        _compute_hd95_single(pred_slice, target_slice)
        for pred_slice, target_slice in zip(pred_samples, target_samples)
    ]
    return float(np.mean(hd95_values))


def _compute_hd95_single(pred, target):
    """Compute HD95 for a single 2D image.

    Border pixels are the mask minus its binary erosion. The distances from
    each border set to the other are pooled and their 95th percentile is
    returned. An empty mask on either side yields the fixed penalty 100.0.
    """
    # Function-scope import: uses the public scipy.ndimage API instead of the
    # deprecated scipy.ndimage.morphology namespace.
    from scipy import ndimage

    pred_mask = np.asarray(pred).astype(bool)
    target_mask = np.asarray(target).astype(bool)
    if not pred_mask.any() or not target_mask.any():
        return 100.0  # high sentinel value when either mask is empty

    pred_border = pred_mask & ~ndimage.binary_erosion(pred_mask)
    target_border = target_mask & ~ndimage.binary_erosion(target_mask)
    if not pred_border.any() or not target_border.any():
        return 100.0

    # The Euclidean distance transform of the complement gives, for every
    # pixel, the exact distance to the nearest border pixel. Sampling it at
    # the other border replaces the former per-point Python loops.
    dist_to_target = ndimage.distance_transform_edt(~target_border)
    dist_to_pred = ndimage.distance_transform_edt(~pred_border)
    all_distances = np.concatenate(
        [dist_to_target[pred_border], dist_to_pred[target_border]]
    )
    return float(np.percentile(all_distances, 95))
def compute_all_metrics(pred, target, threshold=0.5):
    """Evaluate every metric on one prediction/target pair.

    Returns:
        Dict with the keys ``'dice'``, ``'iou'``, ``'precision'``,
        ``'recall'`` and ``'hd95'``.
    """
    metrics = {
        'dice': compute_dice(pred, target, threshold),
        'iou': compute_iou(pred, target, threshold),
    }
    metrics['precision'], metrics['recall'] = compute_precision_recall(pred, target, threshold)
    metrics['hd95'] = compute_hd95(pred, target, threshold)
    return metrics
def evaluate(decoder, stem, encoder, loader, device):
    """Run the full model over ``loader`` and summarise the metrics.

    All three modules are switched to eval mode and gradients are disabled.
    One metrics entry is collected per batch, so the reported mean and
    standard deviation are taken across batches.

    Returns:
        Dict mapping each metric name to its mean, plus ``'<name>_std'``
        entries with the standard deviation.
    """
    for module in (decoder, stem, encoder):
        module.eval()

    metric_names = ('dice', 'iou', 'precision', 'recall', 'hd95')
    per_batch = {name: [] for name in metric_names}

    with torch.no_grad():
        for images, masks in tqdm(loader, desc="Evaluating"):
            images = images.to(device)
            masks = masks.to(device)
            logits = decoder(encoder(images), stem(images))
            batch_metrics = compute_all_metrics(logits, masks)
            for name, values in per_batch.items():
                values.append(batch_metrics[name])

    # Collapse the per-batch lists into mean/std pairs.
    results = {}
    for name, values in per_batch.items():
        results[name] = np.mean(values)
        results[f'{name}_std'] = np.std(values)
    return results
+# ============================================================================
+# TRAINING FUNCTION
+# ============================================================================
def train_model(decoder, stem, encoder, train_loader, val_loader, config):
    """Train the stem and decoder on top of the frozen encoder.

    Every epoch runs one pass over ``train_loader`` (the encoder is called
    under ``no_grad`` in eval mode), then evaluates on ``val_loader``. The
    learning rate follows cosine annealing with warm restarts and is stepped
    after every batch. Whenever the composite validation score improves, a
    checkpoint is written to ``config.save_dir``.

    Args:
        decoder: Trainable decoder module.
        stem: Trainable skip-feature module.
        encoder: Feature extractor; its parameters are not optimised here.
        train_loader: Loader yielding ``(images, masks)`` training batches.
        val_loader: Loader passed to ``evaluate`` after each epoch.
        config: Object providing the hyper-parameters read below.

    Returns:
        Tuple ``(history, best_score)``; ``history`` holds the per-epoch
        training loss, validation metrics dict and last learning rate.
    """
    device = config.device
    best_score = -float('inf')

    criterion = FocalDiceBoundaryLoss(
        focal_weight=config.focal_weight,
        dice_weight=config.dice_weight,
        boundary_weight=config.boundary_weight,
    )

    # Only the decoder and the stem are handed to the optimiser.
    trainable_params = list(decoder.parameters()) + list(stem.parameters())
    optimizer = AdamW(
        trainable_params,
        lr=config.learning_rate,
        weight_decay=config.weight_decay,
    )
    scheduler = CosineAnnealingWarmRestarts(
        optimizer,
        T_0=config.T_0,
        T_mult=config.T_mult,
        eta_min=config.min_lr,
    )

    history = {'train_loss': [], 'val_metrics': [], 'lr': []}
    checkpoint_path = os.path.join(config.save_dir, "best_unet_model.pth")
    separator = '=' * 60

    for epoch in range(config.num_epochs):
        # ---- training pass ----
        decoder.train()
        stem.train()
        encoder.eval()
        running_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.num_epochs}")
        for batch_idx, (images, masks) in enumerate(progress_bar):
            images = images.to(device)
            masks = masks.to(device)

            # No graph is built for the encoder.
            with torch.no_grad():
                vit_features = encoder(images)
            skip_features = stem(images)
            logits = decoder(vit_features, skip_features)
            loss = criterion(logits, masks)

            optimizer.zero_grad()
            loss.backward()
            # Gradient norms are clipped separately for each trainable module.
            for module in (decoder, stem):
                torch.nn.utils.clip_grad_norm_(module.parameters(), max_norm=1.0)
            optimizer.step()
            # A fractional epoch index lets the cosine schedule advance per batch.
            scheduler.step(epoch + batch_idx / len(train_loader))

            batch_loss = loss.item()
            running_loss += batch_loss
            current_lr = optimizer.param_groups[0]['lr']
            progress_bar.set_postfix({'loss': batch_loss, 'lr': f'{current_lr:.2e}'})
        avg_loss = running_loss / len(train_loader)

        # ---- validation pass ----
        val_metrics = evaluate(decoder, stem, encoder, val_loader, device)

        history['train_loss'].append(avg_loss)
        history['val_metrics'].append(val_metrics)
        history['lr'].append(current_lr)

        # Composite score: reward Dice and IoU, penalise HD95 (scaled by 100
        # and capped at 1).
        hd95_penalty = min(val_metrics['hd95'] / 100.0, 1.0)
        current_score = (0.6 * val_metrics['dice'] +
                         0.3 * val_metrics['iou'] -
                         0.1 * hd95_penalty)
        if current_score > best_score:
            best_score = current_score
            print(f"✓ Saved new best model with Dice: {val_metrics['dice']:.4f}, "
                  f"IoU: {val_metrics['iou']:.4f}, HD95: {val_metrics['hd95']:.2f}")
            checkpoint = {
                'epoch': epoch,
                'decoder_state_dict': decoder.state_dict(),
                'stem_state_dict': stem.state_dict(),
                'encoder_state_dict': encoder.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_score': best_score,
                'config': config,
            }
            torch.save(checkpoint, checkpoint_path)
            print(f"✓ Saved new best model with Score: {best_score:.4f}")

        # ---- epoch summary ----
        print(f"\n{separator}")
        print(f"Epoch {epoch+1}/{config.num_epochs} Summary:")
        print(f"  Learning Rate: {current_lr:.6f}")
        print(f"  Train Loss: {avg_loss:.4f}")
        for label, key in (("Dice", 'dice'), ("IoU", 'iou'), ("Precision", 'precision'),
                           ("Recall", 'recall'), ("HD95", 'hd95')):
            print(f"  Val {label}: {val_metrics[key]:.4f} ± {val_metrics[key + '_std']:.4f}")
        print(f"{separator}\n")
    return history, best_score
+# ============================================================================
+# VISUALIZATION
+# ============================================================================
def visualize_predictions(decoder, stem, encoder, dataset, device, num_samples=5,
                         save_path="predictions.png", subset_name="Test"):
    """Visualize sample predictions with all metrics.

    Randomly picks items from ``dataset`` and renders one row per item:
    input image, ground truth, binarised prediction, overlay and a text panel
    with Dice / IoU / HD95. The figure is written to ``save_path``.

    Args:
        decoder, stem, encoder: Model parts; all are put into eval mode.
        dataset: Indexable dataset returning ``(image, mask)`` tensors.
        device: Device the single-image batches are moved to.
        num_samples: Requested number of rows; capped at ``len(dataset)``.
        save_path: Output image path.
        subset_name: Name used in the figure title.
    """
    decoder.eval()
    stem.eval()
    encoder.eval()

    # Sampling without replacement cannot draw more items than exist, so cap
    # the request instead of letting np.random.choice raise.
    num_samples = min(num_samples, len(dataset))
    if num_samples <= 0:
        print(f"No samples available to visualize for the {subset_name} set")
        return

    # Five columns: image, mask, prediction, overlay, metrics.
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    fig, axes = plt.subplots(num_samples, 5, figsize=(20, 4*num_samples), squeeze=False)
    indices = np.random.choice(len(dataset), num_samples, replace=False)

    # Normalisation statistics shaped for H x W x C broadcasting.
    mean = np.array(config.mean).reshape(1, 1, 3)
    std = np.array(config.std).reshape(1, 1, 3)

    with torch.no_grad():
        for row_axes, idx in zip(axes, indices):
            image, mask = dataset[idx]
            image_batch = image.unsqueeze(0).to(device)
            mask_np = mask.cpu().numpy().squeeze()

            vit_features = encoder(image_batch)
            skip = stem(image_batch)
            logits = decoder(vit_features, skip)
            pred = torch.sigmoid(logits).cpu().numpy().squeeze()
            pred_binary = (pred > 0.5).astype(np.float32)

            metrics = compute_all_metrics(logits, mask.to(device))

            # Undo the input normalisation for display.
            img_display = image.cpu().squeeze().permute(1, 2, 0).numpy()
            img_display = np.clip(img_display * std + mean, 0, 1)

            # Blend a red tint over the predicted region.
            overlay = img_display.copy()
            overlay[pred_binary > 0.5] = [1, 0, 0]
            overlay = 0.7 * img_display + 0.3 * overlay

            panels = (
                (img_display, "Input Image", {}),
                (mask_np, "Ground Truth", {'cmap': 'gray'}),
                (pred_binary, "Prediction", {'cmap': 'gray'}),
                (overlay, "Overlay", {}),
            )
            for ax, (data, title, imshow_kwargs) in zip(row_axes, panels):
                ax.imshow(data, **imshow_kwargs)
                ax.set_title(title)
                ax.axis('off')

            metrics_text = f"Dice: {metrics['dice']:.3f}\nIoU: {metrics['iou']:.3f}\nHD95: {metrics['hd95']:.1f}"
            text_ax = row_axes[4]
            text_ax.text(0.1, 0.5, metrics_text, fontsize=12, verticalalignment='center',
                         transform=text_ax.transAxes)
            text_ax.axis('off')

    plt.suptitle(f"{subset_name} Set - Sample Predictions", fontsize=16, y=1.02)
    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches='tight')
    plt.close()
    print(f"Visualization saved to {save_path}")
+# ============================================================================
+# MAIN PIPELINE
+# ============================================================================
def load_and_prepare_data(config):
    """Load Kvasir-SEG dataset and create train/val/test splits.

    Looks for ``images/`` and ``masks/`` below ``config.dataset_path``,
    collects ``*.jpg`` files (falling back to ``*.png`` when no JPEG images
    exist), pairs images and masks by sorted order and splits the pairs with
    a fixed random state.

    Args:
        config: Object providing ``dataset_path``, ``val_split`` and
            ``test_split``.

    Returns:
        Three ``(image_paths, mask_paths)`` tuples of lists, for train,
        validation and test.

    Raises:
        FileNotFoundError: If no image files are found.
        ValueError: If the number of images and masks differs.
    """
    images_path = os.path.join(config.dataset_path, "images")
    masks_path = os.path.join(config.dataset_path, "masks")
    if not os.path.exists(images_path):
        # NOTE(review): with this flat-layout fallback both globs run on the
        # same directory, so images and masks resolve to the same files —
        # confirm this is intended.
        images_path = config.dataset_path
        masks_path = config.dataset_path

    image_files = sorted(glob.glob(os.path.join(images_path, "*.jpg")))
    mask_files = sorted(glob.glob(os.path.join(masks_path, "*.jpg")))
    if not image_files:
        image_files = sorted(glob.glob(os.path.join(images_path, "*.png")))
        mask_files = sorted(glob.glob(os.path.join(masks_path, "*.png")))
    print(f"Found {len(image_files)} images and {len(mask_files)} masks")

    if not image_files:
        raise FileNotFoundError(f"No images found in {config.dataset_path}")
    # Explicit raise instead of `assert`: assertions are removed under
    # `python -O`, which would let mismatched pairs through silently.
    if len(image_files) != len(mask_files):
        raise ValueError(f"Mismatch: {len(image_files)} images vs {len(mask_files)} masks")

    # First carve off val+test together, then divide that remainder.
    holdout_fraction = config.val_split + config.test_split
    train_files, temp_files = train_test_split(
        list(zip(image_files, mask_files)),
        test_size=holdout_fraction,
        random_state=42
    )
    val_files, test_files = train_test_split(
        temp_files,
        test_size=config.test_split / holdout_fraction,
        random_state=42
    )

    train_images, train_masks = zip(*train_files) if train_files else ([], [])
    val_images, val_masks = zip(*val_files) if val_files else ([], [])
    test_images, test_masks = zip(*test_files) if test_files else ([], [])
    print(f"Train: {len(train_images)}, Val: {len(val_images)}, Test: {len(test_images)}")
    return (list(train_images), list(train_masks)), (list(val_images), list(val_masks)), (list(test_images), list(test_masks))
def plot_training_history(history, save_dir):
    """Plot the training curves and dump them to CSV.

    Writes ``training_history.png`` (a 2x3 grid: loss, learning rate, Dice,
    IoU, HD95, precision/recall) and ``training_history.csv`` into
    ``save_dir``.

    Args:
        history: Dict with ``'train_loss'``, ``'lr'`` and ``'val_metrics'``
            (one metrics dict per epoch).
        save_dir: Directory receiving both files.
    """
    epochs = range(1, len(history['train_loss']) + 1)

    # Pull each validation metric out of the per-epoch dicts.
    val_dice, val_iou, val_hd95, val_precision, val_recall = (
        [epoch_metrics[name] for epoch_metrics in history['val_metrics']]
        for name in ('dice', 'iou', 'hd95', 'precision', 'recall')
    )

    fig, axes = plt.subplots(2, 3, figsize=(18, 10))

    def decorate(ax, title, ylabel, legend=True, log_scale=False):
        """Apply the title, axis labels, grid and optional legend to a panel."""
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(ylabel)
        if log_scale:
            ax.set_yscale('log')
        ax.grid(True)
        if legend:
            ax.legend()

    loss_ax, lr_ax, dice_ax = axes[0, 0], axes[0, 1], axes[0, 2]
    iou_ax, hd95_ax, pr_ax = axes[1, 0], axes[1, 1], axes[1, 2]

    loss_ax.plot(epochs, history['train_loss'], 'b-', label='Train Loss')
    decorate(loss_ax, 'Training Loss', 'Loss')

    # The learning-rate panel uses a log axis and has no legend.
    lr_ax.plot(epochs, history['lr'], 'g-')
    decorate(lr_ax, 'Learning Rate', 'LR', legend=False, log_scale=True)

    dice_ax.plot(epochs, val_dice, 'r-', label='Val Dice')
    decorate(dice_ax, 'Validation Dice', 'Dice')

    iou_ax.plot(epochs, val_iou, 'm-', label='Val IoU')
    decorate(iou_ax, 'Validation IoU', 'IoU')

    hd95_ax.plot(epochs, val_hd95, 'c-', label='Val HD95')
    decorate(hd95_ax, 'Validation HD95', 'HD95 (pixels)')

    pr_ax.plot(epochs, val_precision, 'orange', label='Precision')
    pr_ax.plot(epochs, val_recall, 'purple', label='Recall')
    decorate(pr_ax, 'Validation Precision & Recall', 'Value')

    plt.tight_layout()
    plt.savefig(os.path.join(save_dir, 'training_history.png'), dpi=150, bbox_inches='tight')
    plt.close()

    # Same series, one row per epoch, as a table.
    columns = {
        'epoch': epochs,
        'train_loss': history['train_loss'],
        'val_dice': val_dice,
        'val_iou': val_iou,
        'val_hd95': val_hd95,
        'val_precision': val_precision,
        'val_recall': val_recall,
        'lr': history['lr'],
    }
    history_df = pd.DataFrame(columns)
    history_df.to_csv(os.path.join(save_dir, 'training_history.csv'), index=False)
+def main():
+    print("=" * 60)
+    print("DINOv3 Polyp Segmentation Training - With HD95 & Cosine Annealing")
+    print("=" * 60)
+    # Load data
+    print("\n1. Loading dataset...")
+    train_data, val_data, test_data = load_and_prepare_data(config)
+    # Data augmentations
+    train_transform = A.Compose([
+        A.Resize(config.image_size, config.image_size),
+        A.RandomRotate90(p=0.5),
+        A.HorizontalFlip(p=0.5),
+        A.VerticalFlip(p=0.5),
+        A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=15, p=0.5),
+        A.OneOf([
+            A.MotionBlur(p=0.2),
+            A.GaussianBlur(blur_limit=3, p=0.2),
+        ], p=0.3),
+        A.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05, p=0.3),
+        ToTensorV2(),
+    ])
+    val_transform = A.Compose([
+        A.Resize(config.image_size, config.image_size),
+        ToTensorV2(),
+    ])
+    # Create datasets
+    train_dataset = PolypDataset(
+        train_data[0], train_data[1],
+        transform=train_transform,
+        target_size=(config.image_size, config.image_size)
+    )
+    val_dataset = PolypDataset(
+        val_data[0], val_data[1],
+        transform=val_transform,
+        target_size=(config.image_size, config.image_size)
+    )
+    test_dataset = PolypDataset(
+        test_data[0], test_data[1],
+        transform=val_transform,
+        target_size=(config.image_size, config.image_size)
+    )
+    # Dataloaders
+    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=4, pin_memory=True)
+    val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False, num_workers=4, pin_memory=True)
+    test_loader = DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False, num_workers=4, pin_memory=True)
+    print(f"\n2. Initializing DINOv3 encoder...")
+    encoder = DINOv3Encoder(
+        model_name=config.model_name,
+        local_path=config.local_model_path,
+        freeze=True,
+        layers=config.multi_scale_layers
+    ).to(config.device)
+    # Test encoder
+    print("   Testing encoder with sample batch...")
+    sample_images, _ = next(iter(train_loader))
+    sample_images = sample_images.to(config.device)
+    with torch.no_grad():
+        sample_features = encoder(sample_images)
+    print(f"   Encoder output shape: {sample_features.shape}")
+    print("\n3. Building U‑Net decoder with skip connections...")
+    stem = ShallowStem(in_channels=3, base_channels=64).to(config.device)
+    decoder = UNetDecoder(
+        vit_channels=encoder.out_channels,
+        stem_channels=[512, 256, 128],
+        num_classes=1
+    ).to(config.device)
+    trainable = sum(p.numel() for p in decoder.parameters()) + sum(p.numel() for p in stem.parameters())
+    print(f"   Trainable parameters (stem + decoder): {trainable:,}")
+    print("\n4. Starting training with Cosine Annealing Warm Restarts...")
+    print(f"   Initial LR: {config.learning_rate:.6f}")
+    print(f"   T_0: {config.T_0}, T_mult: {config.T_mult}")
+    print(f"   Min LR: {config.min_lr:.6f}")
+    history, best_score = train_model(decoder, stem, encoder, train_loader, val_loader, config)
+    print(f"\n✓ Training complete! Best validation Score: {best_score:.4f}")
+    # Final evaluation on all sets
+    print("\n5. Final evaluation on all sets...")
+    # Load best model for final evaluation
+    checkpoint = torch.load(os.path.join(config.save_dir, "best_unet_model.pth"),weights_only=False)
+    decoder.load_state_dict(checkpoint['decoder_state_dict'])
+    stem.load_state_dict(checkpoint['stem_state_dict'])
+    # Evaluate on all splits
+    print("\nEvaluating on Training Set...")
+    train_metrics = evaluate(decoder, stem, encoder, train_loader, config.device)
+    print("Evaluating on Validation Set...")
+    val_metrics = evaluate(decoder, stem, encoder, val_loader, config.device)
+    print("Evaluating on Test Set...")
+    test_metrics = evaluate(decoder, stem, encoder, test_loader, config.device)
+    # Print comprehensive results
+    print("\n" + "=" * 80)
+    print("FINAL RESULTS - ALL METRICS")
+    print("=" * 80)
+    print(f"\n{'Metric':<15} {'Train':<20} {'Validation':<20} {'Test':<20}")
+    print("-" * 75)
+    for metric in ['dice', 'iou', 'precision', 'recall', 'hd95']:
+        print(f"{metric.upper():<15} "
+              f"{train_metrics[metric]:.4f} ± {train_metrics[f'{metric}_std']:.4f}    "
+              f"{val_metrics[metric]:.4f} ± {val_metrics[f'{metric}_std']:.4f}    "
+              f"{test_metrics[metric]:.4f} ± {test_metrics[f'{metric}_std']:.4f}")
+    print("=" * 80)
+    # Plot training history
+    print("\n6. Plotting training history...")
+    plot_training_history(history, config.save_dir)
+    # Visualize predictions for all subsets
+    print("\n7. Generating visualizations for all subsets...")
+    visualize_predictions(decoder, stem, encoder, train_dataset, config.device,
+                         num_samples=5, save_path=os.path.join(config.save_dir, "train_predictions.png"),
+                         subset_name="Training")
+    visualize_predictions(decoder, stem, encoder, val_dataset, config.device,
+                         num_samples=5, save_path=os.path.join(config.save_dir, "val_predictions.png"),
+                         subset_name="Validation")
+    visualize_predictions(decoder, stem, encoder, test_dataset, config.device,
+                         num_samples=5, save_path=os.path.join(config.save_dir, "test_predictions.png"),
+                         subset_name="Test")
+    # Save comprehensive results
+    results = {
+        'best_val_score': float(best_score),
+        'final_epoch': len(history['train_loss']),
+        'train_metrics': {k: float(v) for k, v in train_metrics.items()},
+        'val_metrics': {k: float(v) for k, v in val_metrics.items()},
+        'test_metrics': {k: float(v) for k, v in test_metrics.items()},
+        'training_history': {
+            'train_loss': [float(x) for x in history['train_loss']],
+            'lr': [float(x) for x in history['lr']],
+            'val_metrics': [{k: float(v) for k, v in m.items()} for m in history['val_metrics']]
+        },
+        'config': {
+            'model_name': config.model_name,
+            'image_size': config.image_size,
+            'batch_size': config.batch_size,
+            'num_epochs': config.num_epochs,
+            'learning_rate': config.learning_rate,
+            'min_lr': config.min_lr,
+            'T_0': config.T_0,
+            'T_mult': config.T_mult,
+            'scheduler': 'CosineAnnealingWarmRestarts',
+            'focal_weight': config.focal_weight,
+            'dice_weight': config.dice_weight,
+            'multi_scale_layers': config.multi_scale_layers
+        }
+    }
+    # Save as JSON
+    with open(os.path.join(config.save_dir, "comprehensive_results.json"), 'w') as f:
+        json.dump(results, f, indent=2)
+    # Save as formatted text report
+    with open(os.path.join(config.save_dir, "results_report.txt"), 'w') as f:
+        f.write("=" * 80 + "\n")
+        f.write("DINOv3 POLYP SEGMENTATION - FINAL REPORT\n")
+        f.write("=" * 80 + "\n\n")
+        f.write(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
+        f.write("CONFIGURATION:\n")
+        f.write("-" * 40 + "\n")
+        for key, value in results['config'].items():
+            f.write(f"  {key}: {value}\n")
+        f.write("\n\nFINAL METRICS:\n")
+        f.write("-" * 40 + "\n")
+        f.write(f"{'Metric':<15} {'Train':<25} {'Validation':<25} {'Test':<25}\n")
+        f.write("-" * 90 + "\n")
+        for metric in ['dice', 'iou', 'precision', 'recall', 'hd95']:
+            f.write(f"{metric.upper():<15} "
+                   f"{train_metrics[metric]:.4f} ± {train_metrics[f'{metric}_std']:.4f}        "
+                   f"{val_metrics[metric]:.4f} ± {val_metrics[f'{metric}_std']:.4f}        "
+                   f"{test_metrics[metric]:.4f} ± {test_metrics[f'{metric}_std']:.4f}\n")
+        f.write("\n\nBest Validation Score (Dice+IoU-HD95/100): {:.4f}\n".format(best_score))
+        f.write("Training completed at epoch: {}\n".format(len(history['train_loss'])))
+    print(f"\n✓ Comprehensive results saved to {config.save_dir}/")
+    print(f"  - comprehensive_results.json")
+    print(f"  - results_report.txt")
+    print(f"  - training_history.csv")
+    print(f"  - training_history.png")
+    print(f"  - train_predictions.png")
+    print(f"  - val_predictions.png")
+    print(f"  - test_predictions.png")
+    print("\n🎉 Enhanced training pipeline complete!")
+if __name__ == "__main__":  # run the pipeline only when executed as a script, not on import
+    main()