| import torch |
| from torch.utils.data import DataLoader, random_split, Subset |
| from torch.cuda.amp import autocast, GradScaler |
| from tqdm import tqdm |
| import numpy as np |
| import os |
| import datetime |
| import pandas as pd |
| import matplotlib.pyplot as plt |
| import math |
| import joblib |
|
|
| from dataloader import MultiHouseDataset |
| from hierarchical_diffusion_model import HierarchicalDiffusionModel |
|
|
# Select the fastest available compute backend, preferring CUDA > MPS > CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
    # Let cuDNN auto-tune conv algorithms — helps when input shapes are fixed.
    torch.backends.cudnn.benchmark = True
    # Allow TF32 matmuls on Ampere+ GPUs (faster, slightly reduced precision).
    torch.backends.cuda.matmul.allow_tf32 = True
    print("Using NVIDIA CUDA backend.")
elif torch.backends.mps.is_available():
    DEVICE = "mps"
    print("Using Apple MPS backend.")
else:
    DEVICE = "cpu"
    print("Using CPU.")
|
|
# --- Training hyperparameters ---
EPOCHS = 200
LEARNING_RATE = 1e-4
BATCH_SIZE = 512
USE_AMP = True  # automatic mixed precision (only effective on CUDA, see GradScaler usage)
GRADIENT_CLIP_VAL = 0.1  # max global gradient norm for clip_grad_norm_


# Length of each training window; must be a label accepted by
# calculate_window_size() below.
WINDOW_DURATION = '14_days'


# --- Data / model configuration ---
DATA_DIRECTORY = './data/per_house'
# NOTE(review): os.cpu_count() can return None on some platforms, which would
# raise a TypeError here — confirm deployment targets.
NUM_WORKERS = os.cpu_count() // 2
PIN_MEMORY = True  # pinned host memory enables non_blocking GPU transfers
USE_ATTENTION = True
DROPOUT = 0.1
HIDDEN_SIZE = 512  # widest U-Net level; lower levels use /4 and /2 of this
EMBEDDING_DIM = 64
DIFFUSION_TIMESTEPS = 500  # number of diffusion noise steps
DOWNSCALE_FACTOR = 4
|
|
def calculate_window_size(duration: str) -> int:
    """Convert a duration label such as ``'14_days'`` into a sample count.

    The data is half-hourly (48 samples per day), so ``'<N>_days'`` maps to
    ``N * 48`` samples. Generalized from a fixed five-entry lookup table to
    accept any positive integer day count; all previously supported labels
    ('2_days', '7_days', '14_days', '15_days', '30_days') return identical
    values.

    Args:
        duration: Label of the form ``'<N>_days'`` (e.g. ``'2_days'``).

    Returns:
        Number of half-hourly samples covered by the duration.

    Raises:
        ValueError: If ``duration`` is not ``'<N>_days'`` with N >= 1.
    """
    SAMPLES_PER_DAY = 48  # half-hour sampling resolution
    parts = duration.split('_')
    # isascii() guards against exotic unicode digits that isdigit() accepts
    # but int() rejects.
    if (len(parts) == 2 and parts[1] == 'days'
            and parts[0].isascii() and parts[0].isdigit()
            and int(parts[0]) > 0):
        return int(parts[0]) * SAMPLES_PER_DAY
    raise ValueError(f"Invalid WINDOW_DURATION: {duration}")
|
|
def denormalize_data(normalized_data, scaler_path='global_scaler.gz'):
    """Invert the global normalization using a persisted scaler.

    Accepts a 2-D ``(samples, features)`` array or a 3-D
    ``(batch, seq_len, features)`` array. 3-D input is collapsed to 2-D for
    the scaler (which only understands 2-D) and restored afterwards.
    """
    scaler = joblib.load(scaler_path)
    shape = normalized_data.shape
    # 2-D (or other non-3-D) input can be handed to the scaler directly.
    if len(shape) != 3:
        return scaler.inverse_transform(normalized_data)
    # Collapse (batch, seq_len) into one axis, transform, then restore.
    n_features = shape[-1]
    flat = normalized_data.reshape(-1, n_features)
    restored = scaler.inverse_transform(flat)
    return restored.reshape(shape)
|
|
def moving_average(data, window_size):
    """Return the 'valid'-mode sliding mean of a 1-D sequence."""
    window = np.ones(window_size)
    window_sums = np.convolve(data, window, 'valid')
    return window_sums / window_size
|
|
def save_and_plot_loss(loss_dict, title, filepath, window_size=10):
    """Persist each loss series as a CSV and plot raw plus smoothed curves.

    One CSV per series is written to ``<filepath>_<label>.csv`` (label
    lower-cased, spaces replaced with underscores) and a single combined
    figure is saved to ``<filepath>.png``.
    """
    plt.figure(figsize=(12, 6))
    for label, losses in loss_dict.items():
        csv_path = f"{filepath}_{label.lower().replace(' ', '_')}.csv"
        pd.DataFrame({label: losses}).to_csv(csv_path, index=False)
        plt.plot(losses, label=f'Raw {label}', alpha=0.3)
        # Overlay a moving-average curve once there are enough points;
        # the x-offset aligns the smoothed series with the raw epochs.
        if len(losses) > window_size:
            smoothed = moving_average(losses, window_size)
            xs = np.arange(window_size - 1, len(losses))
            plt.plot(xs, smoothed, label=f'Smoothed {label}')
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.savefig(f"{filepath}.png")
    plt.close()
    print(f" Loss plot saved to {filepath}.png")
|
|
def train_diffusion(log_dir: str, model_save_path: str):
    """Train the HierarchicalDiffusionModel and checkpoint the best weights.

    Builds overlapping-window datasets, performs a 90/10 train/validation
    split, and runs the mixed-precision training loop with gradient clipping
    and cosine-annealed learning rate. The model with the lowest validation
    loss is saved to ``model_save_path``; loss curves and per-series CSVs are
    written under ``log_dir``.

    Args:
        log_dir: Directory that receives loss plots and CSV logs.
        model_save_path: Destination file for the best state_dict checkpoint.

    Returns:
        The full (unsplit) MultiHouseDataset, so callers can reuse it.
    """
    print("--- Starting Hierarchical Diffusion Training ---")
    window_size = calculate_window_size(WINDOW_DURATION)
    print(f"Using window duration: {WINDOW_DURATION} ({window_size} samples)")


    # Consecutive windows overlap by 50% (step = window/2).
    dataset = MultiHouseDataset(
        data_dir=DATA_DIRECTORY,
        window_size=window_size,
        step_size=window_size//2,
        limit_to_one_year=False
    )
    print(f"Dataset loaded: {len(dataset)} samples, {dataset.num_houses} houses, {dataset[0][0].shape[1]} features.")


    # Random 90/10 split. NOTE(review): random_split is unseeded here, so the
    # split differs between runs — consider a fixed generator for
    # reproducibility.
    val_split = 0.1
    val_size = int(len(dataset) * val_split)
    train_size = len(dataset) - val_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    print(f"Train size: {train_size}, Validation size: {val_size}")


    # drop_last=True keeps every training batch the same size; validation can
    # use a doubled batch since no activations are retained for backprop.
    train_dataloader = DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True,
        num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY, drop_last=True
    )
    val_dataloader = DataLoader(
        val_dataset, batch_size=BATCH_SIZE*2, shuffle=False,
        num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
    )

    # Per-channel loss weights passed to the model; channel index 1 is
    # up-weighted 8x. NOTE(review): confirm which feature channel index 1
    # corresponds to in MultiHouseDataset's feature ordering.
    channel_weights = torch.tensor([1.0, 8.0, 1.0, 1.0], device=DEVICE)
    print(f"Using channel weights: {channel_weights}")


    # Input channel count and house count are inferred from the dataset.
    model = HierarchicalDiffusionModel(
        in_channels=dataset[0][0].shape[1],
        num_houses=dataset.num_houses,
        downscale_factor=DOWNSCALE_FACTOR,
        channel_weights=channel_weights,
        embedding_dim=EMBEDDING_DIM,
        hidden_dims=[HIDDEN_SIZE // 4, HIDDEN_SIZE // 2, HIDDEN_SIZE],
        dropout=DROPOUT,
        use_attention=USE_ATTENTION,
        num_timesteps=DIFFUSION_TIMESTEPS,
        blocks_per_level=3
    ).to(DEVICE)

    # Cosine LR decay over the full run; GradScaler is a no-op when disabled
    # (non-CUDA backends or USE_AMP=False).
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
    scaler = GradScaler(enabled=(USE_AMP and DEVICE == "cuda"))

    train_losses, val_losses = [], []
    best_val_loss = float('inf')


    print(f"Starting training for {EPOCHS} epochs...")
    for epoch in range(EPOCHS):
        model.train()
        total_train_loss = 0.0
        pbar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{EPOCHS} (Train)")

        for clean_data, conditions in pbar:
            # non_blocking only has effect when host memory is pinned.
            clean_data = clean_data.to(DEVICE, non_blocking=PIN_MEMORY)
            conditions = {k: v.to(DEVICE, non_blocking=PIN_MEMORY) for k, v in conditions.items()}

            optimizer.zero_grad(set_to_none=True)
            # The model's forward pass returns the training loss directly.
            with autocast(enabled=(USE_AMP and DEVICE == "cuda")):
                loss = model(clean_data, conditions)


            # Unscale before clipping so GRADIENT_CLIP_VAL applies to the
            # true (unscaled) gradient norms.
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP_VAL)
            scaler.step(optimizer)
            scaler.update()

            total_train_loss += loss.item()
            pbar.set_postfix({'loss': f'{loss.item():.6f}', 'lr': f'{scheduler.get_last_lr()[0]:.2e}'})

        avg_train_loss = total_train_loss / len(train_dataloader)
        train_losses.append(avg_train_loss)

        # --- Validation pass (no gradients) ---
        model.eval()
        total_val_loss = 0.0
        with torch.no_grad():
            for clean_data, conditions in tqdm(val_dataloader, desc="Validating"):
                clean_data = clean_data.to(DEVICE, non_blocking=PIN_MEMORY)
                conditions = {k: v.to(DEVICE, non_blocking=PIN_MEMORY) for k, v in conditions.items()}
                with autocast(enabled=(USE_AMP and DEVICE == "cuda")):
                    loss = model(clean_data, conditions)
                total_val_loss += loss.item()

        avg_val_loss = total_val_loss / len(val_dataloader)
        val_losses.append(avg_val_loss)

        print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {avg_train_loss:.6f} | Val Loss: {avg_val_loss:.6f}")

        # Keep only the best checkpoint, judged by validation loss.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), model_save_path)
            print(f"New best model saved to {model_save_path} (Val Loss: {best_val_loss:.6f})")

        # Step the LR schedule once per epoch.
        scheduler.step()


    print("--- Training complete ---")
    save_and_plot_loss(
        {'Train Loss': train_losses, 'Validation Loss': val_losses},
        'Hierarchical Diffusion Model Training & Validation Loss',
        os.path.join(log_dir, 'diffusion_loss_curves')
    )

    return dataset
|
|
if __name__ == "__main__":
    # Every run gets its own timestamped directory under ./training_logs so
    # repeated runs never overwrite each other's checkpoints or plots.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    run_name = f"hierarchical_diffusion_{WINDOW_DURATION}_{timestamp}"
    log_dir = os.path.join("./training_logs", run_name)
    os.makedirs(log_dir, exist_ok=True)
    model_path = os.path.join(log_dir, 'best_hierarchical_model.pth')


    print(f"Starting new run: {run_name}")
    print(f"Logs and models will be saved to: {log_dir}")


    # train_diffusion returns the full dataset; it is kept here but unused.
    full_dataset = train_diffusion(log_dir=log_dir, model_save_path=model_path)

    print("\nTraining and best model saving complete.")
    print(f"Model saved to: {model_path}")
    print(f"Loss curves saved to: {os.path.join(log_dir, 'diffusion_loss_curves.png')}")