"""
train_mnist_1k_tqdm.py

Trains a tiny MNIST model (<1000 params) until convergence,
using tqdm progress bars and early stopping.
"""

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm
import numpy as np
import os
import sys

# -------------------------------
# 0. Automatic device fallback
# -------------------------------
def get_device():
    """Select the device to run on, checking that CUDA actually works.

    When CUDA reports itself as available, a small random tensor is moved to
    the GPU and average-pooled as a smoke test. Any exception raised by that
    probe is printed and the CPU device is returned instead.

    Returns:
        torch.device: ``cuda`` if the probe succeeded, otherwise ``cpu``.
    """
    cpu = torch.device('cpu')
    if not torch.cuda.is_available():
        return cpu

    try:
        # Probe with a 1x1x28x28 tensor and a 4x4 average pool, mirroring the
        # pooling step TinyMNISTModel applies to its input.
        probe = torch.randn(1, 1, 28, 28).cuda()
        torch.nn.functional.avg_pool2d(probe, 4)
    except Exception as err:
        # Deliberately broad: any failure of the probe means "use the CPU".
        print(f"GPU error: {err}\nFalling back to CPU.")
        return cpu
    return torch.device('cuda')


# Resolved once at import time; the rest of the script moves tensors here.
device = get_device()
print(f"Using device: {device}")

# -------------------------------
# 1. Model (970 parameters)
# -------------------------------
class TinyMNISTModel(nn.Module):
    """Small fully connected classifier with 970 trainable parameters.

    Pipeline: 4x4 average pooling -> flatten -> Linear(49, 16) -> ReLU ->
    Dropout(0.2) -> Linear(16, 10). The forward pass returns the raw output
    of the last linear layer (10 scores per sample); no softmax is applied.
    """

    def __init__(self):
        super().__init__()
        # Parameter-free 4x4 pooling with stride 4. fc1 expects 7*7 = 49
        # features, which is what a 1x28x28 input leaves after pooling.
        self.pool = nn.AvgPool2d(kernel_size=4, stride=4)
        self.fc1 = nn.Linear(in_features=7 * 7, out_features=16)  # 49*16 + 16 = 800 params
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(in_features=16, out_features=10)  # 16*10 + 10 = 170 params

    def forward(self, x):
        """Return a ``(batch, 10)`` tensor of class scores for the batch ``x``."""
        pooled = self.pool(x)
        # Collapse everything except the batch dimension into one feature axis.
        flat = pooled.view(pooled.size(0), -1)
        hidden = self.dropout(self.relu(self.fc1(flat)))
        return self.fc2(hidden)

# -------------------------------
# 2. Data
# -------------------------------
# Convert images to tensors, then normalize with mean 0.1307 and std 0.3081
# (the usual MNIST statistics).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])

# Both splits share the same storage location, download flag and transform.
_mnist_args = dict(root='./data', download=True, transform=transform)
full_train = torchvision.datasets.MNIST(train=True, **_mnist_args)
test_dataset = torchvision.datasets.MNIST(train=False, **_mnist_args)

# Hold out 10% of the training data for validation; the remainder is used
# for fitting. The split is random and not seeded here.
val_size = int(0.1 * len(full_train))
train_size = len(full_train) - val_size
train_dataset, val_dataset = random_split(full_train, [train_size, val_size])

batch_size = 64
# Only the training loader shuffles; validation and test keep dataset order.
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    for dataset, shuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

# -------------------------------
# 3. Training with early stopping + tqdm
# -------------------------------
model = TinyMNISTModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Early-stopping bookkeeping.
patience = 5                 # epochs without a new best validation loss before stopping
best_val_loss = float('inf')
epochs_no_improve = 0
best_model_state = None      # independent copy of the weights at the best validation loss

print(f"\n🏋️ Training until convergence (early stopping patience = {patience})\n")

# `epoch` is zero-based and is not incremented when the loop breaks, so after
# the loop `epoch + 1` is the number of epochs that were actually run.
epoch = 0
while True:
    # Training phase with tqdm (dropout active).
    model.train()
    train_loss = 0.0
    train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]", leave=False)
    for images, labels in train_bar:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_bar.set_postfix(loss=loss.item())
    train_loss /= len(train_loader)  # mean of the per-batch mean losses

    # Validation phase (dropout disabled, no gradients tracked).
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    val_bar = tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]", leave=False)
    with torch.no_grad():
        for images, labels in val_bar:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, pred = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (pred == labels).sum().item()
            val_bar.set_postfix(loss=loss.item())
    val_loss /= len(val_loader)
    val_acc = 100.0 * correct / total

    # Print progress line (outside tqdm to keep clean)
    print(f"Epoch {epoch+1:3d} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

    # Early stopping logic
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_no_improve = 0
        # state_dict() returns tensors that share storage with the live
        # parameters, and dict.copy() is only a shallow copy. Clone every
        # tensor so later optimizer steps cannot overwrite this snapshot.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"\n🛑 Early stopping after {epoch+1} epochs (no improvement for {patience} epochs).")
            break

    epoch += 1

# Restore best model. The snapshot is still None only if the validation loss
# never compared below infinity (e.g. it was NaN in every epoch); in that case
# there is nothing to restore and the final weights are kept.
if best_model_state is not None:
    model.load_state_dict(best_model_state)
else:
    print("Warning: validation loss never improved; keeping the final weights.")

# -------------------------------
# 4. Final evaluation on full test set
# -------------------------------
def evaluate(loader, name="Test"):
    """Measure classification accuracy of the module-level ``model``.

    Switches ``model`` to eval mode, runs every batch from ``loader`` through
    it on ``device`` without tracking gradients, prints the accuracy and
    returns it.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches.
        name: Label used in the progress bar and in the printed result.

    Returns:
        Accuracy as a percentage (0-100).
    """
    model.eval()
    hits = 0
    seen = 0
    with torch.no_grad():
        progress = tqdm(loader, desc=f"Evaluating on {name}", leave=False)
        for batch_images, batch_labels in progress:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            # Predicted class = index of the largest score in each row.
            predicted = torch.max(model(batch_images), 1).indices
            seen += batch_labels.size(0)
            hits += (predicted == batch_labels).sum().item()
    acc = 100.0 * hits / seen
    print(f"{name} accuracy: {acc:.2f}%")
    return acc


test_acc = evaluate(test_loader, "full test set")
# Total number of scalar weights across all parameter tensors.
total_params = sum(param.numel() for param in model.parameters())

# -------------------------------
# 5. TL;DR summary
# -------------------------------
# Build the summary box programmatically so every row is padded to the same
# inner width. The previous hand-aligned template printed placeholder text
# and its pad widths did not match the border.
_BOX_WIDTH = 58              # characters between the left and right border
_ROW_WIDTH = _BOX_WIDTH - 1  # one column is taken by the leading space
_box_rule = "═" * _BOX_WIDTH
_box_title = "TL;DR – Tiny MNIST"
_summary_rows = [
    f"Parameters: {total_params}",
    f"Training epochs until convergence: {epoch + 1}",
    f"Best validation loss: {best_val_loss:.4f}",
    f"Final test accuracy: {test_acc:.2f}%",
    f"Early stopping patience: {patience} epochs",
]

# The empty first and last entries keep the blank line before and after the
# box that the original triple-quoted string produced.
tldr = "\n".join(
    [
        "",
        f"╔{_box_rule}╗",
        f"║{_box_title:^{_BOX_WIDTH}}║",
        f"╠{_box_rule}╣",
        *(f"║ {row:<{_ROW_WIDTH}}║" for row in _summary_rows),
        f"╚{_box_rule}╝",
        "",
    ]
)
print(tldr)

# Save model
# Persist the weights currently held by `model` (state dict only).
torch.save(model.state_dict(), "mnist_1k_best.pth")

# -------------------------------
# 6. Generate README.md (HF style)
# -------------------------------
# The usage snippet passes map_location="cpu" so a checkpoint written from a
# GPU run still loads on a CPU-only machine.
readme_content = f"""---
language: en
license: apache-2.0
tags:
- mnist
- tiny-model
- tqdm
- early-stopping
---

# Tiny MNIST Classifier – with tqdm progress bars

- **Parameters**: {total_params} (<1000)
- **Test accuracy**: {test_acc:.2f}%
- **Epochs trained**: {epoch+1} (early stopping after {patience} epochs without improvement)

This script trains until convergence and shows **tqdm** progress bars for each batch.

## TL;DR

```bash
python train_mnist_1k_tqdm.py
```

## Full results

| Metric                    | Value           |
|---------------------------|-----------------|
| Total parameters          | {total_params}  |
| Best validation loss      | {best_val_loss:.4f} |
| Final test accuracy       | {test_acc:.2f}% |
| Early stopping patience   | {patience}      |
| Training epochs           | {epoch+1}       |

## Model architecture

AvgPool(4x4) → Linear(49→16) → ReLU → Dropout(0.2) → Linear(16→10)

## How to use

> **Note:** `train_mnist_1k_tqdm.py` has no `__main__` guard, so importing it
> runs the whole training script again. Copy the `TinyMNISTModel` class into
> your own module if you only want to load the saved weights.

```python
import torch
from train_mnist_1k_tqdm import TinyMNISTModel

model = TinyMNISTModel()
state_dict = torch.load("mnist_1k_best.pth", map_location="cpu")
model.load_state_dict(state_dict)
model.eval()
```
"""
# Write as UTF-8 explicitly: the text contains non-ASCII characters ("–", "→")
# that the platform default encoding may not be able to represent.
with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme_content)

print("✅ README.md generated. Model saved as mnist_1k_best.pth")