Spaces:

luohoa97
/

BitFinTrainer

Runtime error

File size: 9,355 Bytes

d5b7ee9
 
 
 
34e94cf
d5b7ee9
 
 
 
 
 
 
 
34e94cf
d5b7ee9
 
 
 
 
 
 
 
 
 
34e94cf
d5b7ee9
 
 
 
 
 
 
34e94cf
d5b7ee9
 
 
 
34e94cf
 
d5b7ee9
 
 
26f4391
 
 
 
 
e8b6287
26f4391
 
 
362e261
 
 
26f4391
 
 
 
 
 
 
 
 
 
 
 
362e261
26f4391
 
 
362e261
26f4391
e8b6287
26f4391
 
 
 
362e261
26f4391
 
d5b7ee9
b93d3aa
34e94cf
b93d3aa
34e94cf
d5b7ee9
34e94cf
 
 
 
 
d5b7ee9
 
68e57b2
d5b7ee9
34e94cf
d5b7ee9
34e94cf
 
d5b7ee9
 
34e94cf
d5b7ee9
 
 
 
b93d3aa
d5b7ee9
26f4391
 
 
3a263ff
 
26f4391
34e94cf
3a263ff
 
 
 
d5b7ee9
34e94cf
 
 
 
 
 
e8b6287
d5b7ee9
 
 
 
 
 
 
 
770d579
34e94cf
b93d3aa
d5b7ee9
 
20b4890
be03d5f
 
d5b7ee9
be03d5f
 
 
 
 
 
 
d5b7ee9
 
 
 
 
 
34e94cf
770d579
34e94cf
d5b7ee9
 
34e94cf
 
d5b7ee9
34e94cf
b93d3aa
20b4890
be03d5f
 
34e94cf
d5b7ee9
3a263ff
 
 
 
 
 
 
 
 
 
 
 
 
34e94cf
 
d5b7ee9
 
 
34e94cf
 
 
 
 
 
 
3a263ff
 
 
 
 
 
 
34e94cf
e8b6287
3a263ff
 
 
 
34e94cf
 
 
e8b6287
d5b7ee9
 
 
 
34e94cf
d5b7ee9
34e94cf
d5b7ee9
34e94cf
d5b7ee9
 
34e94cf
d5b7ee9
 
 
 
 
 
 
 
34e94cf
d5b7ee9
34e94cf
d5b7ee9

#!/usr/bin/env python3
"""
Train the BitNet AI Fusion model.
Uses ternary weights (-1, 0, 1) and 8-bit activations.
Now includes real-time PnL backtesting and Confusion Matrix logging.
"""

import sys
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm.auto import tqdm
import logging
from safetensors.torch import save_file, load_file
from huggingface_hub import HfApi, create_repo, hf_hub_download
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

# Add project root to path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from trading_cli.strategy.ai.model import create_model
from scripts.generate_ai_dataset import build_dataset, SEQ_LEN, LOOKAHEAD

# Configure logging
# Root logger at INFO; `logger` is the module-level logger for this script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hyperparameters
EPOCHS = 100  # Number of full passes over the training split
# NOTE(review): BATCH_SIZE is not referenced by the code visible in this file;
# train() derives its batch size from get_max_batch_size(), which starts its
# search from that function's own `start_batch` default.
BATCH_SIZE = 4096 # Starting point for dynamic search
LR = 0.0003  # Learning rate handed to AdamW
HIDDEN_DIM = 512  # Passed to create_model(hidden_dim=...)
LAYERS = 8  # Passed to create_model(layers=...)

# HF Configuration
# Both HF_REPO_ID and HF_TOKEN are read from the environment and are None when
# unset; train() only uploads the checkpoint when both are set.
HF_REPO_ID = os.getenv("HF_REPO_ID") # e.g. "username/BitFin"
HF_DATASET_ID = "luohoa97/BitFin" # User's dataset repo
HF_TOKEN = os.getenv("HF_TOKEN")

def get_max_batch_size(model, input_dim, seq_len, device, start_batch=128):
    """Find a batch size that survives one forward/backward pass on ``device``.

    On CPU no probing is done and a fixed batch size of 64 is returned.
    Otherwise the batch size starts at ``start_batch`` (clamped to the
    16384 ceiling) and is doubled after every successful probe until a probe
    runs out of memory or the ceiling would be exceeded. If the very first
    probe does not fit, the size is halved until one does, so the returned
    value has always been verified by an actual forward/backward pass.

    NOTE(review): the probe runs outside autocast and before any optimizer
    state exists, so the result is an estimate of what training can hold,
    not a guarantee.

    Args:
        model: Module already moved to ``device``; called as ``model(x)`` with
            ``x`` of shape ``(batch, seq_len, input_dim)`` and expected to
            return logits usable with 3-class cross-entropy.
        input_dim: Size of the last dimension of the probe input.
        seq_len: Size of the middle dimension of the probe input.
        device: ``torch.device`` the model lives on.
        start_batch: First batch size to try.

    Returns:
        The largest probed batch size that fit (64 on CPU).

    Raises:
        ValueError: If ``start_batch`` is smaller than 1 (non-CPU devices only).
        RuntimeError: If even a batch size of 1 runs out of memory, or the
            model raises any non-OOM ``RuntimeError`` during the probe.
    """
    if device.type == 'cpu':
        return 64

    if start_batch < 1:
        raise ValueError(f"start_batch must be >= 1, got {start_batch}")

    tqdm.write("🔍 Searching for optimal batch size for your GPU...")
    ceiling = 16384
    criterion = nn.CrossEntropyLoss()
    batch_size = min(start_batch, ceiling)
    last_success = None  # Largest size that actually completed a probe
    shrinking = False    # True once the starting size failed and we search downwards

    pbar = tqdm(total=ceiling, desc="Hardware Probe", unit="batch")
    try:
        while batch_size >= 1:
            mock_X = mock_y = outputs = loss = None
            hit_oom = False
            try:
                mock_X = torch.randn(batch_size, seq_len, input_dim, device=device)
                mock_y = torch.randint(0, 3, (batch_size,), device=device)
                outputs = model(mock_X)
                loss = criterion(outputs, mock_y)
                loss.backward()
            except RuntimeError as e:
                if "out of memory" not in str(e).lower():
                    raise
                # Only record the failure here; the memory is reclaimed in the
                # `finally` below, after the exception has been released.
                hit_oom = True
            finally:
                # Drop gradients and every probe tensor *before* emptying the
                # cache, otherwise the allocator cannot hand the memory back.
                model.zero_grad()
                del mock_X, mock_y, outputs, loss
                torch.cuda.empty_cache()

            if hit_oom:
                if last_success is not None:
                    tqdm.write(f"💡 GPU Hit limit at {batch_size}. Using {last_success} as optimal batch.")
                    break
                # The starting size itself does not fit: search downwards.
                shrinking = True
                failed = batch_size
                batch_size //= 2
                tqdm.write(f"💡 Batch size {failed} does not fit. Retrying with {batch_size}.")
                continue

            # The bar tracks the largest verified size, so it never overshoots.
            pbar.update(batch_size - (last_success or 0))
            last_success = batch_size
            if shrinking or batch_size * 2 > ceiling:
                break
            batch_size *= 2
    finally:
        pbar.close()

    if last_success is None:
        raise RuntimeError("Out of memory even with a batch size of 1; the model does not fit on this device.")
    return last_success

def train():
    """Train the BitNet model, back-test it every epoch, and publish the best checkpoint.

    Steps:
      1. Make sure ``data/trading_dataset.pt`` exists (download it from
         ``HF_DATASET_ID``; if that fails, fall back to ``build_dataset()``)
         and load the ``X``, ``y`` and ``rets`` tensors from it.
      2. Split the samples 80/20 into train/validation with a fixed seed.
      3. Build the model and pick a batch size with ``get_max_batch_size``.
      4. Train for ``EPOCHS`` epochs with class-weighted cross-entropy; on
         CUDA the forward pass runs under autocast (bf16 on compute
         capability >= 8, otherwise fp16 with gradient scaling).
      5. After each epoch evaluate on the validation split, forcing BUY (1) /
         SELL (2) predictions with confidence below 0.6 to HOLD (0), and log
         PnL, win rates, activity and a confusion matrix.
      6. Save the state dict to ``models/ai_fusion_bitnet.safetensors``
         whenever the validation loss improves, and upload that file to
         ``HF_REPO_ID`` when both ``HF_REPO_ID`` and ``HF_TOKEN`` are set.

    Raises:
        ValueError: If the dataset is too small to yield a non-empty
            training split.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    on_cuda = device.type == "cuda"
    print(f"Using device: {device}", flush=True)

    dataset_path = "data/trading_dataset.pt"
    checkpoint_path = "models/ai_fusion_bitnet.safetensors"

    # 1. Load or Generate Dataset
    if not os.path.exists(dataset_path):
        try:
            print("📦 Fetching dataset from Hugging Face...", flush=True)
            hf_hub_download(repo_id=HF_DATASET_ID, filename="trading_dataset.pt", local_dir="data", repo_type="dataset")
        except Exception as e:
            # Deliberate best-effort: any download problem falls back to local
            # generation, but the reason is reported instead of being hidden.
            print(f"⚠️ Dataset download failed: {e}", flush=True)
            print("🚀 Starting on-the-fly dataset generation...", flush=True)
            # Presumably writes data/trading_dataset.pt — verify against
            # scripts.generate_ai_dataset.
            build_dataset()

    print("🚀 Loading dataset from data/trading_dataset.pt...", flush=True)
    # The file may come from the network, so refuse arbitrary pickled objects
    # (weights_only=True). map_location keeps everything on the CPU, which the
    # DataLoader pinning and the later `.numpy()` call require.
    data = torch.load(dataset_path, map_location="cpu", weights_only=True)
    X, y, rets = data["X"], data["y"], data["rets"]

    # 2. Split Data
    # NOTE(review): a random split of what look like windowed time-series
    # samples can leak overlapping windows between train and validation —
    # confirm whether a chronological split is intended.
    dataset = TensorDataset(X, y, rets)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    if train_size == 0:
        raise ValueError(
            f"Dataset at {dataset_path} has {len(dataset)} samples; "
            "need at least 2 to build a training split."
        )
    train_ds, val_ds = random_split(dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42))

    # 3. Create Model
    input_dim = X.shape[2]
    model = create_model(input_dim=input_dim, hidden_dim=HIDDEN_DIM, layers=LAYERS, seq_len=SEQ_LEN)
    model.to(device)

    # 4. Dynamic Batch Sizing
    batch_size = get_max_batch_size(model, input_dim, SEQ_LEN, device)

    # Pinned host memory only helps host-to-GPU copies; on CPU it just warns.
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, pin_memory=on_cuda, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=batch_size, pin_memory=on_cuda, num_workers=0)

    optimizer = optim.AdamW(model.parameters(), lr=LR)

    # 5. Class Weights (HOLD: 2.0, BUY: 1.0, SELL: 3.0)
    class_weights = torch.tensor([2.0, 1.0, 3.0]).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)

    # Mixed Precision Setup
    use_bf16 = on_cuda and torch.cuda.get_device_capability()[0] >= 8
    dtype = torch.bfloat16 if use_bf16 else torch.float16
    device_type = device.type
    # Loss scaling is only needed for fp16 on CUDA. A disabled GradScaler is a
    # pass-through (scale() returns the loss, step() calls optimizer.step()),
    # so one code path serves bf16, fp16 and CPU alike.
    scaler = torch.amp.GradScaler(device_type, enabled=(on_cuda and not use_bf16))

    # Autocast is disabled off-GPU, so the effective precision there is fp32.
    effective_dtype = dtype if on_cuda else torch.float32
    tqdm.write(f"🚀 Starting training (Batch Size: {batch_size}, Precision: {effective_dtype})...")
    best_val_loss = float('inf')
    checkpoint_saved = False
    # Non-HOLD predictions below this confidence are demoted to HOLD.
    threshold = 0.6

    for epoch in range(EPOCHS):
        model.train()
        correct = 0
        total = 0

        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
        for batch_X, batch_y, _ in pbar:
            batch_X = batch_X.to(device, non_blocking=on_cuda)
            batch_y = batch_y.to(device, non_blocking=on_cuda)
            optimizer.zero_grad()

            with torch.amp.autocast(device_type=device_type, dtype=dtype, enabled=on_cuda):
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            step_loss = loss.item()  # Single host sync per step
            _, predicted = outputs.max(1)
            total += batch_y.size(0)
            correct += predicted.eq(batch_y).sum().item()

            pbar.set_postfix({"loss": f"{step_loss:.4f}", "acc": f"{100.*correct/total:.1f}%"})

        # Validation & Backtest
        model.eval()
        val_loss = 0
        pred_chunks, true_chunks, ret_chunks = [], [], []

        with torch.no_grad():
            for batch_X, batch_y, batch_r in val_loader:
                batch_X = batch_X.to(device, non_blocking=on_cuda)
                batch_y = batch_y.to(device, non_blocking=on_cuda)
                with torch.amp.autocast(device_type=device_type, dtype=dtype, enabled=on_cuda):
                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y)

                val_loss += loss.item()

                # Apply Probability Threshold (0.6)
                # Softmax in fp32: outside autocast the logits may be half
                # precision, which is too coarse for a fixed 0.6 cut-off.
                probs = torch.softmax(outputs.float(), dim=-1)
                conf, preds = torch.max(probs, dim=-1)

                # If confidence < 0.6, force HOLD (0)
                # This reduces noisy trades and targets high-conviction signals
                low_conviction = (conf < threshold) & (preds != 0)
                final_preds = preds.masked_fill(low_conviction, 0)

                # Collect whole batches and concatenate once per epoch rather
                # than extending Python lists element by element.
                pred_chunks.append(final_preds.cpu().numpy())
                true_chunks.append(batch_y.cpu().numpy())
                ret_chunks.append(batch_r.numpy())

        avg_val_loss = val_loss / len(val_loader)

        # Calculate Backtest Metrics
        all_preds = np.concatenate(pred_chunks)
        all_true = np.concatenate(true_chunks)
        all_rets = np.concatenate(ret_chunks)

        buy_mask = all_preds == 1
        sell_mask = all_preds == 2
        buys = int(buy_mask.sum())
        sells = int(sell_mask.sum())

        buy_pnl = float(np.sum(all_rets[buy_mask]))
        sell_pnl = float(-np.sum(all_rets[sell_mask])) # Future return is inverse for SELL
        total_pnl = buy_pnl + sell_pnl

        # "Win" here means the prediction matched the label; the epsilon keeps
        # the ratio defined when no trades of that kind were made.
        buy_win_rate = float(np.sum(buy_mask & (all_true == 1)) / (buys + 1e-6))
        sell_win_rate = float(np.sum(sell_mask & (all_true == 2)) / (sells + 1e-6))

        tqdm.write(f"\n--- Epoch {epoch+1} Statistics ---")
        tqdm.write(f"Val Loss: {avg_val_loss:.4f} | Total PnL: {total_pnl:+.4f}")
        tqdm.write(f"BUYs: {buys} | PnL: {buy_pnl:+.4f} | Win Rate: {buy_win_rate:.1%}")
        tqdm.write(f"SELLs: {sells} | PnL: {sell_pnl:+.4f} | Win Rate: {sell_win_rate:.1%}")
        tqdm.write(f"Activity: {(buys+sells)/len(all_preds):.1%}")

        if buys + sells > 0:
            cm = confusion_matrix(all_true, all_preds, labels=[0, 1, 2])
            tqdm.write(f"Confusion Matrix (HOLD/BUY/SELL):\n{cm}")

        # Keep only the checkpoint with the lowest validation loss so far.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            os.makedirs("models", exist_ok=True)
            save_file(model.state_dict(), checkpoint_path)
            checkpoint_saved = True

    if not checkpoint_saved:
        # Happens when EPOCHS is 0 or the validation loss was never finite
        # (NaN never compares below the running best): nothing to upload.
        print("⚠️ Training complete, but no checkpoint was saved (validation loss never improved).", flush=True)
        return

    print("✅ Training complete. Final model saved.")

    # Upload to HF
    if HF_REPO_ID and HF_TOKEN:
        try:
            print(f"📤 Uploading to HF: {HF_REPO_ID}...", flush=True)
            api = HfApi()
            create_repo(repo_id=HF_REPO_ID, token=HF_TOKEN, exist_ok=True, repo_type="model")
            api.upload_file(
                path_or_fileobj=checkpoint_path,
                path_in_repo="ai_fusion_bitnet.safetensors",
                repo_id=HF_REPO_ID,
                token=HF_TOKEN
            )
            print("✅ Upload successful!", flush=True)
        except Exception as e:
            # The upload is optional: report the failure but keep the local
            # checkpoint and a clean exit.
            print(f"⚠️ Upload failed: {e}", flush=True)

# Script entry point: run the full training pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    train()