#!/usr/bin/env python3
"""Remote training script for Bee — runs on GPU cloud (RunPod, Vast.ai, Lambda, Colab).

Downloads autopilot checkpoints from your MacBook via HuggingFace Hub,
trains LoRA adapters on GPU, uploads results back.

Usage on GPU instance:
    pip install -r requirements.txt
    export HF_TOKEN=your_huggingface_token
    python train_remote.py --hub_id your-username/bee-checkpoint --iterations 1000

Environment:
    HF_TOKEN              HuggingFace token for push/pull
    BEE_HUB_ID            HF Hub repo ID (e.g., "cfrost/bee")
    BEE_MODEL_PROFILE     Base model profile name (takes precedence over BEE_MODEL_PATH)
    BEE_MODEL_PATH        Explicit base model path or HF model ID
    WANDB_PROJECT         Optional Weights & Biases project
"""

import argparse
import logging
import os
import sys
from pathlib import Path

import torch
from huggingface_hub import HfApi, hf_hub_download, upload_file
from transformers import AutoTokenizer

sys.path.insert(0, str(Path(__file__).resolve().parent))

from bee.config import BeeConfig
from bee.lora_adapter import LoRAConfig
from bee.model_profiles import DEFAULT_MODEL_PROFILE, resolve_model_id
from scripts.autopilot import Autopilot

logger = logging.getLogger("bee.remote_train")


def download_checkpoint(hub_id: str, local_dir: str = "./checkpoint_in") -> str:
    """Pull latest checkpoint from HuggingFace Hub."""
    api = HfApi()
    files = api.list_repo_files(hub_id)
    os.makedirs(local_dir, exist_ok=True)

    for f in files:
        if f.endswith(('.bin', '.safetensors', '.json', '.pt')):
            logger.info("Downloading %s", f)
            hf_hub_download(repo_id=hub_id, filename=f, local_dir=local_dir)

    return local_dir
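
# Note: huggingface_hub's snapshot_download is a one-call alternative to the
# loop above (a sketch only, not wired into this script):
#
#   from huggingface_hub import snapshot_download
#   snapshot_download(repo_id=hub_id, local_dir=local_dir,
#                     allow_patterns=["*.bin", "*.safetensors", "*.json", "*.pt"])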


def upload_checkpoint(hub_id: str, checkpoint_dir: str):
    """Push trained checkpoint to HuggingFace Hub."""
    api = HfApi()
    for f in Path(checkpoint_dir).rglob("*"):
        if f.is_file():
            rel = f.relative_to(checkpoint_dir).as_posix()
            logger.info("Uploading %s", rel)
            upload_file(path_or_fileobj=str(f), path_in_repo=rel, repo_id=hub_id)
    logger.info("Checkpoint uploaded to %s", hub_id)


def train(
    hub_id: str,
    iterations: int = 1000,
    device: str = "cuda",
    batch_size: int = 4,
    learning_rate: float = 5e-4,
    push_every: int = 50,
):
    if device == "cuda" and not torch.cuda.is_available():
        logger.warning("CUDA requested but unavailable; falling back to CPU")
        device = "cpu"
    logger.info("Training on %s", device)

    # Resolve the base model and load its tokenizer
    model_path = resolve_model_id(
        os.getenv("BEE_MODEL_PROFILE")
        or os.getenv("BEE_MODEL_PATH")
        or DEFAULT_MODEL_PROFILE
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
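    # Causal-LM tokenizers often ship without a pad token; fall back to EOS so
    # batches can be padded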
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Architecture must match the pretrained checkpoint exactly for the weight
    # transfer below (15 query heads over 5 KV heads = grouped-query attention)
    cfg = BeeConfig(
        vocab_size=49152,
        hidden_size=960,
        num_hidden_layers=32,
        num_attention_heads=15,
        num_key_value_heads=5,
        intermediate_size=2560,
        max_position_embeddings=8192,
        rms_norm_eps=1e-05,
        tie_word_embeddings=False,
    )

    # transfer_weights builds the BeeForCausalLM from cfg and loads the
    # pretrained weights, so constructing the model separately is unnecessary
    from bee.weight_transfer import transfer_weights
    model = transfer_weights(model_path, cfg, device)
    logger.info("Model loaded: %.1fM params", sum(p.numel() for p in model.parameters()) / 1e6)

    # Autopilot
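    # r=16, alpha=32 follows the common LoRA convention of alpha/r = 2 scaling
    # on the adapter update (assuming bee's LoRAConfig uses the standard rule)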
    autopilot = Autopilot(
        model=model,
        tokenizer=tokenizer,
        device=device,
        domains=["general", "programming", "quantum", "cybersecurity", "fintech"],
        lora_config=LoRAConfig(r=16, alpha=32, dropout=0.05),
        checkpoint_dir="./remote_checkpoints",
        use_quantum=False,
    )

    # Try loading previous checkpoint from Hub
    try:
        local_ckpt = download_checkpoint(hub_id)
        autopilot.load_checkpoint(local_ckpt)
        logger.info("Resumed from Hub checkpoint")
    except Exception as e:
        logger.warning("No checkpoint on Hub, starting fresh: %s", e)

    # Training loop
    start_iter = autopilot.step_count
    for i in range(start_iter, start_iter + iterations):
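        # Round-robin over domains so each adapter gets equal training steps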
        domain = autopilot.domains[i % len(autopilot.domains)]
        loss = autopilot.train_domain_adapter(
            domain=domain,
            num_steps=10,
            batch_size=batch_size,
            learning_rate=learning_rate,
            use_synthetic=True,
        )
        logger.info("Iter %d | domain=%s | loss=%.4f", i, domain, loss)

        # Save + push every N iterations; a transient Hub error should not
        # kill a long-running job, so upload failures are logged and skipped
        if i % push_every == 0 and i > 0:
            ckpt_dir = f"./remote_checkpoints/iter_{i}"
            autopilot.save_checkpoint(ckpt_dir)
            try:
                upload_checkpoint(hub_id, ckpt_dir)
            except Exception as e:
                logger.warning("Upload failed at iter %d, continuing: %s", i, e)

    # Final save
    final_dir = "./remote_checkpoints/iter_final"
    autopilot.save_checkpoint(final_dir)
    upload_checkpoint(hub_id, final_dir)
    logger.info("Training complete. Final checkpoint: %s", final_dir)


def main():
    parser = argparse.ArgumentParser(description="Bee Remote GPU Training")
    parser.add_argument("--hub_id", default=os.getenv("BEE_HUB_ID", "cfrost/bee"), help="HF Hub repo ID")
    parser.add_argument("--iterations", type=int, default=1000, help="Training iterations")
    parser.add_argument("--device", default="cuda", help="Device (cuda/cpu)")
    parser.add_argument("--batch_size", type=int, default=4, help="Batch size")
    parser.add_argument("--lr", type=float, default=5e-4, help="Learning rate")
    parser.add_argument("--push_every", type=int, default=50, help="Push to Hub every N iterations")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )

    train(
        hub_id=args.hub_id,
        iterations=args.iterations,
        device=args.device,
        batch_size=args.batch_size,
        learning_rate=args.lr,
        push_every=args.push_every,
    )


if __name__ == "__main__":
    main()