"""Remote training script for Bee — runs on GPU cloud (RunPod, Vast.ai, Lambda, Colab).

Downloads autopilot checkpoints from your MacBook via HuggingFace Hub,
trains LoRA adapters on GPU, and uploads the results back.

Usage on a GPU instance:
    pip install -r requirements.txt
    export HF_TOKEN=your_huggingface_token
    python train_remote.py --hub_id your-username/bee-checkpoint --iterations 1000

Environment:
    HF_TOKEN        HuggingFace token for push/pull
    BEE_HUB_ID      HF Hub repo ID (e.g., "cfrost/bee")
    WANDB_PROJECT   Optional Weights & Biases project
"""
|
|
import argparse
import logging
import os
import sys
from pathlib import Path
|
|
import torch
from huggingface_hub import HfApi, hf_hub_download, upload_file
from transformers import AutoTokenizer
|
|
# Make the repo root importable when this script is run directly.
sys.path.insert(0, str(Path(__file__).resolve().parent))
|
|
from bee.config import BeeConfig
from bee.lora_adapter import LoRAConfig
from bee.model_profiles import DEFAULT_MODEL_PROFILE, resolve_model_id
from scripts.autopilot import Autopilot
|
|
logger = logging.getLogger("bee.remote_train")
|
|
|
|
def download_checkpoint(hub_id: str, local_dir: str = "./checkpoint_in") -> str:
    """Pull latest checkpoint from HuggingFace Hub."""
    api = HfApi()
    files = api.list_repo_files(hub_id)
    os.makedirs(local_dir, exist_ok=True)

    for f in files:
        if f.endswith(('.bin', '.safetensors', '.json', '.pt')):
            logger.info("Downloading %s", f)
            hf_hub_download(repo_id=hub_id, filename=f, local_dir=local_dir)

    return local_dir
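# Note: not used here, but huggingface_hub's snapshot_download could fetch the
# same filtered subset in a single call, e.g.:
#   snapshot_download(repo_id=hub_id, local_dir=local_dir,
#                     allow_patterns=["*.bin", "*.safetensors", "*.json", "*.pt"])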
|
|
|
|
def upload_checkpoint(hub_id: str, checkpoint_dir: str):
    """Push trained checkpoint to HuggingFace Hub."""
    api = HfApi()
    # Ensure the target repo exists before uploading (no-op if it already does).
    api.create_repo(repo_id=hub_id, exist_ok=True)
    for f in Path(checkpoint_dir).rglob("*"):
        if f.is_file():
            rel = f.relative_to(checkpoint_dir).as_posix()
            logger.info("Uploading %s", rel)
            upload_file(path_or_fileobj=str(f), path_in_repo=rel, repo_id=hub_id)
    logger.info("Checkpoint uploaded to %s", hub_id)
|
|
|
|
def train(
    hub_id: str,
    iterations: int = 1000,
    device: str = "cuda",
    batch_size: int = 4,
    learning_rate: float = 5e-4,
    push_every: int = 50,
):
    """Run the remote training loop, syncing checkpoints through the Hub."""
    device = device if torch.cuda.is_available() else "cpu"
    logger.info("Training on %s", device)
|
|
    # Resolve the base model and load its tokenizer.
    model_path = resolve_model_id(
        os.getenv("BEE_MODEL_PROFILE") or os.getenv("BEE_MODEL_PATH") or DEFAULT_MODEL_PROFILE
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # causal LM tokenizers often lack a pad token
|
|
    # Build the Bee architecture config.
    cfg = BeeConfig(
        vocab_size=49152,
        hidden_size=960,
        num_hidden_layers=32,
        num_attention_heads=15,
        num_key_value_heads=5,
        intermediate_size=2560,
        max_position_embeddings=8192,
        rms_norm_eps=1e-05,
        tie_word_embeddings=False,
    )
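    # Assumption: these dimensions (49152 vocab, 960 hidden, 32 layers, 15 heads
    # with 5 KV heads) appear to mirror SmolLM2-360M, so the resolved base model
    # should be shape-compatible for the weight transfer below.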
    # transfer_weights constructs the model and copies the base weights onto
    # `device`, so instantiating BeeForCausalLM separately first would just
    # allocate a full model that gets thrown away.
    from bee.weight_transfer import transfer_weights

    model = transfer_weights(model_path, cfg, device)
    logger.info("Model loaded: %.1fM params", sum(p.numel() for p in model.parameters()) / 1e6)
|
|
    # One Autopilot instance drives LoRA adapter training across all domains.
    autopilot = Autopilot(
        model=model,
        tokenizer=tokenizer,
        device=device,
        domains=["general", "programming", "quantum", "cybersecurity", "fintech"],
        lora_config=LoRAConfig(r=16, alpha=32, dropout=0.05),
        checkpoint_dir="./remote_checkpoints",
        use_quantum=False,
    )
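    # Assuming the standard LoRA convention, r=16 with alpha=32 scales each
    # adapter update by alpha/r = 2.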
|
|
    # Resume from the Hub checkpoint if one exists; otherwise start fresh.
    try:
        local_ckpt = download_checkpoint(hub_id)
        autopilot.load_checkpoint(local_ckpt)
        logger.info("Resumed from Hub checkpoint")
    except Exception as e:
        logger.warning("No usable checkpoint on Hub, starting fresh: %s", e)
|
|
    # Round-robin over domains, one short adapter-training burst per iteration.
    start_iter = autopilot.step_count
    for i in range(start_iter, start_iter + iterations):
        domain = autopilot.domains[i % len(autopilot.domains)]
        loss = autopilot.train_domain_adapter(
            domain=domain,
            num_steps=10,
            batch_size=batch_size,
            learning_rate=learning_rate,
            use_synthetic=True,
        )
        logger.info("Iter %d | domain=%s | loss=%.4f", i, domain, loss)
|
|
        # Periodically snapshot and push so partial progress survives preemption.
        if i % push_every == 0 and i > 0:
            ckpt_dir = f"./remote_checkpoints/iter_{i}"
            autopilot.save_checkpoint(ckpt_dir)
            upload_checkpoint(hub_id, ckpt_dir)
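    # Note: with num_steps=10 per iteration, the default --iterations 1000 works
    # out to roughly 10,000 optimizer steps spread across the five domains.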
|
|
    # Always push a final checkpoint, even when iterations is not a multiple of push_every.
    final_dir = "./remote_checkpoints/iter_final"
    autopilot.save_checkpoint(final_dir)
    upload_checkpoint(hub_id, final_dir)
    logger.info("Training complete. Final checkpoint: %s", final_dir)
|
|
|
|
def main():
    parser = argparse.ArgumentParser(description="Bee Remote GPU Training")
    parser.add_argument("--hub_id", default=os.getenv("BEE_HUB_ID", "cfrost/bee"), help="HF Hub repo ID")
    parser.add_argument("--iterations", type=int, default=1000, help="Training iterations")
    parser.add_argument("--device", default="cuda", help="Device (cuda/cpu)")
    parser.add_argument("--batch_size", type=int, default=4, help="Batch size")
    parser.add_argument("--lr", type=float, default=5e-4, help="Learning rate")
    parser.add_argument("--push_every", type=int, default=50, help="Push to Hub every N iterations")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    )

    train(
        hub_id=args.hub_id,
        iterations=args.iterations,
        device=args.device,
        batch_size=args.batch_size,
        learning_rate=args.lr,
        push_every=args.push_every,
    )
|
|
|
|
if __name__ == "__main__":
    main()
|
|