"""Benchmark Bee against real, publicly available small LLMs.
Measures:
- Perplexity on TinyStories (lower = better)
- Forward latency (ms per token)
- Generation throughput (tok/s)
- Memory footprint
Models compared:
- Bee-Nano (random init)
- Bee-Nano (distilled, if available)
- GPT-2 124M
- SmolLM2-135M
- Qwen2.5-0.5B (if fits)
"""
import argparse
import json
import logging
import os
import sys
import time
from pathlib import Path
import torch
import torch.nn.functional as F
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from bee.register import register
from bee.config import BeeConfig
from bee.modeling_bee import BeeForCausalLM
register()
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(name)s | %(message)s")
logger = logging.getLogger("bee.benchmark")

def count_params(model):
    return sum(p.numel() for p in model.parameters())
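
# The docstring above lists a memory-footprint metric that main() below does not
# compute. A minimal sketch (weights only -- ignores activations, gradients, and
# KV cache); the helper name and its use are illustrative, not part of the
# original script:
def estimate_param_memory_mb(model):
    """Rough parameter memory in MB: numel * bytes-per-element, summed over weights."""
    return sum(p.numel() * p.element_size() for p in model.parameters()) / 1e6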

def measure_perplexity(model, tokenizer, device, max_samples=100, max_length=256):
    """Measure perplexity on TinyStories validation."""
    ds = load_dataset("roneneldan/TinyStories", split="validation", streaming=True)
    ds = ds.take(max_samples)
    total_nll = 0.0
    total_tokens = 0
    model = model.to(device).eval()
    for ex in ds:
        text = ex["text"]
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(device)
        with torch.no_grad():
            out = model(**inputs)
        logits = out.logits if hasattr(out, "logits") else out[0]
        # Next-token prediction: compare logits at position t with the token at t + 1.
        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = inputs["input_ids"][:, 1:].contiguous()
        nll = F.cross_entropy(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
            reduction="sum",
        )
        total_nll += nll.item()
        total_tokens += shift_labels.numel()
    # Perplexity is exp of the mean negative log-likelihood per token.
    perplexity = torch.exp(torch.tensor(total_nll / total_tokens)).item()
    return perplexity

def measure_generation_speed(model, tokenizer, device, prompt="Once upon a time", max_new_tokens=64):
    """Measure generation throughput."""
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    model = model.to(device).eval()

    def _sync():
        # Make sure queued kernels have finished before reading the clock.
        if device == "cuda":
            torch.cuda.synchronize()
        elif device == "mps":
            torch.mps.synchronize()

    # Warmup
    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=4, do_sample=False)
    _sync()
    t0 = time.perf_counter()
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    _sync()
    t1 = time.perf_counter()
    gen_time = t1 - t0
    # Count tokens actually generated (greedy decoding may stop early at EOS).
    new_tokens = out.shape[1] - inputs["input_ids"].shape[1]
    tok_per_sec = new_tokens / gen_time
    return tok_per_sec, gen_time, out.shape[1]
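
# Example (hypothetical, outside the CLI flow in main()): benchmark one model directly.
#   model = AutoModelForCausalLM.from_pretrained("gpt2")
#   tok = AutoTokenizer.from_pretrained("gpt2")
#   ppl = measure_perplexity(model, tok, "cpu", max_samples=10)
#   tps, gen_time, out_len = measure_generation_speed(model, tok, "cpu", max_new_tokens=32)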

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--device", type=str, default="mps" if torch.backends.mps.is_available() else "cpu")
    parser.add_argument("--bee_checkpoint", type=str, default=None, help="Distilled Bee checkpoint")
    parser.add_argument("--max_samples", type=int, default=50)
    parser.add_argument("--output", type=str, default="benchmark_results.json")
    args = parser.parse_args()

    results = []
    device = args.device

    # Models to benchmark: (display name, model, tokenizer or None to use the fallback below)
    models_to_test = []

    # Bee-Nano (random init); vocab sized to match the SmolLM2 tokenizer used as fallback
    logger.info("Preparing Bee-Nano (random init)")
    bee_cfg = BeeConfig(vocab_size=49152, hidden_size=512, num_hidden_layers=8,
                        num_attention_heads=8, intermediate_size=1024, max_position_embeddings=2048)
    bee_random = BeeForCausalLM(bee_cfg)
    models_to_test.append(("Bee-Nano (random)", bee_random, None))

    # Bee-Nano (distilled, if a checkpoint was supplied and exists)
    if args.bee_checkpoint and os.path.exists(args.bee_checkpoint):
        logger.info("Loading distilled Bee from %s", args.bee_checkpoint)
        bee_distilled = BeeForCausalLM.from_pretrained(args.bee_checkpoint)
        tok = AutoTokenizer.from_pretrained(args.bee_checkpoint)
        models_to_test.append(("Bee-Nano (distilled)", bee_distilled, tok))

    # GPT-2
    try:
        logger.info("Loading GPT-2")
        gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")
        gpt2_tok = AutoTokenizer.from_pretrained("gpt2")
        models_to_test.append(("GPT-2 124M", gpt2, gpt2_tok))
    except Exception as e:
        logger.warning("Failed to load GPT-2: %s", e)

    # SmolLM2-135M
    try:
        logger.info("Loading SmolLM2-135M")
        smol = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M", trust_remote_code=True)
        smol_tok = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M", trust_remote_code=True)
        models_to_test.append(("SmolLM2-135M", smol, smol_tok))
    except Exception as e:
        logger.warning("Failed to load SmolLM2: %s", e)

    # Run benchmarks
    for name, model, tok in models_to_test:
        logger.info("=" * 50)
        logger.info("Benchmarking: %s", name)
        logger.info("=" * 50)
        params = count_params(model)
        logger.info("Parameters: %.2fM", params / 1e6)

        # Fall back to the SmolLM2 tokenizer for models registered without one
        if tok is None:
            tok = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M", trust_remote_code=True)
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token

        try:
            ppl = measure_perplexity(model, tok, device, max_samples=args.max_samples)
            logger.info("Perplexity: %.2f", ppl)
        except Exception as e:
            logger.error("Perplexity failed: %s", e)
            ppl = None

        try:
            tps, gen_time, out_len = measure_generation_speed(model, tok, device, max_new_tokens=32)
            logger.info("Generation: %.2f tok/s (%.2f ms total)", tps, gen_time * 1000)
        except Exception as e:
            logger.error("Generation speed failed: %s", e)
            tps = gen_time = out_len = None

        results.append({
            "model": name,
            "params_M": params / 1e6,
            "perplexity": ppl,
            "gen_tok_per_sec": tps,
            "gen_time_ms": gen_time * 1000 if gen_time is not None else None,
            "output_tokens": out_len,
        })

    # Save and print summary
    with open(args.output, "w") as f:
        json.dump(results, f, indent=2)

    logger.info("\n" + "=" * 50)
    logger.info("SUMMARY")
    logger.info("=" * 50)
    for r in results:
        ppl_str = f"{r['perplexity']:.2f}" if r["perplexity"] is not None else "N/A"
        tps_str = f"{r['gen_tok_per_sec']:.1f}" if r["gen_tok_per_sec"] is not None else "N/A"
        logger.info("%-25s | %.1fM params | PPL: %s | Gen: %s tok/s",
                    r["model"], r["params_M"], ppl_str, tps_str)
    logger.info("Results saved to %s", args.output)


if __name__ == "__main__":
    main()