# bee/scripts/benchmark.py
# chore: deploy Bee API backend (bee/, Dockerfile, requirements)
# commit db82745 (verified), author: ceocxx
"""Honest benchmark of Bee AGI — architecture-only, untrained.
This measures:
- Parameter count per config
- Memory footprint (FP32 / BF16 / INT8)
- Forward pass latency (single token + full sequence)
- Generation throughput (tokens/sec on CPU)
- Architecture module validation
"""
import time
import sys
from pathlib import Path
import torch
# Make this script's own directory importable so the local `bee` package
# resolves when the file is executed directly rather than as a module.
sys.path.insert(0, str(Path(__file__).resolve().parent))
from bee.agi_register import register_agi
from bee.agi_config import BeeAGIConfig
from bee.agi_model import BeeAGIForCausalLM
# Module-level side effect: registers the custom Bee AGI architecture
# (presumably with the transformers Auto* registry — confirm in bee.agi_register)
# so it is available before any model below is constructed.
register_agi()
def count_params(model):
    """Return ``(total, trainable)`` parameter counts for *model*.

    ``total`` counts every element of every parameter tensor; ``trainable``
    counts only those with ``requires_grad`` set.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        n = param.numel()
        total += n
        if param.requires_grad:
            trainable += n
    return total, trainable
def benchmark_config(name, config, device="cpu", batch_size=1, prompt_len=512, gen_tokens=128):
    """Build an untrained model from *config* and report size and speed metrics.

    Prints parameter counts, weight-only memory estimates (FP32/BF16/INT8),
    forward-pass latency over *prompt_len* tokens, and greedy-generation
    throughput for *gen_tokens* tokens, then returns the numbers as a dict.

    Args:
        name: Human-readable label printed in the report.
        config: BeeAGIConfig used to instantiate BeeAGIForCausalLM.
        device: Torch device string ("cpu", "cuda", or "mps").
        batch_size: Batch dimension for the synthetic inputs.
        prompt_len: Sequence length for the timed forward pass.
        gen_tokens: Number of tokens to generate for the throughput test.

    Returns:
        Dict of the printed metrics (params, memory, latency, throughput,
        and a boolean MacBook-feasibility flag).
    """
    def _sync():
        # Accelerator kernels launch asynchronously; synchronize before and
        # after timing so perf_counter measures device work, not just dispatch.
        # (The original only synced CUDA, so MPS timings were meaningless.)
        if device == "cuda":
            torch.cuda.synchronize()
        elif device == "mps" and hasattr(torch, "mps"):
            torch.mps.synchronize()

    print(f"\n{'='*60}")
    print(f" Config: {name}")
    print(f"{'='*60}")
    model = BeeAGIForCausalLM(config).to(device).eval()
    total, trainable = count_params(model)
    print(f" Total params: {total / 1e6:.2f}M ({total / 1e9:.3f}B)")
    print(f" Trainable: {trainable / 1e6:.2f}M")
    # Memory estimates: bytes per parameter at each precision.
    # Weights only — excludes activations, optimizer state, and KV cache.
    fp32_bytes = total * 4
    bf16_bytes = total * 2
    int8_bytes = total * 1
    print(f" FP32 memory: {fp32_bytes / 1e9:.2f} GB")
    print(f" BF16 memory: {bf16_bytes / 1e9:.2f} GB")
    print(f" INT8 memory: {int8_bytes / 1e9:.2f} GB")
    # Warmup pass so lazy initialization / kernel compilation is not timed.
    dummy_ids = torch.randint(0, config.vocab_size, (batch_size, prompt_len), device=device)
    with torch.no_grad():
        _ = model(dummy_ids)
    # Forward pass (full sequence).
    _sync()
    t0 = time.perf_counter()
    with torch.no_grad():
        _ = model(dummy_ids)
    _sync()
    t1 = time.perf_counter()
    fwd_ms = (t1 - t0) * 1000
    print(f" Forward {prompt_len} tok: {fwd_ms:.1f} ms ({prompt_len * batch_size / (t1 - t0):.1f} tok/sec)")
    # Generation throughput: greedy decode from a short random prompt.
    input_ids = torch.randint(0, config.vocab_size, (batch_size, 32), device=device)
    _sync()
    t0 = time.perf_counter()
    with torch.no_grad():
        out = model.generate(input_ids, max_new_tokens=gen_tokens, do_sample=False, temperature=1.0)
    _sync()
    t1 = time.perf_counter()
    gen_time = t1 - t0
    tok_per_sec = gen_tokens * batch_size / gen_time
    print(f" Generate {gen_tokens} tok: {gen_time * 1000:.1f} ms ({tok_per_sec:.1f} tok/sec)")
    print(f" Output shape: {out.shape}")
    # MacBook feasibility: rough heuristic — BF16 weights must fit in 32 GB RAM.
    ram_gb = bf16_bytes / 1e9
    feasible = "YES" if ram_gb < 32 else "NO (needs GPU cluster)"
    print(f" MacBook viable: {feasible}")
    return {
        "name": name,
        "params_M": total / 1e6,
        "params_B": total / 1e9,
        "fp32_GB": fp32_bytes / 1e9,
        "bf16_GB": bf16_bytes / 1e9,
        "int8_GB": int8_bytes / 1e9,
        "fwd_ms": fwd_ms,
        "gen_tok_per_sec": tok_per_sec,
        "macbook_viable": ram_gb < 32,
    }
def main():
    """Run the benchmark suite across the predefined Bee configs and print a summary."""
    import traceback  # local: only needed for the error path below

    # Prefer CUDA, then Apple-silicon MPS, then CPU. benchmark_config already
    # synchronizes per-device when timing, so all three are valid here (the
    # original never selected "cuda" even when available).
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    print(f"Device: {device}")
    # (label, config) pairs, smallest to largest; sizes in names are estimates.
    configs = [
        ("Bee-Nano (test)", BeeAGIConfig(
            vocab_size=1000, hidden_size=256, num_hidden_layers=4,
            num_attention_heads=4, num_key_value_heads=2, intermediate_size=512,
            num_experts=4, num_experts_per_tok=2, moe_layers=[1, 3],
            state_space_layers=[2], state_dim=16, memory_slots=64,
            memory_dim=256, reasoning_depth=2, compression_latent_dim=64,
            domain_expert_count=4, domains=["programming", "quantum", "general", "math"],
            max_position_embeddings=512,
        )),
        ("Bee-Tiny (256M est)", BeeAGIConfig(
            vocab_size=32000, hidden_size=1024, num_hidden_layers=24,
            num_attention_heads=16, num_key_value_heads=4, intermediate_size=2816,
            num_experts=8, num_experts_per_tok=2, moe_layers=list(range(6, 24, 4)),
            state_space_layers=list(range(4, 24, 6)), state_dim=32,
            memory_slots=1024, memory_dim=1024, reasoning_depth=4,
            compression_latent_dim=128, domain_expert_count=8,
            domains=["programming", "quantum", "blockchain", "crypto", "fintech", "spacetech", "math", "general"],
            max_position_embeddings=8192,
        )),
        ("Bee-Medium (4B est)", BeeAGIConfig(
            vocab_size=100000, hidden_size=2048, num_hidden_layers=32,
            num_attention_heads=16, num_key_value_heads=4, intermediate_size=5632,
            num_experts=16, num_experts_per_tok=2, moe_layers=list(range(8, 32, 4)),
            state_space_layers=list(range(4, 32, 6)), state_dim=64,
            memory_slots=4096, memory_dim=2048, reasoning_depth=6,
            compression_latent_dim=256, domain_expert_count=8,
            domains=["programming", "quantum", "blockchain", "crypto", "fintech", "spacetech", "math", "general"],
            max_position_embeddings=32768,
        )),
    ]
    results = []
    for name, cfg in configs:
        # Smaller prompt/gen lengths for the big configs keep CPU runs tractable.
        try:
            r = benchmark_config(
                name, cfg, device=device, batch_size=1,
                prompt_len=128 if "Nano" in name else 64,
                gen_tokens=32 if "Nano" in name else 16,
            )
            results.append(r)
        except Exception as e:
            # Keep benchmarking the remaining configs, but show the full
            # traceback (the original printed only the message, hiding the
            # failure point).
            print(f" ERROR: {e}")
            traceback.print_exc()
    print(f"\n{'='*60}")
    print(" SUMMARY")
    print(f"{'='*60}")
    for r in results:
        print(f" {r['name']}: {r['params_B']:.3f}B params, {r['bf16_GB']:.2f}GB BF16, {r['gen_tok_per_sec']:.1f} tok/s")
    print("\n NOTE: This is the UNTRAINED architecture. Token output is random.")
    print(" Training requires: multi-GPU cluster, TB-scale dataset, weeks of compute.")
    print(f"{'='*60}")
# Run the benchmark only when executed as a script, so importing this module
# (e.g. for its helpers) does not trigger a full benchmark run.
if __name__ == "__main__":
    main()