| """Honest benchmark of Bee AGI — architecture-only, untrained. |
| |
| This measures: |
| - Parameter count per config |
| - Memory footprint (FP32 / BF16 / INT8) |
| - Forward pass latency (single token + full sequence) |
| - Generation throughput (tokens/sec on CPU) |
| - Architecture module validation |
| """ |
|
|
import sys
import time
import traceback
from pathlib import Path

import torch
|
|
# Prepend this script's directory so the local ``bee`` package is importable
# (and wins over any installed copy) regardless of the working directory.
sys.path.insert(0, str(Path(__file__).resolve().parent))
from bee.agi_register import register_agi
from bee.agi_config import BeeAGIConfig
from bee.agi_model import BeeAGIForCausalLM


# Import-time side effect: registers the Bee AGI architecture.
# NOTE(review): presumably this wires the config/model into HF Auto* classes —
# confirm in bee/agi_register.py.
register_agi()
|
|
|
|
def count_params(model):
    """Return ``(total, trainable)`` parameter counts for *model*.

    A parameter counts as trainable when its ``requires_grad`` flag is set.
    """
    total = 0
    trainable = 0
    # Single pass over the parameters, accumulating both counts at once.
    for param in model.parameters():
        n = param.numel()
        total += n
        if param.requires_grad:
            trainable += n
    return total, trainable
|
|
|
|
def benchmark_config(name, config, device="cpu", batch_size=1, prompt_len=512, gen_tokens=128):
    """Instantiate an untrained model from *config* and measure footprint/speed.

    Prints parameter counts, weight-only memory estimates (FP32/BF16/INT8),
    one forward-pass latency and greedy-generation throughput, then returns
    the measurements as a dict.

    Args:
        name: Human-readable label used in the printed report.
        config: BeeAGIConfig describing the architecture to build.
        device: Torch device string ("cpu", "cuda" or "mps").
        batch_size: Batch dimension of the dummy inputs.
        prompt_len: Sequence length for the forward-pass benchmark.
        gen_tokens: Number of new tokens for the generation benchmark.

    Returns:
        Dict with parameter counts (M/B), memory estimates (GB), forward
        latency (ms), generation throughput (tok/s) and a viability flag.
    """

    def _sync():
        # Block until queued kernels finish; without this, timings on async
        # backends measure dispatch, not compute. The original synchronized
        # only CUDA — the MPS path (the one main() selects on Macs) was
        # never synchronized, and generation was not synchronized at all.
        if device == "cuda":
            torch.cuda.synchronize()
        elif device == "mps":
            torch.mps.synchronize()

    print(f"\n{'='*60}")
    print(f" Config: {name}")
    print(f"{'='*60}")

    model = BeeAGIForCausalLM(config).to(device).eval()
    total, trainable = count_params(model)
    print(f" Total params: {total / 1e6:.2f}M ({total / 1e9:.3f}B)")
    print(f" Trainable: {trainable / 1e6:.2f}M")

    # Weight-only estimates; activations and KV-cache are excluded.
    fp32_bytes = total * 4
    bf16_bytes = total * 2
    int8_bytes = total * 1
    print(f" FP32 memory: {fp32_bytes / 1e9:.2f} GB")
    print(f" BF16 memory: {bf16_bytes / 1e9:.2f} GB")
    print(f" INT8 memory: {int8_bytes / 1e9:.2f} GB")

    # Warm-up pass so lazy initialization / kernel compilation does not
    # pollute the timed run below.
    dummy_ids = torch.randint(0, config.vocab_size, (batch_size, prompt_len), device=device)
    with torch.no_grad():
        _ = model(dummy_ids)

    _sync()
    t0 = time.perf_counter()
    with torch.no_grad():
        _ = model(dummy_ids)
    _sync()
    t1 = time.perf_counter()
    fwd_ms = (t1 - t0) * 1000
    print(f" Forward {prompt_len} tok: {fwd_ms:.1f} ms ({prompt_len * batch_size / (t1 - t0):.1f} tok/sec)")

    # Greedy decoding from a short (32-token) prompt. temperature is not
    # passed: it is meaningless with do_sample=False and only triggers the
    # transformers "unused generation flag" warning.
    input_ids = torch.randint(0, config.vocab_size, (batch_size, 32), device=device)
    _sync()
    t0 = time.perf_counter()
    with torch.no_grad():
        out = model.generate(input_ids, max_new_tokens=gen_tokens, do_sample=False)
    _sync()
    t1 = time.perf_counter()
    gen_time = t1 - t0
    tok_per_sec = gen_tokens * batch_size / gen_time
    print(f" Generate {gen_tokens} tok: {gen_time * 1000:.1f} ms ({tok_per_sec:.1f} tok/sec)")
    print(f" Output shape: {out.shape}")

    # Rough viability check: BF16 weights must fit in a 32 GB laptop's RAM.
    ram_gb = bf16_bytes / 1e9
    feasible = "YES" if ram_gb < 32 else "NO (needs GPU cluster)"
    print(f" MacBook viable: {feasible}")

    return {
        "name": name,
        "params_M": total / 1e6,
        "params_B": total / 1e9,
        "fp32_GB": fp32_bytes / 1e9,
        "bf16_GB": bf16_bytes / 1e9,
        "int8_GB": int8_bytes / 1e9,
        "fwd_ms": fwd_ms,
        "gen_tok_per_sec": tok_per_sec,
        "macbook_viable": ram_gb < 32,
    }
|
|
|
|
def main():
    """Benchmark each reference Bee AGI config on the best local device.

    Runs Nano/Tiny/Medium configs through ``benchmark_config``, continuing
    past per-config failures, then prints a summary table.
    """
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Device: {device}")

    configs = [
        ("Bee-Nano (test)", BeeAGIConfig(
            vocab_size=1000, hidden_size=256, num_hidden_layers=4,
            num_attention_heads=4, num_key_value_heads=2, intermediate_size=512,
            num_experts=4, num_experts_per_tok=2, moe_layers=[1, 3],
            state_space_layers=[2], state_dim=16, memory_slots=64,
            memory_dim=256, reasoning_depth=2, compression_latent_dim=64,
            domain_expert_count=4, domains=["programming", "quantum", "general", "math"],
            max_position_embeddings=512,
        )),
        ("Bee-Tiny (256M est)", BeeAGIConfig(
            vocab_size=32000, hidden_size=1024, num_hidden_layers=24,
            num_attention_heads=16, num_key_value_heads=4, intermediate_size=2816,
            num_experts=8, num_experts_per_tok=2, moe_layers=list(range(6, 24, 4)),
            state_space_layers=list(range(4, 24, 6)), state_dim=32,
            memory_slots=1024, memory_dim=1024, reasoning_depth=4,
            compression_latent_dim=128, domain_expert_count=8,
            domains=["programming", "quantum", "blockchain", "crypto", "fintech", "spacetech", "math", "general"],
            max_position_embeddings=8192,
        )),
        ("Bee-Medium (4B est)", BeeAGIConfig(
            vocab_size=100000, hidden_size=2048, num_hidden_layers=32,
            num_attention_heads=16, num_key_value_heads=4, intermediate_size=5632,
            num_experts=16, num_experts_per_tok=2, moe_layers=list(range(8, 32, 4)),
            state_space_layers=list(range(4, 32, 6)), state_dim=64,
            memory_slots=4096, memory_dim=2048, reasoning_depth=6,
            compression_latent_dim=256, domain_expert_count=8,
            domains=["programming", "quantum", "blockchain", "crypto", "fintech", "spacetech", "math", "general"],
            max_position_embeddings=32768,
        )),
    ]

    results = []
    for name, cfg in configs:
        # Shorter workloads for the non-Nano configs so CPU runs stay tractable.
        is_nano = "Nano" in name
        try:
            r = benchmark_config(
                name,
                cfg,
                device=device,
                batch_size=1,
                prompt_len=128 if is_nano else 64,
                gen_tokens=32 if is_nano else 16,
            )
            results.append(r)
        except Exception as e:
            # Best-effort: keep benchmarking the remaining configs, but show
            # the full traceback (the original printed only str(e), which
            # made architecture failures undiagnosable).
            print(f" ERROR: {e}")
            traceback.print_exc()

    print(f"\n{'='*60}")
    print(" SUMMARY")
    print(f"{'='*60}")
    for r in results:
        print(f" {r['name']}: {r['params_B']:.3f}B params, {r['bf16_GB']:.2f}GB BF16, {r['gen_tok_per_sec']:.1f} tok/s")

    print("\n NOTE: This is the UNTRAINED architecture. Token output is random.")
    print(" Training requires: multi-GPU cluster, TB-scale dataset, weeks of compute.")
    print(f"{'='*60}")
|
|
|
|
# Script entry point; does nothing when imported as a module.
if __name__ == "__main__":
    main()
|
|