"""Honest benchmark of Bee AGI — architecture-only, untrained.

This measures:
- Parameter count per config
- Memory footprint estimates (FP32 / BF16 / INT8)
- Forward pass latency (full prompt sequence)
- Generation throughput (tokens/sec on the selected device)
- That each configuration can be instantiated and run end to end
"""

import time
import sys
from pathlib import Path

import torch

# Make the sibling ``bee`` package importable when this file is run as a script.
sys.path.insert(0, str(Path(__file__).resolve().parent))

from bee.agi_register import register_agi
from bee.agi_config import BeeAGIConfig
from bee.agi_model import BeeAGIForCausalLM

register_agi()


def count_params(model):
    """Return ``(total, trainable)`` parameter element counts for *model*."""
    total = 0
    trainable = 0
    for p in model.parameters():
        n = p.numel()
        total += n
        if p.requires_grad:
            trainable += n
    return total, trainable


def _sync(device):
    """Block until work queued on *device* has finished.

    CUDA and MPS execute kernels asynchronously, so reading the wall clock
    without synchronizing measures only launch overhead. CPU needs no sync.
    Accepts a device string (including indexed forms such as ``"cuda:0"``)
    or a ``torch.device``.
    """
    kind = torch.device(device).type
    if kind == "cuda":
        torch.cuda.synchronize()
    elif kind == "mps" and hasattr(torch, "mps"):
        # torch.mps is absent on older PyTorch builds; skip rather than crash.
        torch.mps.synchronize()


def benchmark_config(name, config, device="cpu", batch_size=1, prompt_len=512, gen_tokens=128):
    """Instantiate one model configuration, time it, and print a report.

    Args:
        name: Label used in the printed report and in the returned dict.
        config: ``BeeAGIConfig`` used to build ``BeeAGIForCausalLM``.
        device: Device the model and inputs are placed on.
        batch_size: Number of sequences per forward / generate call.
        prompt_len: Sequence length of the random prompt for the forward pass.
        gen_tokens: ``max_new_tokens`` passed to ``model.generate``.

    Returns:
        Dict with keys ``name``, ``params_M``, ``params_B``, ``fp32_GB``,
        ``bf16_GB``, ``int8_GB``, ``fwd_ms``, ``gen_tok_per_sec`` and
        ``macbook_viable``.
    """
    print(f"\n{'='*60}")
    print(f" Config: {name}")
    print(f"{'='*60}")

    model = BeeAGIForCausalLM(config).to(device).eval()
    total, trainable = count_params(model)
    print(f" Total params: {total / 1e6:.2f}M ({total / 1e9:.3f}B)")
    print(f" Trainable: {trainable / 1e6:.2f}M")

    # Memory estimates: weights only, bytes per parameter for each dtype.
    fp32_bytes = total * 4
    bf16_bytes = total * 2
    int8_bytes = total * 1
    print(f" FP32 memory: {fp32_bytes / 1e9:.2f} GB")
    print(f" BF16 memory: {bf16_bytes / 1e9:.2f} GB")
    print(f" INT8 memory: {int8_bytes / 1e9:.2f} GB")

    # Warmup so one-time setup cost is not charged to the timed pass.
    dummy_ids = torch.randint(0, config.vocab_size, (batch_size, prompt_len), device=device)
    with torch.no_grad():
        _ = model(dummy_ids)

    # Forward pass (full sequence), bracketed by syncs so the timer covers
    # the actual device work on asynchronous backends.
    _sync(device)
    t0 = time.perf_counter()
    with torch.no_grad():
        _ = model(dummy_ids)
    _sync(device)
    t1 = time.perf_counter()
    fwd_ms = (t1 - t0) * 1000
    print(f" Forward {prompt_len} tok: {fwd_ms:.1f} ms ({prompt_len * batch_size / (t1 - t0):.1f} tok/sec)")

    # Generation throughput
    input_ids = torch.randint(0, config.vocab_size, (batch_size, 32), device=device)
    _sync(device)
    t0 = time.perf_counter()
    with torch.no_grad():
        out = model.generate(input_ids, max_new_tokens=gen_tokens, do_sample=False, temperature=1.0)
    _sync(device)
    t1 = time.perf_counter()
    gen_time = t1 - t0

    # Decoding may stop before the budget is used, so count what was produced.
    # This assumes the output is prompt + continuation (TODO confirm against
    # BeeAGIForCausalLM.generate); if the shape contradicts that assumption,
    # fall back to the requested count.
    produced = out.shape[-1] - input_ids.shape[-1]
    if not 0 < produced <= gen_tokens:
        produced = gen_tokens
    tok_per_sec = produced * batch_size / gen_time
    print(f" Generate {produced} tok: {gen_time * 1000:.1f} ms ({tok_per_sec:.1f} tok/sec)")
    print(f" Output shape: {out.shape}")

    # MacBook feasibility: BF16 weights must fit under a 32 GB threshold.
    ram_gb = bf16_bytes / 1e9
    viable = ram_gb < 32
    feasible = "YES" if viable else "NO (needs GPU cluster)"
    print(f" MacBook viable: {feasible}")

    return {
        "name": name,
        "params_M": total / 1e6,
        "params_B": total / 1e9,
        "fp32_GB": fp32_bytes / 1e9,
        "bf16_GB": bf16_bytes / 1e9,
        "int8_GB": int8_bytes / 1e9,
        "fwd_ms": fwd_ms,
        "gen_tok_per_sec": tok_per_sec,
        "macbook_viable": viable,
    }


def main():
    """Benchmark every predefined configuration and print a summary table."""
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Device: {device}")

    all_domains = [
        "programming", "quantum", "blockchain", "crypto",
        "fintech", "spacetech", "math", "general",
    ]

    configs = [
        ("Bee-Nano (test)", BeeAGIConfig(
            vocab_size=1000,
            hidden_size=256,
            num_hidden_layers=4,
            num_attention_heads=4,
            num_key_value_heads=2,
            intermediate_size=512,
            num_experts=4,
            num_experts_per_tok=2,
            moe_layers=[1, 3],
            state_space_layers=[2],
            state_dim=16,
            memory_slots=64,
            memory_dim=256,
            reasoning_depth=2,
            compression_latent_dim=64,
            domain_expert_count=4,
            domains=["programming", "quantum", "general", "math"],
            max_position_embeddings=512,
        )),
        ("Bee-Tiny (256M est)", BeeAGIConfig(
            vocab_size=32000,
            hidden_size=1024,
            num_hidden_layers=24,
            num_attention_heads=16,
            num_key_value_heads=4,
            intermediate_size=2816,
            num_experts=8,
            num_experts_per_tok=2,
            moe_layers=list(range(6, 24, 4)),
            state_space_layers=list(range(4, 24, 6)),
            state_dim=32,
            memory_slots=1024,
            memory_dim=1024,
            reasoning_depth=4,
            compression_latent_dim=128,
            domain_expert_count=8,
            domains=list(all_domains),
            max_position_embeddings=8192,
        )),
        ("Bee-Medium (4B est)", BeeAGIConfig(
            vocab_size=100000,
            hidden_size=2048,
            num_hidden_layers=32,
            num_attention_heads=16,
            num_key_value_heads=4,
            intermediate_size=5632,
            num_experts=16,
            num_experts_per_tok=2,
            moe_layers=list(range(8, 32, 4)),
            state_space_layers=list(range(4, 32, 6)),
            state_dim=64,
            memory_slots=4096,
            memory_dim=2048,
            reasoning_depth=6,
            compression_latent_dim=256,
            domain_expert_count=8,
            domains=list(all_domains),
            max_position_embeddings=32768,
        )),
    ]

    results = []
    for name, cfg in configs:
        # The Nano config gets a longer prompt and more generated tokens;
        # the other configs use shorter runs.
        is_nano = "Nano" in name
        try:
            r = benchmark_config(
                name,
                cfg,
                device=device,
                batch_size=1,
                prompt_len=128 if is_nano else 64,
                gen_tokens=32 if is_nano else 16,
            )
        except Exception as e:
            # Deliberate best-effort: report the failure for this config and
            # carry on with the remaining ones.
            print(f" ERROR: {e}")
        else:
            results.append(r)

    print(f"\n{'='*60}")
    print(" SUMMARY")
    print(f"{'='*60}")
    for r in results:
        print(f" {r['name']}: {r['params_B']:.3f}B params, {r['bf16_GB']:.2f}GB BF16, {r['gen_tok_per_sec']:.1f} tok/s")
    print("\n NOTE: This is the UNTRAINED architecture. Token output is random.")
    print(" Training requires: multi-GPU cluster, TB-scale dataset, weeks of compute.")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()