"""Honest benchmark of Bee AGI — architecture-only, untrained.
This measures:
- Parameter count per config
- Memory footprint (FP32 / BF16 / INT8)
- Forward pass latency (single token + full sequence)
- Generation throughput (tokens/sec on CPU)
- Architecture module validation
"""
import time
import sys
from pathlib import Path
import torch
sys.path.insert(0, str(Path(__file__).resolve().parent))
from bee.agi_register import register_agi
from bee.agi_config import BeeAGIConfig
from bee.agi_model import BeeAGIForCausalLM
register_agi()
def count_params(model):
    """Return ``(total, trainable)`` parameter counts for *model*.

    Counts every element of every parameter tensor; a parameter is
    "trainable" when ``requires_grad`` is set.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        n = param.numel()
        total += n
        if param.requires_grad:
            trainable += n
    return total, trainable
def benchmark_config(name, config, device="cpu", batch_size=1, prompt_len=512, gen_tokens=128):
    """Benchmark one (untrained) BeeAGI config and return the measurements.

    Builds the model on *device*, prints parameter counts, estimated memory
    footprints (FP32/BF16/INT8), full-sequence forward latency, and greedy
    generation throughput, then returns the numbers as a dict.

    Args:
        name: Human-readable config label used in the printed report.
        config: A ``BeeAGIConfig`` instance describing the architecture.
        device: Torch device string ("cpu", "cuda", or "mps").
        batch_size: Batch dimension for the timed runs.
        prompt_len: Sequence length for the forward-pass timing.
        gen_tokens: Number of new tokens for the generation timing.

    Returns:
        Dict with keys: name, params_M, params_B, fp32_GB, bf16_GB, int8_GB,
        fwd_ms, gen_tok_per_sec, macbook_viable.
    """

    def _sync():
        # Accelerator kernels launch asynchronously; synchronize before
        # reading the clock so we time the work, not just the dispatch.
        # The original only synced CUDA, so MPS timings (the device main()
        # actually selects on a Mac) were measured mid-flight.
        if device == "cuda":
            torch.cuda.synchronize()
        elif device == "mps":
            torch.mps.synchronize()

    print(f"\n{'='*60}")
    print(f" Config: {name}")
    print(f"{'='*60}")

    model = BeeAGIForCausalLM(config).to(device).eval()
    total, trainable = count_params(model)
    print(f" Total params: {total / 1e6:.2f}M ({total / 1e9:.3f}B)")
    print(f" Trainable: {trainable / 1e6:.2f}M")

    # Memory estimates: bytes-per-parameter at each precision (weights only;
    # activations and KV cache are not included).
    fp32_bytes = total * 4
    bf16_bytes = total * 2
    int8_bytes = total * 1
    print(f" FP32 memory: {fp32_bytes / 1e9:.2f} GB")
    print(f" BF16 memory: {bf16_bytes / 1e9:.2f} GB")
    print(f" INT8 memory: {int8_bytes / 1e9:.2f} GB")

    # Warmup pass so lazy initialization / kernel compilation is excluded
    # from the timed run.
    dummy_ids = torch.randint(0, config.vocab_size, (batch_size, prompt_len), device=device)
    with torch.no_grad():
        _ = model(dummy_ids)

    # Forward pass (full sequence), synced on both sides of the timer.
    _sync()
    t0 = time.perf_counter()
    with torch.no_grad():
        _ = model(dummy_ids)
    _sync()
    t1 = time.perf_counter()
    fwd_ms = (t1 - t0) * 1000
    print(f" Forward {prompt_len} tok: {fwd_ms:.1f} ms ({prompt_len * batch_size / (t1 - t0):.1f} tok/sec)")

    # Generation throughput. temperature=1.0 is kept even though
    # do_sample=False suggests greedy decoding — the custom generate()
    # may read it; presumably a no-op here (TODO confirm against agi_model).
    input_ids = torch.randint(0, config.vocab_size, (batch_size, 32), device=device)
    _sync()  # original started the timer without draining the warmup/forward queue
    t0 = time.perf_counter()
    with torch.no_grad():
        out = model.generate(input_ids, max_new_tokens=gen_tokens, do_sample=False, temperature=1.0)
    _sync()
    t1 = time.perf_counter()
    gen_time = t1 - t0
    tok_per_sec = gen_tokens * batch_size / gen_time
    print(f" Generate {gen_tokens} tok: {gen_time * 1000:.1f} ms ({tok_per_sec:.1f} tok/sec)")
    print(f" Output shape: {out.shape}")

    # MacBook feasibility: BF16 weights must fit comfortably in ~32 GB of
    # unified memory.
    ram_gb = bf16_bytes / 1e9
    feasible = "YES" if ram_gb < 32 else "NO (needs GPU cluster)"
    print(f" MacBook viable: {feasible}")
    return {
        "name": name,
        "params_M": total / 1e6,
        "params_B": total / 1e9,
        "fp32_GB": fp32_bytes / 1e9,
        "bf16_GB": bf16_bytes / 1e9,
        "int8_GB": int8_bytes / 1e9,
        "fwd_ms": fwd_ms,
        "gen_tok_per_sec": tok_per_sec,
        "macbook_viable": ram_gb < 32,
    }
def main():
    """Benchmark each Bee AGI size tier on the best available device.

    Runs Nano/Tiny/Medium configs through ``benchmark_config`` (falling back
    to CPU when MPS is unavailable), collects the per-config results, and
    prints a summary table plus an honesty note about the untrained weights.
    """
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Device: {device}")

    # Domain list shared by the Tiny and Medium tiers; copied per config so
    # each BeeAGIConfig owns an independent list.
    full_domains = [
        "programming", "quantum", "blockchain", "crypto",
        "fintech", "spacetech", "math", "general",
    ]

    specs = [
        ("Bee-Nano (test)", dict(
            vocab_size=1000, hidden_size=256, num_hidden_layers=4,
            num_attention_heads=4, num_key_value_heads=2, intermediate_size=512,
            num_experts=4, num_experts_per_tok=2, moe_layers=[1, 3],
            state_space_layers=[2], state_dim=16, memory_slots=64,
            memory_dim=256, reasoning_depth=2, compression_latent_dim=64,
            domain_expert_count=4,
            domains=["programming", "quantum", "general", "math"],
            max_position_embeddings=512,
        )),
        ("Bee-Tiny (256M est)", dict(
            vocab_size=32000, hidden_size=1024, num_hidden_layers=24,
            num_attention_heads=16, num_key_value_heads=4, intermediate_size=2816,
            num_experts=8, num_experts_per_tok=2, moe_layers=list(range(6, 24, 4)),
            state_space_layers=list(range(4, 24, 6)), state_dim=32,
            memory_slots=1024, memory_dim=1024, reasoning_depth=4,
            compression_latent_dim=128, domain_expert_count=8,
            domains=list(full_domains),
            max_position_embeddings=8192,
        )),
        ("Bee-Medium (4B est)", dict(
            vocab_size=100000, hidden_size=2048, num_hidden_layers=32,
            num_attention_heads=16, num_key_value_heads=4, intermediate_size=5632,
            num_experts=16, num_experts_per_tok=2, moe_layers=list(range(8, 32, 4)),
            state_space_layers=list(range(4, 32, 6)), state_dim=64,
            memory_slots=4096, memory_dim=2048, reasoning_depth=6,
            compression_latent_dim=256, domain_expert_count=8,
            domains=list(full_domains),
            max_position_embeddings=32768,
        )),
    ]

    results = []
    for name, kwargs in specs:
        # The Nano tier is tiny enough to afford a longer prompt and more
        # generated tokens; the larger tiers get shorter timed runs.
        is_nano = "Nano" in name
        try:
            results.append(benchmark_config(
                name, BeeAGIConfig(**kwargs), device=device, batch_size=1,
                prompt_len=128 if is_nano else 64,
                gen_tokens=32 if is_nano else 16,
            ))
        except Exception as exc:
            # Best-effort: a failing config shouldn't abort the whole sweep.
            print(f" ERROR: {exc}")

    banner = "=" * 60
    print(f"\n{banner}")
    print(" SUMMARY")
    print(f"{banner}")
    for r in results:
        print(f" {r['name']}: {r['params_B']:.3f}B params, {r['bf16_GB']:.2f}GB BF16, {r['gen_tok_per_sec']:.1f} tok/s")
    print("\n NOTE: This is the UNTRAINED architecture. Token output is random.")
    print(" Training requires: multi-GPU cluster, TB-scale dataset, weeks of compute.")
    print(f"{banner}")
# Script entry point. (A stray trailing "|" after the guard was removed —
# it was a top-level syntax error left over from a bad paste.)
if __name__ == "__main__":
    main()