# bee/scripts/benchmark.py
# chore: deploy Bee API backend (bee/, Dockerfile, requirements)
# commit db82745 (verified), author: ceocxx
"""Honest benchmark of Bee AGI — architecture-only, untrained.
This measures:
- Parameter count per config
- Memory footprint (FP32 / BF16 / INT8)
- Forward pass latency (single token + full sequence)
- Generation throughput (tokens/sec on CPU)
- Architecture module validation
"""
import time
import sys
from pathlib import Path
import torch
# Make this script's own directory importable so the local `bee` package
# resolves when the file is executed directly rather than as a module.
sys.path.insert(0, str(Path(__file__).resolve().parent))
from bee.agi_register import register_agi
from bee.agi_config import BeeAGIConfig
from bee.agi_model import BeeAGIForCausalLM
# Module-level side effect: registers the custom Bee AGI architecture
# (presumably with the transformers Auto* registry — confirm in bee.agi_register)
# so it is available before any model below is constructed.
register_agi()
def count_params(model):
    """Return ``(total, trainable)`` parameter counts for *model*.

    ``total`` counts every element of every parameter tensor; ``trainable``
    counts only those with ``requires_grad`` set.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        n = param.numel()
        total += n
        if param.requires_grad:
            trainable += n
    return total, trainable
def benchmark_config(name, config, device="cpu", batch_size=1, prompt_len=512, gen_tokens=128):
    """Build an untrained model from *config* and report size and speed metrics.

    Prints parameter counts, weight-only memory estimates (FP32/BF16/INT8),
    forward-pass latency over *prompt_len* tokens, and greedy-generation
    throughput for *gen_tokens* tokens, then returns the numbers as a dict.

    Args:
        name: Human-readable label printed in the report.
        config: BeeAGIConfig used to instantiate BeeAGIForCausalLM.
        device: Torch device string ("cpu", "cuda", or "mps").
        batch_size: Batch dimension for the synthetic inputs.
        prompt_len: Sequence length for the timed forward pass.
        gen_tokens: Number of tokens to generate for the throughput test.

    Returns:
        Dict of the printed metrics (params, memory, latency, throughput,
        and a boolean MacBook-feasibility flag).
    """
    def _sync():
        # Accelerator kernels launch asynchronously; synchronize before and
        # after timing so perf_counter measures device work, not just dispatch.
        # (The original only synced CUDA, so MPS timings were meaningless.)
        if device == "cuda":
            torch.cuda.synchronize()
        elif device == "mps" and hasattr(torch, "mps"):
            torch.mps.synchronize()

    print(f"\n{'='*60}")
    print(f" Config: {name}")
    print(f"{'='*60}")
    model = BeeAGIForCausalLM(config).to(device).eval()
    total, trainable = count_params(model)
    print(f" Total params: {total / 1e6:.2f}M ({total / 1e9:.3f}B)")
    print(f" Trainable: {trainable / 1e6:.2f}M")
    # Memory estimates: bytes per parameter at each precision.
    # Weights only — excludes activations, optimizer state, and KV cache.
    fp32_bytes = total * 4
    bf16_bytes = total * 2
    int8_bytes = total * 1
    print(f" FP32 memory: {fp32_bytes / 1e9:.2f} GB")
    print(f" BF16 memory: {bf16_bytes / 1e9:.2f} GB")
    print(f" INT8 memory: {int8_bytes / 1e9:.2f} GB")
    # Warmup pass so lazy initialization / kernel compilation is not timed.
    dummy_ids = torch.randint(0, config.vocab_size, (batch_size, prompt_len), device=device)
    with torch.no_grad():
        _ = model(dummy_ids)
    # Forward pass (full sequence).
    _sync()
    t0 = time.perf_counter()
    with torch.no_grad():
        _ = model(dummy_ids)
    _sync()
    t1 = time.perf_counter()
    fwd_ms = (t1 - t0) * 1000
    print(f" Forward {prompt_len} tok: {fwd_ms:.1f} ms ({prompt_len * batch_size / (t1 - t0):.1f} tok/sec)")
    # Generation throughput: greedy decode from a short random prompt.
    input_ids = torch.randint(0, config.vocab_size, (batch_size, 32), device=device)
    _sync()
    t0 = time.perf_counter()
    with torch.no_grad():
        out = model.generate(input_ids, max_new_tokens=gen_tokens, do_sample=False, temperature=1.0)
    _sync()
    t1 = time.perf_counter()
    gen_time = t1 - t0
    tok_per_sec = gen_tokens * batch_size / gen_time
    print(f" Generate {gen_tokens} tok: {gen_time * 1000:.1f} ms ({tok_per_sec:.1f} tok/sec)")
    print(f" Output shape: {out.shape}")
    # MacBook feasibility: rough heuristic — BF16 weights must fit in 32 GB RAM.
    ram_gb = bf16_bytes / 1e9
    feasible = "YES" if ram_gb < 32 else "NO (needs GPU cluster)"
    print(f" MacBook viable: {feasible}")
    return {
        "name": name,
        "params_M": total / 1e6,
        "params_B": total / 1e9,
        "fp32_GB": fp32_bytes / 1e9,
        "bf16_GB": bf16_bytes / 1e9,
        "int8_GB": int8_bytes / 1e9,
        "fwd_ms": fwd_ms,
        "gen_tok_per_sec": tok_per_sec,
        "macbook_viable": ram_gb < 32,
    }
def main():
    """Run the benchmark suite across the predefined Bee configs and print a summary."""
    import traceback  # local: only needed for the error path below

    # Prefer CUDA, then Apple-silicon MPS, then CPU. benchmark_config already
    # synchronizes per-device when timing, so all three are valid here (the
    # original never selected "cuda" even when available).
    if torch.cuda.is_available():
        device = "cuda"
    elif torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    print(f"Device: {device}")
    # (label, config) pairs, smallest to largest; sizes in names are estimates.
    configs = [
        ("Bee-Nano (test)", BeeAGIConfig(
            vocab_size=1000, hidden_size=256, num_hidden_layers=4,
            num_attention_heads=4, num_key_value_heads=2, intermediate_size=512,
            num_experts=4, num_experts_per_tok=2, moe_layers=[1, 3],
            state_space_layers=[2], state_dim=16, memory_slots=64,
            memory_dim=256, reasoning_depth=2, compression_latent_dim=64,
            domain_expert_count=4, domains=["programming", "quantum", "general", "math"],
            max_position_embeddings=512,
        )),
        ("Bee-Tiny (256M est)", BeeAGIConfig(
            vocab_size=32000, hidden_size=1024, num_hidden_layers=24,
            num_attention_heads=16, num_key_value_heads=4, intermediate_size=2816,
            num_experts=8, num_experts_per_tok=2, moe_layers=list(range(6, 24, 4)),
            state_space_layers=list(range(4, 24, 6)), state_dim=32,
            memory_slots=1024, memory_dim=1024, reasoning_depth=4,
            compression_latent_dim=128, domain_expert_count=8,
            domains=["programming", "quantum", "blockchain", "crypto", "fintech", "spacetech", "math", "general"],
            max_position_embeddings=8192,
        )),
        ("Bee-Medium (4B est)", BeeAGIConfig(
            vocab_size=100000, hidden_size=2048, num_hidden_layers=32,
            num_attention_heads=16, num_key_value_heads=4, intermediate_size=5632,
            num_experts=16, num_experts_per_tok=2, moe_layers=list(range(8, 32, 4)),
            state_space_layers=list(range(4, 32, 6)), state_dim=64,
            memory_slots=4096, memory_dim=2048, reasoning_depth=6,
            compression_latent_dim=256, domain_expert_count=8,
            domains=["programming", "quantum", "blockchain", "crypto", "fintech", "spacetech", "math", "general"],
            max_position_embeddings=32768,
        )),
    ]
    results = []
    for name, cfg in configs:
        # Smaller prompt/gen lengths for the big configs keep CPU runs tractable.
        try:
            r = benchmark_config(
                name, cfg, device=device, batch_size=1,
                prompt_len=128 if "Nano" in name else 64,
                gen_tokens=32 if "Nano" in name else 16,
            )
            results.append(r)
        except Exception as e:
            # Keep benchmarking the remaining configs, but show the full
            # traceback (the original printed only the message, hiding the
            # failure point).
            print(f" ERROR: {e}")
            traceback.print_exc()
    print(f"\n{'='*60}")
    print(" SUMMARY")
    print(f"{'='*60}")
    for r in results:
        print(f" {r['name']}: {r['params_B']:.3f}B params, {r['bf16_GB']:.2f}GB BF16, {r['gen_tok_per_sec']:.1f} tok/s")
    print("\n NOTE: This is the UNTRAINED architecture. Token output is random.")
    print(" Training requires: multi-GPU cluster, TB-scale dataset, weeks of compute.")
    print(f"{'='*60}")
# Run the benchmark only when executed as a script, so importing this module
# (e.g. for its helpers) does not trigger a full benchmark run.
if __name__ == "__main__":
    main()