File size: 5,744 Bytes
db82745
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""Honest benchmark of Bee AGI — architecture-only, untrained.

This measures:
- Parameter count per config
- Memory footprint (FP32 / BF16 / INT8)
- Forward pass latency (single token + full sequence)
- Generation throughput (tokens/sec on CPU)
- Architecture module validation
"""

import time
import sys
from pathlib import Path

import torch

# Ensure the directory containing this script is importable so the local
# ``bee`` package resolves regardless of the current working directory.
sys.path.insert(0, str(Path(__file__).resolve().parent))
from bee.agi_register import register_agi
from bee.agi_config import BeeAGIConfig
from bee.agi_model import BeeAGIForCausalLM

# Import-time side effect: registers the Bee AGI architecture so its
# config/model classes resolve by name (presumably with the transformers
# Auto* registry — confirm in bee.agi_register).
register_agi()


def count_params(model):
    """Return ``(total, trainable)`` parameter counts for *model*.

    Counts every element of every parameter tensor; the second value
    includes only parameters with ``requires_grad`` set.
    """
    total = 0
    trainable = 0
    for param in model.parameters():
        n = param.numel()
        total += n
        if param.requires_grad:
            trainable += n
    return total, trainable


def benchmark_config(name, config, device="cpu", batch_size=1, prompt_len=512, gen_tokens=128):
    """Benchmark one (untrained) Bee AGI configuration and print a report.

    Instantiates the model from *config*, reports parameter counts and
    FP32/BF16/INT8 memory estimates, times a full-sequence forward pass
    and greedy generation, and prints a MacBook-feasibility verdict
    (BF16 footprint under 32 GB).

    Args:
        name: Human-readable label used in the printed report.
        config: BeeAGIConfig to build the model from.
        device: torch device string ("cpu", "cuda", or "mps").
        batch_size: Batch dimension of the synthetic inputs.
        prompt_len: Sequence length for the forward-pass timing.
        gen_tokens: Number of new tokens for the generation timing.

    Returns:
        dict summarizing params, memory, latency, and throughput.
    """

    def _sync():
        # Drain queued kernels so perf_counter brackets completed work.
        # BUGFIX: the original only synchronized CUDA, but main() selects
        # "mps" on Apple silicon, so MPS timings measured kernel *launch*,
        # not execution. hasattr guards older torch builds without the
        # torch.mps module.
        if device == "cuda":
            torch.cuda.synchronize()
        elif device == "mps" and hasattr(torch, "mps"):
            torch.mps.synchronize()

    print(f"\n{'='*60}")
    print(f"  Config: {name}")
    print(f"{'='*60}")

    model = BeeAGIForCausalLM(config).to(device).eval()
    total, trainable = count_params(model)
    print(f"  Total params:   {total / 1e6:.2f}M  ({total / 1e9:.3f}B)")
    print(f"  Trainable:      {trainable / 1e6:.2f}M")

    # Memory estimates: bytes-per-parameter at each precision (weights only;
    # excludes activations and KV cache).
    fp32_bytes = total * 4
    bf16_bytes = total * 2
    int8_bytes = total * 1
    print(f"  FP32 memory:    {fp32_bytes / 1e9:.2f} GB")
    print(f"  BF16 memory:    {bf16_bytes / 1e9:.2f} GB")
    print(f"  INT8 memory:    {int8_bytes / 1e9:.2f} GB")

    # Warmup: the first call pays lazy-init / kernel-compilation costs that
    # would otherwise pollute the timed run.
    dummy_ids = torch.randint(0, config.vocab_size, (batch_size, prompt_len), device=device)
    with torch.no_grad():
        _ = model(dummy_ids)

    # Forward pass (full sequence)
    _sync()
    t0 = time.perf_counter()
    with torch.no_grad():
        _ = model(dummy_ids)
    _sync()
    t1 = time.perf_counter()
    fwd_ms = (t1 - t0) * 1000
    print(f"  Forward {prompt_len} tok:  {fwd_ms:.1f} ms  ({prompt_len * batch_size / (t1 - t0):.1f} tok/sec)")

    # Generation throughput (greedy decode; weights are untrained, so only
    # the speed is meaningful, not the tokens). Synced for accurate timing.
    input_ids = torch.randint(0, config.vocab_size, (batch_size, 32), device=device)
    _sync()
    t0 = time.perf_counter()
    with torch.no_grad():
        out = model.generate(input_ids, max_new_tokens=gen_tokens, do_sample=False, temperature=1.0)
    _sync()
    t1 = time.perf_counter()
    gen_time = t1 - t0
    tok_per_sec = gen_tokens * batch_size / gen_time
    print(f"  Generate {gen_tokens} tok:  {gen_time * 1000:.1f} ms  ({tok_per_sec:.1f} tok/sec)")
    print(f"  Output shape:   {out.shape}")

    # MacBook feasibility: rough rule of thumb — BF16 weights must fit in
    # 32 GB of unified memory.
    ram_gb = bf16_bytes / 1e9
    feasible = "YES" if ram_gb < 32 else "NO (needs GPU cluster)"
    print(f"  MacBook viable: {feasible}")

    return {
        "name": name,
        "params_M": total / 1e6,
        "params_B": total / 1e9,
        "fp32_GB": fp32_bytes / 1e9,
        "bf16_GB": bf16_bytes / 1e9,
        "int8_GB": int8_bytes / 1e9,
        "fwd_ms": fwd_ms,
        "gen_tok_per_sec": tok_per_sec,
        "macbook_viable": ram_gb < 32,
    }


def main():
    """Benchmark every predefined Bee AGI config and print a summary."""
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Device: {device}")

    configs = [
        ("Bee-Nano (test)", BeeAGIConfig(
            vocab_size=1000, hidden_size=256, num_hidden_layers=4,
            num_attention_heads=4, num_key_value_heads=2, intermediate_size=512,
            num_experts=4, num_experts_per_tok=2, moe_layers=[1, 3],
            state_space_layers=[2], state_dim=16, memory_slots=64,
            memory_dim=256, reasoning_depth=2, compression_latent_dim=64,
            domain_expert_count=4, domains=["programming", "quantum", "general", "math"],
            max_position_embeddings=512,
        )),
        ("Bee-Tiny (256M est)", BeeAGIConfig(
            vocab_size=32000, hidden_size=1024, num_hidden_layers=24,
            num_attention_heads=16, num_key_value_heads=4, intermediate_size=2816,
            num_experts=8, num_experts_per_tok=2, moe_layers=list(range(6, 24, 4)),
            state_space_layers=list(range(4, 24, 6)), state_dim=32,
            memory_slots=1024, memory_dim=1024, reasoning_depth=4,
            compression_latent_dim=128, domain_expert_count=8,
            domains=["programming", "quantum", "blockchain", "crypto", "fintech", "spacetech", "math", "general"],
            max_position_embeddings=8192,
        )),
        ("Bee-Medium (4B est)", BeeAGIConfig(
            vocab_size=100000, hidden_size=2048, num_hidden_layers=32,
            num_attention_heads=16, num_key_value_heads=4, intermediate_size=5632,
            num_experts=16, num_experts_per_tok=2, moe_layers=list(range(8, 32, 4)),
            state_space_layers=list(range(4, 32, 6)), state_dim=64,
            memory_slots=4096, memory_dim=2048, reasoning_depth=6,
            compression_latent_dim=256, domain_expert_count=8,
            domains=["programming", "quantum", "blockchain", "crypto", "fintech", "spacetech", "math", "general"],
            max_position_embeddings=32768,
        )),
    ]

    results = []
    for label, config in configs:
        # The Nano config is small enough for a longer prompt and more
        # generated tokens; the larger configs get trimmed workloads.
        is_nano = "Nano" in label
        try:
            report = benchmark_config(
                label,
                config,
                device=device,
                batch_size=1,
                prompt_len=128 if is_nano else 64,
                gen_tokens=32 if is_nano else 16,
            )
        except Exception as e:
            # Best-effort: a single failing config must not abort the rest.
            print(f"  ERROR: {e}")
        else:
            results.append(report)

    print(f"\n{'='*60}")
    print("  SUMMARY")
    print(f"{'='*60}")
    for report in results:
        print(f"  {report['name']}: {report['params_B']:.3f}B params, {report['bf16_GB']:.2f}GB BF16, {report['gen_tok_per_sec']:.1f} tok/s")

    print("\n  NOTE: This is the UNTRAINED architecture. Token output is random.")
    print("  Training requires: multi-GPU cluster, TB-scale dataset, weeks of compute.")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()