"""Honest benchmark of Bee AGI — architecture-only, untrained.

This measures:
- Parameter count per config
- Memory footprint (FP32 / BF16 / INT8)
- Forward pass latency (full prompt sequence)
- Generation throughput (tokens/sec on MPS when available, otherwise CPU)
- Architecture module validation
"""

import time
import sys
from pathlib import Path

import torch

# Put this script's own directory at the front of sys.path so it is searched
# first for the `bee` imports below (presumably `bee` lives alongside this
# script — verify against the repository layout).
sys.path.insert(0, str(Path(__file__).resolve().parent))
from bee.agi_register import register_agi
from bee.agi_config import BeeAGIConfig
from bee.agi_model import BeeAGIForCausalLM

# Runs at import time, before any model is built.
# NOTE(review): presumably registers the Bee AGI config/model classes with a
# model registry — confirm in bee.agi_register.
register_agi()


def count_params(model):
    """Return ``(total, trainable)`` element counts over ``model.parameters()``.

    ``total`` sums ``numel()`` of every parameter; ``trainable`` sums only
    those whose ``requires_grad`` flag is set.
    """
    total = 0
    trainable = 0
    # Single pass: each parameter contributes to the total, and additionally
    # to the trainable count when it participates in gradient computation.
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size
    return total, trainable


def _sync_device(device):
    """Block until work queued on *device* has finished; no-op for CPU.

    Accelerator backends may return from a call before the kernels it
    launched have completed, so a wall-clock timer must be bracketed by a
    synchronize call to measure the actual work.
    """
    dev = torch.device(device)  # normalizes "cuda", "cuda:0", torch.device(...)
    if dev.type == "cuda":
        torch.cuda.synchronize(dev)
    elif dev.type == "mps" and hasattr(torch, "mps"):
        # torch.mps is missing on older PyTorch builds; skip rather than crash.
        torch.mps.synchronize()


def benchmark_config(name, config, device="cpu", batch_size=1, prompt_len=512, gen_tokens=128):
    """Build a model from *config*, print its size and speed figures, return them.

    Args:
        name: Label printed in the section header and stored in the result.
        config: Configuration passed to ``BeeAGIForCausalLM``; its
            ``vocab_size`` bounds the randomly drawn token ids.
        device: Device for the model and inputs (string or ``torch.device``).
        batch_size: Number of sequences in each random input batch.
        prompt_len: Sequence length used for the timed forward pass.
        gen_tokens: ``max_new_tokens`` requested from ``model.generate``.

    Returns:
        dict with keys ``name``, ``params_M``, ``params_B``, ``fp32_GB``,
        ``bf16_GB``, ``int8_GB``, ``fwd_ms``, ``gen_tok_per_sec`` and
        ``macbook_viable``.
    """
    print(f"\n{'='*60}")
    print(f"  Config: {name}")
    print(f"{'='*60}")

    model = BeeAGIForCausalLM(config).to(device).eval()
    total, trainable = count_params(model)
    print(f"  Total params:   {total / 1e6:.2f}M  ({total / 1e9:.3f}B)")
    print(f"  Trainable:      {trainable / 1e6:.2f}M")

    # Memory estimates: parameter storage only (4 / 2 / 1 bytes per element),
    # reported in decimal gigabytes (1e9 bytes).
    fp32_bytes = total * 4
    bf16_bytes = total * 2
    int8_bytes = total * 1
    print(f"  FP32 memory:    {fp32_bytes / 1e9:.2f} GB")
    print(f"  BF16 memory:    {bf16_bytes / 1e9:.2f} GB")
    print(f"  INT8 memory:    {int8_bytes / 1e9:.2f} GB")

    # Warmup: one untimed forward pass so one-off setup cost is not measured.
    dummy_ids = torch.randint(0, config.vocab_size, (batch_size, prompt_len), device=device)
    with torch.no_grad():
        _ = model(dummy_ids)

    # Forward pass (full sequence). Sync before starting the clock so leftover
    # warmup work is not counted, and after so the pass has really finished.
    _sync_device(device)
    t0 = time.perf_counter()
    with torch.no_grad():
        _ = model(dummy_ids)
    _sync_device(device)
    t1 = time.perf_counter()
    fwd_ms = (t1 - t0) * 1000
    print(f"  Forward {prompt_len} tok:  {fwd_ms:.1f} ms  ({prompt_len * batch_size / (t1 - t0):.1f} tok/sec)")

    # Generation throughput from a fixed 32-token random prompt, bracketed by
    # the same device syncs as the forward-pass timing.
    input_ids = torch.randint(0, config.vocab_size, (batch_size, 32), device=device)
    _sync_device(device)
    t0 = time.perf_counter()
    with torch.no_grad():
        out = model.generate(input_ids, max_new_tokens=gen_tokens, do_sample=False, temperature=1.0)
    _sync_device(device)
    t1 = time.perf_counter()
    gen_time = t1 - t0
    # NOTE(review): assumes generate() produced all gen_tokens tokens; if it
    # can stop early, this throughput figure is overstated — confirm.
    tok_per_sec = gen_tokens * batch_size / gen_time
    print(f"  Generate {gen_tokens} tok:  {gen_time * 1000:.1f} ms  ({tok_per_sec:.1f} tok/sec)")
    print(f"  Output shape:   {out.shape}")

    # MacBook feasibility: BF16 weights must fit under a 32 GB budget.
    ram_gb = bf16_bytes / 1e9
    macbook_viable = ram_gb < 32
    feasible = "YES" if macbook_viable else "NO (needs GPU cluster)"
    print(f"  MacBook viable: {feasible}")

    return {
        "name": name,
        "params_M": total / 1e6,
        "params_B": total / 1e9,
        "fp32_GB": fp32_bytes / 1e9,
        "bf16_GB": bf16_bytes / 1e9,
        "int8_GB": int8_bytes / 1e9,
        "fwd_ms": fwd_ms,
        "gen_tok_per_sec": tok_per_sec,
        "macbook_viable": macbook_viable,
    }


def main():
    """Benchmark the built-in Bee configs in turn and print a summary.

    Uses the MPS device when PyTorch reports it available, otherwise CPU.
    A config whose benchmark raises is reported and left out of the summary.
    """
    if torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    print(f"Device: {device}")

    rule = "=" * 60
    # Domain list shared by the two larger configs; each receives its own copy.
    full_domains = [
        "programming", "quantum", "blockchain", "crypto",
        "fintech", "spacetech", "math", "general",
    ]

    nano_cfg = BeeAGIConfig(
        vocab_size=1000,
        hidden_size=256,
        num_hidden_layers=4,
        num_attention_heads=4,
        num_key_value_heads=2,
        intermediate_size=512,
        num_experts=4,
        num_experts_per_tok=2,
        moe_layers=[1, 3],
        state_space_layers=[2],
        state_dim=16,
        memory_slots=64,
        memory_dim=256,
        reasoning_depth=2,
        compression_latent_dim=64,
        domain_expert_count=4,
        domains=["programming", "quantum", "general", "math"],
        max_position_embeddings=512,
    )
    tiny_cfg = BeeAGIConfig(
        vocab_size=32000,
        hidden_size=1024,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_key_value_heads=4,
        intermediate_size=2816,
        num_experts=8,
        num_experts_per_tok=2,
        moe_layers=list(range(6, 24, 4)),
        state_space_layers=list(range(4, 24, 6)),
        state_dim=32,
        memory_slots=1024,
        memory_dim=1024,
        reasoning_depth=4,
        compression_latent_dim=128,
        domain_expert_count=8,
        domains=list(full_domains),
        max_position_embeddings=8192,
    )
    medium_cfg = BeeAGIConfig(
        vocab_size=100000,
        hidden_size=2048,
        num_hidden_layers=32,
        num_attention_heads=16,
        num_key_value_heads=4,
        intermediate_size=5632,
        num_experts=16,
        num_experts_per_tok=2,
        moe_layers=list(range(8, 32, 4)),
        state_space_layers=list(range(4, 32, 6)),
        state_dim=64,
        memory_slots=4096,
        memory_dim=2048,
        reasoning_depth=6,
        compression_latent_dim=256,
        domain_expert_count=8,
        domains=list(full_domains),
        max_position_embeddings=32768,
    )
    configs = [
        ("Bee-Nano (test)", nano_cfg),
        ("Bee-Tiny (256M est)", tiny_cfg),
        ("Bee-Medium (4B est)", medium_cfg),
    ]

    results = []
    for label, cfg in configs:
        # The Nano config gets a longer prompt and more generated tokens;
        # every other config uses the shorter workload.
        is_nano = "Nano" in label
        prompt_len = 128 if is_nano else 64
        gen_tokens = 32 if is_nano else 16
        try:
            outcome = benchmark_config(
                label,
                cfg,
                device=device,
                batch_size=1,
                prompt_len=prompt_len,
                gen_tokens=gen_tokens,
            )
        except Exception as e:
            # Best-effort: report the failure and move on to the next config.
            print(f"  ERROR: {e}")
        else:
            results.append(outcome)

    print(f"\n{rule}")
    print("  SUMMARY")
    print(f"{rule}")
    for row in results:
        print(f"  {row['name']}: {row['params_B']:.3f}B params, {row['bf16_GB']:.2f}GB BF16, {row['gen_tok_per_sec']:.1f} tok/s")

    print("\n  NOTE: This is the UNTRAINED architecture. Token output is random.")
    print("  Training requires: multi-GPU cluster, TB-scale dataset, weeks of compute.")
    print(f"{rule}")


if __name__ == "__main__":
    main()