File size: 2,548 Bytes
76db545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
"""
Phase 1 smoke test: load Whisper, run inference on a sample audio clip.
Prints model info, inference time, GPU memory usage, and sample transcript.

Usage:
    python scripts/verify_baseline.py
"""
import sys
import time
from pathlib import Path

# Prepend the project root (parent of scripts/) so `src.*` imports resolve
# when this file is run directly as a script rather than as a module.
sys.path.insert(0, str(Path(__file__).parent.parent))

import numpy as np
import torch


def main() -> None:
    """Run the Phase 1 baseline smoke test.

    Loads the Whisper backbone, generates one second of low-amplitude
    white noise, runs a single inference pass, and prints environment
    info, load/inference timings, and GPU memory usage.

    Raises:
        Whatever ``WhisperBackbone`` raises if the config file or model
        weights are missing.
    """
    # Deferred import: requires the sys.path bootstrap at module top.
    from src.engine.whisper_base import WhisperBackbone

    print("=" * 60)
    print("Sahel-Agri Voice AI — Baseline Verification")
    print("=" * 60)

    # 1. Check environment
    cuda_ok = torch.cuda.is_available()
    print(f"\nPython:   {sys.version.split()[0]}")
    print(f"PyTorch:  {torch.__version__}")
    print(f"CUDA available: {cuda_ok}")
    if cuda_ok:
        print(f"GPU:      {torch.cuda.get_device_name(0)}")
        print(f"VRAM:     {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # 2. Load model.
    # Fall back to CPU so the smoke test still runs on machines without a
    # GPU (previously "cuda" was hard-coded and the script crashed there).
    device = "cuda" if cuda_ok else "cpu"
    print("\n[1/3] Loading backbone model...")
    # perf_counter(): monotonic and high-resolution — correct for intervals,
    # unlike time.time() which can jump with wall-clock adjustments.
    t0 = time.perf_counter()
    backbone = WhisperBackbone("configs/base_config.yaml")
    backbone.load(device=device)
    load_time = time.perf_counter() - t0
    print(f"      Loaded in {load_time:.1f}s")

    if cuda_ok:
        used = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        print(f"      GPU memory: {used:.2f} GB allocated / {reserved:.2f} GB reserved")

    # 3. Generate synthetic test audio: 1 s of near-silent white noise at
    # Whisper's expected 16 kHz sample rate.
    print("\n[2/3] Generating test audio (1s white noise)...")
    sample_rate = 16000
    duration = 1.0
    audio = np.random.randn(int(sample_rate * duration)).astype(np.float32) * 0.01

    # 4. Run inference
    print("[3/3] Running inference...")
    processor = backbone.processor
    model = backbone.model

    inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt")
    input_features = inputs.input_features.to(backbone.device)
    if backbone.device == "cuda":
        # Match input dtype to the GPU model — presumably loaded in fp16;
        # TODO(review): confirm against WhisperBackbone.load.
        input_features = input_features.half()

    t0 = time.perf_counter()
    with torch.no_grad():
        predicted_ids = model.generate(input_features, max_new_tokens=50)
    infer_time = time.perf_counter() - t0

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    print(f"\n{'=' * 60}")
    print(f"Transcript:      '{transcription}' (noise input — blank expected)")
    print(f"Inference time:  {infer_time * 1000:.0f} ms")
    print("\nBaseline verification PASSED.")
    print(f"{'=' * 60}")


# Script entry point: run the smoke test only when executed directly,
# not when imported.
if __name__ == "__main__":
    main()