"""
Phase 1 smoke test: load Whisper, run inference on a sample audio clip.

Prints model info, inference time, GPU memory usage, and sample transcript.

Usage:
    python scripts/verify_baseline.py
"""
import sys
import time
from pathlib import Path

# Allow imports from project root — this script presumably lives in scripts/,
# one level below the root that contains the src/ package.
sys.path.insert(0, str(Path(__file__).parent.parent))

import numpy as np
import torch
def main() -> None:
    """Run the end-to-end baseline smoke test and print a pass/fail report.

    Steps: report environment info, load the Whisper backbone, synthesize
    1 s of low-amplitude white noise, run one generate() call, and print
    the (expected-blank) transcript plus timing/memory statistics.
    """
    # Imported here so the sys.path hack at module top has already run.
    from src.engine.whisper_base import WhisperBackbone

    print("=" * 60)
    print("Sahel-Agri Voice AI — Baseline Verification")
    print("=" * 60)

    # 1. Check environment
    cuda_ok = torch.cuda.is_available()
    print(f"\nPython: {sys.version.split()[0]}")
    print(f"PyTorch: {torch.__version__}")
    print(f"CUDA available: {cuda_ok}")
    if cuda_ok:
        print(f"GPU: {torch.cuda.get_device_name(0)}")
        print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    # 2. Load model — fall back to CPU so the script still verifies on
    # machines without a GPU (original hard-coded device="cuda").
    device = "cuda" if cuda_ok else "cpu"
    print("\n[1/3] Loading backbone model...")
    t0 = time.perf_counter()  # monotonic clock: immune to wall-clock jumps
    backbone = WhisperBackbone("configs/base_config.yaml")
    backbone.load(device=device)
    load_time = time.perf_counter() - t0
    print(f"  Loaded in {load_time:.1f}s")
    if cuda_ok:
        used = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        print(f"  GPU memory: {used:.2f} GB allocated / {reserved:.2f} GB reserved")

    # 3. Generate synthetic test audio (1 second of low-amplitude white noise)
    print("\n[2/3] Generating test audio (1s white noise)...")
    sample_rate = 16000
    duration = 1.0
    audio = np.random.randn(int(sample_rate * duration)).astype(np.float32) * 0.01

    # 4. Run inference
    print("[3/3] Running inference...")
    processor = backbone.processor
    model = backbone.model
    inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt")
    input_features = inputs.input_features.to(backbone.device)
    if backbone.device == "cuda":
        # Presumably the GPU weights are fp16 — match the input dtype.
        # TODO(review): confirm against WhisperBackbone.load().
        input_features = input_features.half()
    t0 = time.perf_counter()
    with torch.no_grad():
        predicted_ids = model.generate(input_features, max_new_tokens=50)
    if cuda_ok:
        # CUDA kernels launch asynchronously; synchronize so the measured
        # interval covers the actual GPU work, not just the launch.
        torch.cuda.synchronize()
    infer_time = time.perf_counter() - t0
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    print(f"\n{'=' * 60}")
    print(f"Transcript: '{transcription}' (noise input — blank expected)")
    print(f"Inference time: {infer_time * 1000:.0f} ms")
    print("\nBaseline verification PASSED.")
    print(f"{'=' * 60}")


if __name__ == "__main__":
    main()