import sys

import torch
import torchaudio
from transformers import AutoModel, AutoProcessor


# Checkpoint location handed to ``from_pretrained``; "." is the current
# working directory.
REPO = "."
# Run on the GPU when torch reports one, otherwise on the CPU.
DEV = "cuda" if torch.cuda.is_available() else "cpu"


def to_mono(wav):
    """Collapse the leading dimension of *wav* to produce a 1-D signal.

    When dim 0 holds more than one entry the entries are averaged;
    otherwise dim 0 is simply squeezed away.

    Assumes *wav* is laid out channels-first, which is what
    ``torchaudio.load`` returns by default.
    """
    return wav.mean(0) if wav.shape[0] > 1 else wav.squeeze(0)


def load_model_and_processor(repo=REPO, device=DEV):
    """Load the model (moved to *device*, in eval mode) and its processor.

    Both are loaded from *repo* with ``trust_remote_code=True``, so code
    shipped alongside the checkpoint is executed.

    Returns:
        A ``(model, processor)`` tuple.
    """
    model = AutoModel.from_pretrained(repo, trust_remote_code=True)
    model = model.to(device).eval()
    proc = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
    return model, proc


def transcribe(path, model, proc, device=DEV):
    """Return the first decoded output the model generates for *path*.

    Args:
        path: Audio file readable by ``torchaudio.load``.
        model: Object exposing ``generate(**inputs)``.
        proc: Processor used both to featurize the waveform and to decode
            the generated ids via ``batch_decode``.
        device: Device the processed inputs are moved to.
    """
    wav, sr = torchaudio.load(path)
    # NOTE(review): the file's native sample rate is forwarded unchanged; if
    # the processor only accepts one fixed rate, resampling is needed first
    # -- confirm against the checkpoint's processor config.
    inputs = proc(to_mono(wav), sampling_rate=sr, return_tensors="pt").to(device)
    # The model comes from remote code, so its ``generate`` may not disable
    # autograd on its own; make inference-only execution explicit.
    with torch.no_grad():
        generated = model.generate(**inputs)
    return proc.batch_decode(generated)[0]


def main(argv=None):
    """Print one ``<path>\\t<decoded text>`` line per audio file.

    Args:
        argv: Audio file paths; defaults to ``sys.argv[1:]``.
    """
    paths = sys.argv[1:] if argv is None else argv
    model, proc = load_model_and_processor()
    for path in paths:
        print(f"{path}\t{transcribe(path, model, proc)}")


if __name__ == "__main__":
    main()