"""Transcribe audio files with a speech model stored in the working directory.

Usage: python <this script> FILE [FILE ...]

For every path given on the command line, one line is printed to stdout:
the path, a tab character, and the first decoded sequence produced by the
model for that file.
"""

import sys

import torch
import torchaudio
from transformers import AutoModel, AutoProcessor

# The checkpoint (weights, processor config, custom code) is read from the
# current working directory.
REPO = "."
DEV = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): trust_remote_code=True executes Python code shipped with the
# checkpoint in REPO -- only run this against a checkpoint you trust.
# These stay at module level so the names remain importable as before.
model = AutoModel.from_pretrained(REPO, trust_remote_code=True).to(DEV).eval()
proc = AutoProcessor.from_pretrained(REPO, trust_remote_code=True)


def _expected_sampling_rate():
    """Return the sampling rate advertised by the processor, or None.

    Looks for ``sampling_rate`` on ``proc.feature_extractor`` when that
    attribute exists, otherwise on ``proc`` itself. Anything that is not a
    positive integer is treated as "unknown".
    """
    extractor = getattr(proc, "feature_extractor", proc)
    rate = getattr(extractor, "sampling_rate", None)
    if isinstance(rate, int) and rate > 0:
        return rate
    return None


def _load_mono(path: str):
    """Load *path* and return ``(waveform, sampling_rate)`` as mono audio.

    Multi-channel audio is averaged across channels; single-channel audio
    just has its channel dimension dropped. If the processor advertises a
    sampling rate different from the file's, the waveform is resampled to
    it and the advertised rate is returned instead of the file's.
    """
    wav, sr = torchaudio.load(path)
    wav = wav.mean(0) if wav.shape[0] > 1 else wav.squeeze(0)

    # Presumably the processor only accepts audio at its own sampling rate
    # (stock Hugging Face feature extractors raise on a mismatch) -- TODO
    # confirm for this checkpoint's remote-code processor. When no rate is
    # advertised the audio is passed through untouched, as before.
    target_sr = _expected_sampling_rate()
    if target_sr is not None and sr != target_sr:
        wav = torchaudio.functional.resample(wav, sr, target_sr)
        sr = target_sr
    return wav, sr


def transcribe(path: str) -> str:
    """Return the first decoded sequence the model generates for *path*."""
    wav, sr = _load_mono(path)
    inputs = proc(wav, sampling_rate=sr, return_tensors="pt").to(DEV)
    # Inference only: make sure no autograd state is recorded even if the
    # remote-code ``generate`` does not disable it on its own.
    with torch.no_grad():
        generated = model.generate(**inputs)
    return proc.batch_decode(generated)[0]


def main(paths) -> None:
    """Print ``<path><TAB><transcription>`` for each path in *paths*."""
    for path in paths:
        print(f"{path}\t{transcribe(path)}")


if __name__ == "__main__":
    main(sys.argv[1:])