from typing import Dict
from transformers.pipelines.audio_utils import ffmpeg_read
import whisper
import torch

SAMPLE_RATE = 16000


class EndpointHandler():
    def __init__(self, path=""):
        # load the Whisper model once when the endpoint starts
        self.model = whisper.load_model("large")

    def __call__(self, data: Dict[str, bytes]) -> Dict[str, str]:
        """
        Args:
            data (:obj:`dict`):
                includes the deserialized audio file as bytes under the "inputs" key
        Return:
            A :obj:`dict` with the transcribed text under the "text" key
        """
        # decode the raw audio bytes into a 16 kHz float32 waveform
        inputs = data.pop("inputs", data)
        audio_nparray = ffmpeg_read(inputs, SAMPLE_RATE)
        audio_tensor = torch.from_numpy(audio_nparray)

        # run the transcription on the waveform tensor
        result = self.model.transcribe(audio_tensor)

        # return only the text field to match the declared Dict[str, str]
        return {"text": result["text"]}
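
# A minimal local smoke test (a sketch, not part of the endpoint contract).
# It assumes an audio file named "sample.flac" exists next to this script and
# feeds its raw bytes to the handler, mirroring the payload the Inference
# Endpoints runtime would deliver.
if __name__ == "__main__":
    handler = EndpointHandler()
    with open("sample.flac", "rb") as f:
        payload = {"inputs": f.read()}
    print(handler(payload)["text"])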