| | """Example for using HiggsAudio for generating both the transcript and audio in an interleaved manner.""" |
| |
|
| | from boson_multimodal.serve.serve_engine import HiggsAudioServeEngine, HiggsAudioResponse |
| | import torch |
| | import torchaudio |
| | import time |
| | from loguru import logger |
| | import click |
| |
|
| | from input_samples import INPUT_SAMPLES |
| |
|
| | MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base" |
| | AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer" |
| |
|
| |
|
@click.command()
@click.argument("example", type=click.Choice(list(INPUT_SAMPLES.keys())))
def main(example: str):
    """Generate interleaved transcript + audio for the chosen sample and save a WAV file."""
    chat_sample = INPUT_SAMPLES[example]()

    # Prefer GPU when one is visible to torch; otherwise run on CPU.
    run_device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Using device: {run_device}")

    engine = HiggsAudioServeEngine(
        MODEL_PATH,
        AUDIO_TOKENIZER_PATH,
        device=run_device,
    )

    logger.info("Starting generation...")
    t0 = time.time()
    response: HiggsAudioResponse = engine.generate(
        chat_ml_sample=chat_sample,
        max_new_tokens=1024,
        temperature=1.0,
        top_p=0.95,
        top_k=50,
        stop_strings=["<|end_of_text|>", "<|eot_id|>"],
    )
    elapsed_time = time.time() - t0
    logger.info(f"Generation time: {elapsed_time:.2f} seconds")

    # torchaudio.save expects a (channels, samples) tensor; the mono waveform
    # from the response gets a leading channel axis before saving.
    waveform = torch.from_numpy(response.audio)[None, :]
    torchaudio.save(f"output_{example}.wav", waveform, response.sampling_rate)
    logger.info(f"Generated text:\n{response.generated_text}")
    logger.info(f"Saved audio to output_{example}.wav")
|
| |
|
# Script entry point: click handles argument parsing when run directly.
if __name__ == "__main__":
    main()
| |
|