import os
import streamlit as st
from groq import Groq
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
from espnet2.bin.tts_inference import Text2Speech
import soundfile as sf
from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
import av
import numpy as np
import nltk
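
# Voice-and-text chatbot app: transcribes speech with Whisper, generates
# replies with the Groq chat API, and speaks them back with ESPnet TTS.
# Run with: streamlit run <this_file>.py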

# Download the NLTK data used by the English g2p front end during TTS
nltk.download("averaged_perceptron_tagger", quiet=True)
nltk.download("cmudict", quiet=True)

# Load Groq API key from environment secrets
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("Groq API key not found. Please add it as a secret.")
    st.stop()
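# On hosted platforms the key can be set through the platform's secrets
# manager; locally, e.g. `export GROQ_API_KEY=...` before launching the app.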

# Initialize Groq client
groq_client = Groq(api_key=GROQ_API_KEY)

# Load models once and cache them across Streamlit reruns
@st.cache_resource
def load_models():
    # Speech-to-Text
    processor = AutoProcessor.from_pretrained("openai/whisper-small")
    stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
    stt_pipe = pipeline(
        "automatic-speech-recognition",
        model=stt_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        return_timestamps=True  # Enable timestamps for long-form audio
    )
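    # Note: for recordings longer than ~30 s, also passing chunk_length_s=30
    # to the pipeline enables chunked long-form transcription.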

    # Text-to-Speech
    tts_model = Text2Speech.from_pretrained("espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet")

    return stt_pipe, tts_model

stt_pipe, tts_model = load_models()
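# The first run downloads both checkpoints from the Hugging Face Hub, so
# startup may take a few minutes.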

# Collects raw audio frames from the WebRTC stream so they can be saved later
class AudioRecorder(AudioProcessorBase):
    def __init__(self):
        self.audio_frames = []
        self.sample_rate = None

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # Remember the stream's sample rate (WebRTC audio is typically 48 kHz)
        self.sample_rate = frame.sample_rate
        # Frames are assumed to arrive as (channels, samples) int16 arrays
        self.audio_frames.append(frame.to_ndarray())
        return frame

# Streamlit app
st.title("Voice and Text Chatbot")

# Sidebar for mode selection
mode = st.sidebar.radio("Select Mode", ["Text Chatbot", "Voice Chatbot"])

if mode == "Text Chatbot":
    # Text Chatbot
    st.header("Text Chatbot")
    user_input = st.text_input("Enter your message:")

    if user_input:
        try:
            # Generate response using Groq API
            chat_completion = groq_client.chat.completions.create(
                messages=[{"role": "user", "content": user_input}],
                model="mixtral-8x7b-32768",
                temperature=0.5,
                max_tokens=1024
            )
            response = chat_completion.choices[0].message.content
            st.write("Generated Response:", response)

            # Convert the response to speech. Text2Speech returns a dict whose
            # "wav" entry is a torch tensor; multi-speaker models may also need
            # a speaker embedding passed via the `spembs` argument.
            tts_output = tts_model(response)
            sf.write("response.wav", tts_output["wav"].numpy(), tts_model.fs)
            st.audio("response.wav")
        except Exception as e:
            st.error(f"Error generating response: {e}")

elif mode == "Voice Chatbot":
    # Voice Chatbot
    st.header("Voice Chatbot")

    # Audio recorder
    st.write("Record your voice:")
    webrtc_ctx = webrtc_streamer(
        key="audio-recorder",
        mode=WebRtcMode.SENDONLY,
        audio_processor_factory=AudioRecorder,
        media_stream_constraints={"audio": True, "video": False},
    )
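    # SENDONLY mode streams the browser microphone to the server and returns
    # no media to the client.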

    if webrtc_ctx.audio_processor:
        st.write("Recording... Press 'Stop' to finish recording.")

        # Save the recorded audio to a WAV file and transcribe it
        if st.button("Stop and Process Recording"):
            audio_frames = webrtc_ctx.audio_processor.audio_frames
            sample_rate = webrtc_ctx.audio_processor.sample_rate or 48000
            if audio_frames:
                # Join frames along the time axis, then downmix the assumed
                # (channels, samples) int16 layout to mono float32 in [-1, 1]
                audio_data = np.concatenate(audio_frames, axis=1)
                audio_data = audio_data.astype(np.float32).mean(axis=0) / 32768.0
                # Save at the stream's actual sample rate, not a hard-coded one
                sf.write("recorded_audio.wav", audio_data, samplerate=sample_rate)
                st.success("Recording saved as recorded_audio.wav")

                # Transcribe with timestamps; passing the sampling rate lets the
                # pipeline resample to the 16 kHz that Whisper expects
                speech, sr = sf.read("recorded_audio.wav", dtype="float32")
                output = stt_pipe({"raw": speech, "sampling_rate": sr})

                # Display the full transcribed text
                st.write("Transcribed Text:", output['text'])

                # Display the text with timestamps (optional)
                if 'chunks' in output:
                    st.write("Transcribed Text with Timestamps:")
                    for chunk in output['chunks']:
                        start, end = chunk['timestamp']
                        # The end timestamp of the final chunk can be None
                        end_str = f"{end:.2f}" if end is not None else "?"
                        st.write(f"{start:.2f} - {end_str}: {chunk['text']}")

                # Generate response using Groq API
                try:
                    chat_completion = groq_client.chat.completions.create(
                        messages=[{"role": "user", "content": output['text']}],
                        model="mixtral-8x7b-32768",
                        temperature=0.5,
                        max_tokens=1024
                    )
                    response = chat_completion.choices[0].message.content
                    st.write("Generated Response:", response)

                    # Convert the response to speech (see the note in text mode
                    # about multi-speaker models and `spembs`)
                    tts_output = tts_model(response)
                    sf.write("response.wav", tts_output["wav"].numpy(), tts_model.fs)
                    st.audio("response.wav")
                except Exception as e:
                    st.error(f"Error generating response: {e}")
            else:
                st.error("No audio recorded. Please try again.")