import os
import streamlit as st
from groq import Groq
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
from espnet2.bin.tts_inference import Text2Speech
import soundfile as sf
from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
import av
import numpy as np
import nltk
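
# Voice-and-text chatbot app: transcribes speech with Whisper, generates
# replies with the Groq chat API, and speaks them back with ESPnet TTS.
# Run with: streamlit run <this_file>.py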

# Download the NLTK data used by the English g2p front end during TTS
nltk.download("averaged_perceptron_tagger", quiet=True)
nltk.download("cmudict", quiet=True)

# Load Groq API key from environment secrets
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("Groq API key not found. Please add it as a secret.")
    st.stop()
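# On hosted platforms the key can be set through the platform's secrets
# manager; locally, e.g. `export GROQ_API_KEY=...` before launching the app.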

# Initialize Groq client
groq_client = Groq(api_key=GROQ_API_KEY)

# Load models once and cache them across Streamlit reruns
@st.cache_resource
def load_models():
    # Speech-to-Text
    processor = AutoProcessor.from_pretrained("openai/whisper-small")
    stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
    stt_pipe = pipeline(
        "automatic-speech-recognition",
        model=stt_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        return_timestamps=True  # Enable timestamps for long-form audio
    )
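    # Note: for recordings longer than ~30 s, also passing chunk_length_s=30
    # to the pipeline enables chunked long-form transcription.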

    # Text-to-Speech
    tts_model = Text2Speech.from_pretrained("espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet")

    return stt_pipe, tts_model

stt_pipe, tts_model = load_models()
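# The first run downloads both checkpoints from the Hugging Face Hub, so
# startup may take a few minutes.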

# Collects raw audio frames from the WebRTC stream so they can be saved later
class AudioRecorder(AudioProcessorBase):
    def __init__(self):
        self.audio_frames = []
        self.sample_rate = None

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # Remember the stream's sample rate (WebRTC audio is typically 48 kHz)
        self.sample_rate = frame.sample_rate
        # Frames are assumed to arrive as (channels, samples) int16 arrays
        self.audio_frames.append(frame.to_ndarray())
        return frame

# Streamlit app
st.title("Voice and Text Chatbot")

# Sidebar for mode selection
mode = st.sidebar.radio("Select Mode", ["Text Chatbot", "Voice Chatbot"])

if mode == "Text Chatbot":
    # Text Chatbot
    st.header("Text Chatbot")
    user_input = st.text_input("Enter your message:")

    if user_input:
        try:
            # Generate response using Groq API
            chat_completion = groq_client.chat.completions.create(
                messages=[{"role": "user", "content": user_input}],
                model="mixtral-8x7b-32768",
                temperature=0.5,
                max_tokens=1024
            )
            response = chat_completion.choices[0].message.content
            st.write("Generated Response:", response)

            # Convert the response to speech. Text2Speech returns a dict whose
            # "wav" entry is a torch tensor; multi-speaker models may also need
            # a speaker embedding passed via the `spembs` argument.
            tts_output = tts_model(response)
            sf.write("response.wav", tts_output["wav"].numpy(), tts_model.fs)
            st.audio("response.wav")
        except Exception as e:
            st.error(f"Error generating response: {e}")

elif mode == "Voice Chatbot":
    # Voice Chatbot
    st.header("Voice Chatbot")

    # Audio recorder
    st.write("Record your voice:")
    webrtc_ctx = webrtc_streamer(
        key="audio-recorder",
        mode=WebRtcMode.SENDONLY,
        audio_processor_factory=AudioRecorder,
        media_stream_constraints={"audio": True, "video": False},
    )
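    # SENDONLY mode streams the browser microphone to the server and returns
    # no media to the client.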

    if webrtc_ctx.audio_processor:
        st.write("Recording... Press 'Stop' to finish recording.")

        # Save the recorded audio to a WAV file and transcribe it
        if st.button("Stop and Process Recording"):
            audio_frames = webrtc_ctx.audio_processor.audio_frames
            sample_rate = webrtc_ctx.audio_processor.sample_rate or 48000
            if audio_frames:
                # Join frames along the time axis, then downmix the assumed
                # (channels, samples) int16 layout to mono float32 in [-1, 1]
                audio_data = np.concatenate(audio_frames, axis=1)
                audio_data = audio_data.astype(np.float32).mean(axis=0) / 32768.0
                # Save at the stream's actual sample rate, not a hard-coded one
                sf.write("recorded_audio.wav", audio_data, samplerate=sample_rate)
                st.success("Recording saved as recorded_audio.wav")

                # Transcribe with timestamps; passing the sampling rate lets the
                # pipeline resample to the 16 kHz that Whisper expects
                speech, sr = sf.read("recorded_audio.wav", dtype="float32")
                output = stt_pipe({"raw": speech, "sampling_rate": sr})

                # Display the full transcribed text
                st.write("Transcribed Text:", output['text'])

                # Display the text with timestamps (optional)
                if 'chunks' in output:
                    st.write("Transcribed Text with Timestamps:")
                    for chunk in output['chunks']:
                        start, end = chunk['timestamp']
                        # The end timestamp of the final chunk can be None
                        end_str = f"{end:.2f}" if end is not None else "?"
                        st.write(f"{start:.2f} - {end_str}: {chunk['text']}")

                # Generate response using Groq API
                try:
                    chat_completion = groq_client.chat.completions.create(
                        messages=[{"role": "user", "content": output['text']}],
                        model="mixtral-8x7b-32768",
                        temperature=0.5,
                        max_tokens=1024
                    )
                    response = chat_completion.choices[0].message.content
                    st.write("Generated Response:", response)

                    # Convert the response to speech (see the note in text mode
                    # about multi-speaker models and `spembs`)
                    tts_output = tts_model(response)
                    sf.write("response.wav", tts_output["wav"].numpy(), tts_model.fs)
                    st.audio("response.wav")
                except Exception as e:
                    st.error(f"Error generating response: {e}")
            else:
                st.error("No audio recorded. Please try again.")