Dua Rajper committed on
Create app.py
app.py ADDED
import os
import streamlit as st
from groq import Groq
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
from espnet2.bin.tts_inference import Text2Speech
import soundfile as sf
from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
import av
import numpy as np

# Load Groq API key from environment secrets
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("Groq API key not found. Please add it as a secret.")
    st.stop()

# Initialize Groq client
groq_client = Groq(api_key=GROQ_API_KEY)

# Load models once per process; st.cache_resource keeps them across reruns
@st.cache_resource
def load_models():
    # Speech-to-Text: Whisper via the transformers ASR pipeline
    processor = AutoProcessor.from_pretrained("openai/whisper-small")
    stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
    stt_pipe = pipeline(
        "automatic-speech-recognition",
        model=stt_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        return_timestamps=True,  # enable timestamps for long-form audio
    )

    # Text-to-Speech
    tts_model = Text2Speech.from_pretrained("espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet")

    return stt_pipe, tts_model

stt_pipe, tts_model = load_models()

# Audio recorder that accumulates mono float32 chunks from the browser
class AudioRecorder(AudioProcessorBase):
    def __init__(self):
        self.audio_frames = []
        self.sample_rate = None

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        self.sample_rate = frame.sample_rate
        # WebRTC delivers packed s16 frames, so to_ndarray() is interleaved;
        # de-interleave to (samples, channels), average to mono, scale to [-1, 1]
        pcm = frame.to_ndarray().reshape(-1, len(frame.layout.channels))
        self.audio_frames.append(pcm.mean(axis=1).astype(np.float32) / 32768.0)
        return frame

# Streamlit app
st.title("Voice and Text Chatbot")

# Sidebar for mode selection
mode = st.sidebar.radio("Select Mode", ["Text Chatbot", "Voice Chatbot"])

if mode == "Text Chatbot":
    st.header("Text Chatbot")
    user_input = st.text_input("Enter your message:")

    if user_input:
        try:
            # Generate a response with the Groq API
            chat_completion = groq_client.chat.completions.create(
                messages=[{"role": "user", "content": user_input}],
                model="mixtral-8x7b-32768",
                temperature=0.5,
                max_tokens=1024,
            )
            response = chat_completion.choices[0].message.content
            st.write("Generated Response:", response)

            # Convert the response to speech. Text2Speech returns a dict whose
            # "wav" entry is the waveform tensor; tts_model.fs is the model's
            # sample rate. (Multi-speaker models may additionally need a
            # speaker embedding via spembs= or a speaker id via sids=.)
            tts_output = tts_model(response)
            sf.write("response.wav", tts_output["wav"].numpy(), tts_model.fs)
            st.audio("response.wav")
        except Exception as e:
            st.error(f"Error generating response: {e}")

elif mode == "Voice Chatbot":
    st.header("Voice Chatbot")

    # Audio recorder
    st.write("Record your voice:")
    webrtc_ctx = webrtc_streamer(
        key="audio-recorder",
        mode=WebRtcMode.SENDONLY,
        audio_processor_factory=AudioRecorder,
        media_stream_constraints={"audio": True, "video": False},
    )

    if webrtc_ctx.audio_processor:
        st.write("Recording... Press 'Stop' to finish recording.")

        if st.button("Stop and Process Recording"):
            audio_frames = webrtc_ctx.audio_processor.audio_frames
            sample_rate = webrtc_ctx.audio_processor.sample_rate
            if audio_frames:
                # Combine the mono chunks into a single waveform and save it
                # at the stream's real rate (WebRTC audio is typically 48 kHz)
                audio_data = np.concatenate(audio_frames)
                sf.write("recorded_audio.wav", audio_data, samplerate=sample_rate)
                st.success("Recording saved as recorded_audio.wav")

                # Transcribe; passing the true sampling rate lets the pipeline
                # resample to the 16 kHz Whisper expects (requires torchaudio)
                output = stt_pipe({"raw": audio_data, "sampling_rate": sample_rate})

                # Display the full transcribed text
                st.write("Transcribed Text:", output["text"])

                # Display the text with timestamps (optional)
                if "chunks" in output:
                    st.write("Transcribed Text with Timestamps:")
                    for chunk in output["chunks"]:
                        st.write(f"{chunk['timestamp'][0]:.2f} - {chunk['timestamp'][1]:.2f}: {chunk['text']}")

                # Generate a response with the Groq API
                try:
                    chat_completion = groq_client.chat.completions.create(
                        messages=[{"role": "user", "content": output["text"]}],
                        model="mixtral-8x7b-32768",
                        temperature=0.5,
                        max_tokens=1024,
                    )
                    response = chat_completion.choices[0].message.content
                    st.write("Generated Response:", response)

                    # Convert the response to speech (same dict contract as above)
                    tts_output = tts_model(response)
                    sf.write("response.wav", tts_output["wav"].numpy(), tts_model.fs)
                    st.audio("response.wav")
                except Exception as e:
                    st.error(f"Error generating response: {e}")
            else:
                st.error("No audio recorded. Please try again.")
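
For the Space to build, a requirements.txt next to app.py is also needed. A minimal sketch derived from the imports above; the package names are the usual PyPI ones, versions are unpinned, and this list is an assumption rather than part of the commit:

streamlit
streamlit-webrtc
av
numpy
soundfile
groq
transformers
torch
torchaudio  # used by the ASR pipeline to resample non-16 kHz input
espnet
espnet_model_zoo  # needed for Text2Speech.from_pretrained to resolve model tags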