# Voice and Text Chatbot — Streamlit app (Hugging Face Space).
import os
import streamlit as st
from groq import Groq
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline
from espnet2.bin.tts_inference import Text2Speech
import soundfile as sf
from pydub import AudioSegment
import io
from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase
import av
import numpy as np
import nltk
# One-time NLTK data required by the ESPnet TTS text front-end
# (POS tagging + CMU pronouncing dictionary for grapheme-to-phoneme).
nltk.download("averaged_perceptron_tagger")
nltk.download("cmudict")

# Load the Groq API key from environment secrets; the app cannot run without it.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("Groq API key not found. Please add it as a secret.")
    st.stop()

# Shared Groq client used by both chatbot modes.
groq_client = Groq(api_key=GROQ_API_KEY)
# Load models
@st.cache_resource  # cache across Streamlit reruns so the models load only once
def load_models():
    """Load and cache the speech models used by the app.

    Returns:
        tuple: ``(stt_pipe, tts_model)`` where ``stt_pipe`` is a Hugging Face
        automatic-speech-recognition pipeline built on Whisper-small and
        ``tts_model`` is an ESPnet ``Text2Speech`` instance.
    """
    # Speech-to-Text: Whisper-small, with timestamps enabled so long-form
    # audio (> 30 s) can be transcribed chunk by chunk.
    processor = AutoProcessor.from_pretrained("openai/whisper-small")
    stt_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small")
    stt_pipe = pipeline(
        "automatic-speech-recognition",
        model=stt_model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        return_timestamps=True,  # enables 'chunks' in the pipeline output
    )

    # Text-to-Speech: ESPnet multi-speaker VCTK model.
    tts_model = Text2Speech.from_pretrained(
        "espnet/espnet_tts_vctk_espnet_spk_voxceleb12_rawnet"
    )
    return stt_pipe, tts_model


stt_pipe, tts_model = load_models()
# Audio recorder
class AudioRecorder(AudioProcessorBase):
    """Accumulates raw audio frames received from the WebRTC stream."""

    def __init__(self):
        # Each entry is one frame as returned by av.AudioFrame.to_ndarray().
        self.audio_frames = []

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # Store the frame's samples and pass the frame through unchanged.
        self.audio_frames.append(frame.to_ndarray())
        return frame
# Streamlit app
# Streamlit app
st.title("Voice and Text Chatbot")

# Sidebar for mode selection
mode = st.sidebar.radio("Select Mode", ["Text Chatbot", "Voice Chatbot"])

if mode == "Text Chatbot":
    st.header("Text Chatbot")
    user_input = st.text_input("Enter your message:")
    if user_input:
        try:
            # Generate a reply via the Groq chat-completion API.
            # NOTE(review): "mixtral-8x7b-32768" has been deprecated by Groq;
            # switch to a currently supported model if requests start failing.
            chat_completion = groq_client.chat.completions.create(
                messages=[{"role": "user", "content": user_input}],
                model="mixtral-8x7b-32768",
                temperature=0.5,
                max_tokens=1024,
            )
            response = chat_completion.choices[0].message.content
            st.write("Generated Response:", response)

            # Convert the reply to speech. ESPnet's Text2Speech __call__
            # returns a dict whose waveform tensor is under "wav", and the
            # model's native sample rate is exposed as ``tts_model.fs``.
            # (The original code read a non-existent ``tts_model.spembs``
            # attribute, which raised AttributeError and was silently caught
            # below. Multi-speaker models take a speaker embedding via the
            # ``spembs=`` kwarg — TODO: supply one if this model requires it.)
            tts_output = tts_model(response)
            sf.write("response.wav", tts_output["wav"].numpy(), tts_model.fs)
            st.audio("response.wav")
        except Exception as e:
            st.error(f"Error generating response: {e}")
elif mode == "Voice Chatbot":
# Voice Chatbot
st.header("Voice Chatbot")
# Audio recorder
st.write("Record your voice:")
webrtc_ctx = webrtc_streamer(
key="audio-recorder",
mode=WebRtcMode.SENDONLY,
audio_processor_factory=AudioRecorder,
media_stream_constraints={"audio": True, "video": False},
)
if webrtc_ctx.audio_processor:
st.write("Recording... Press 'Stop' to finish recording.")
# Save recorded audio to a WAV file
if st.button("Stop and Process Recording"):
audio_frames = webrtc_ctx.audio_processor.audio_frames
if audio_frames:
# Combine audio frames into a single array
audio_data = np.concatenate(audio_frames)
# Save as WAV file
sf.write("recorded_audio.wav", audio_data, samplerate=16000)
st.success("Recording saved as recorded_audio.wav")
# Process the recorded audio
speech, _ = sf.read("recorded_audio.wav")
output = stt_pipe(speech) # Transcribe with timestamps
# Display the full transcribed text
st.write("Transcribed Text:", output['text'])
# Display the text with timestamps (optional)
if 'chunks' in output:
st.write("Transcribed Text with Timestamps:")
for chunk in output['chunks']:
st.write(f"{chunk['timestamp'][0]:.2f} - {chunk['timestamp'][1]:.2f}: {chunk['text']}")
# Generate response using Groq API
try:
chat_completion = groq_client.chat.completions.create(
messages=[{"role": "user", "content": output['text']}],
model="mixtral-8x7b-32768",
temperature=0.5,
max_tokens=1024
)
response = chat_completion.choices[0].message.content
st.write("Generated Response:", response)
# Convert response to speech
speech, *_ = tts_model(response, spembs=tts_model.spembs[0]) # Use the first speaker embedding
sf.write("response.wav", speech, 22050)
st.audio("response.wav")
except Exception as e:
st.error(f"Error generating response: {e}")
else:
st.error("No audio recorded. Please try again.") |