File size: 6,161 Bytes
fcb2b04 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | import requests
from typing import Optional, Union
import io
import json
from voice_client import VoiceClient
class StackWithVoice:
def __init__(self, stack_api_url: str, voice_api_url: str = "http://localhost:8000"):
self.stack_api_url = stack_api_url
self.voice_client = VoiceClient(voice_api_url)
self.session = requests.Session()
# Cache for voice models to avoid repeated API calls
self._voice_cache = {}
def _get_stack_response(self, prompt: str) -> str:
"""Get response from Stack 2.9 API"""
try:
response = self.session.post(
f"{self.stack_api_url}/api/chat",
json={"prompt": prompt, "model": "stack-2.9"},
headers={"Content-Type": "application/json"}
)
response.raise_for_status()
data = response.json()
return data.get("response", "")
except requests.RequestException as e:
raise Exception(f"Stack API request failed: {str(e)}")
def _get_voice_model(self, voice_name: str) -> Optional[dict]:
"""Get voice model info from cache or API"""
if voice_name in self._voice_cache:
return self._voice_cache[voice_name]
try:
voices = self.voice_client.list_voices()
for voice in voices:
if voice == voice_name:
self._voice_cache[voice_name] = {"name": voice_name}
return {"name": voice_name}
return None
except Exception as e:
print(f"Warning: Failed to get voice models: {e}")
return None
def voice_chat(self, prompt_audio_path: str, voice_name: str = "default") -> Optional[bytes]:
"""Complete voice chat workflow: audio → text → response → audio"""
# Step 1: Convert audio to text (placeholder - in real implementation, use speech-to-text)
print(f"Converting audio to text: {prompt_audio_path}")
prompt_text = self._audio_to_text(prompt_audio_path)
if not prompt_text:
return None
print(f"User prompt: {prompt_text}")
# Step 2: Get response from Stack 2.9
print("Getting response from Stack 2.9...")
response_text = self._get_stack_response(prompt_text)
if not response_text:
return None
print(f"Stack response: {response_text}")
# Step 3: Convert response to audio
print(f"Generating voice response with voice: {voice_name}")
audio_data = self.voice_client.synthesize(response_text, voice_name)
return audio_data
def _audio_to_text(self, audio_path: str) -> str:
"""Convert audio to text (placeholder implementation)"""
# In a real implementation, you would use a speech-to-text service
# For now, return a placeholder or read from a text file with the same name
text_path = audio_path.replace(".wav", ".txt").replace(".mp3", ".txt")
if os.path.exists(text_path):
with open(text_path, 'r') as f:
return f.read().strip()
# Fallback: return a generic prompt
return "This is a test voice prompt."
def voice_command(self, command: str, voice_name: str = "default") -> Optional[bytes]:
"""Execute voice command and get spoken response"""
print(f"Executing voice command: {command}")
# In a real implementation, you would parse the command and execute appropriate actions
# For now, just pass it to Stack 2.9 as-is
response_text = self._get_stack_response(command)
if not response_text:
return None
print(f"Command response: {response_text}")
# Generate voice response
audio_data = self.voice_client.synthesize(response_text, voice_name)
return audio_data
def streaming_voice_chat(self, prompt_audio_path: str, voice_name: str = "default") -> None:
"""Stream voice chat (placeholder implementation)"""
print("Starting streaming voice chat...")
# Get initial response
prompt_text = self._audio_to_text(prompt_audio_path)
response_text = self._get_stack_response(prompt_text)
if not response_text:
print("No response received")
return
print("Streaming response:")
print(response_text)
# In a real streaming implementation, you would:
# 1. Stream audio chunks to speech-to-text
# 2. Send partial prompts to Stack 2.9
# 3. Stream partial responses to TTS
# 4. Play audio as it's generated
# For now, just generate the complete response
audio_data = self.voice_client.synthesize(response_text, voice_name, stream=True)
# Save to file for demonstration
output_path = "./streaming_response.wav"
self.voice_client.download_audio(audio_data, output_path)
print(f"Streaming response saved to: {output_path}")
# Example usage
if __name__ == "__main__":
stack_voice = StackWithVoice(
stack_api_url="http://localhost:5000", # Example Stack 2.9 API URL
voice_api_url="http://localhost:8000"
)
print("Testing Stack with Voice integration...")
# Test voice chat
# audio_data = stack_voice.voice_chat("test_prompt.wav", "default")
# if audio_data:
# stack_voice.voice_client.download_audio(audio_data, "stack_response.wav")
# print("Voice chat response saved to stack_response.wav")
# Test voice command
# audio_data = stack_voice.voice_command("Write a Python function to calculate factorial", "default")
# if audio_data:
# stack_voice.voice_client.download_audio(audio_data, "command_response.wav")
# print("Voice command response saved to command_response.wav")
# Test streaming
# stack_voice.streaming_voice_chat("test_prompt.wav", "default") |