Stack-2-9-finetuned / stack /voice /stack_voice_integration.py

walidsobhie-code

refactor: Squeeze folders further - cleaner structure

65888d5 22 days ago

6.16 kB

	import io
	import json
	import os
	from typing import Optional, Union

	import requests

	from voice_client import VoiceClient

	class StackWithVoice:
	    """Bridge between the Stack 2.9 chat API and a voice synthesis service.

	    Prompts are POSTed to ``<stack_api_url>/api/chat`` through a shared
	    ``requests.Session``; the text replies are passed to a ``VoiceClient``
	    for speech synthesis. Speech-to-text is only a placeholder here (see
	    ``_audio_to_text``).
	    """

	    def __init__(
	        self,
	        stack_api_url: str,
	        voice_api_url: str = "http://localhost:8000",
	        request_timeout: Optional[float] = None,
	    ):
	        """Create the integration.

	        Args:
	            stack_api_url: Base URL of the Stack 2.9 API.
	            voice_api_url: Base URL handed to ``VoiceClient``.
	            request_timeout: Seconds to wait for the Stack API before the
	                request is aborted. ``None`` (the default) keeps the
	                previous behaviour of waiting without a limit.
	        """
	        self.stack_api_url = stack_api_url
	        self.voice_client = VoiceClient(voice_api_url)
	        self.session = requests.Session()
	        self.request_timeout = request_timeout

	        # Cache for voice models to avoid repeated API calls
	        self._voice_cache = {}

	    def _get_stack_response(self, prompt: str) -> str:
	        """Send ``prompt`` to the Stack 2.9 chat endpoint and return its reply.

	        Returns:
	            The ``"response"`` field of the JSON payload, or ``""`` when the
	            field is absent.

	        Raises:
	            Exception: If the HTTP request fails or returns an error status;
	                the underlying ``requests`` error is chained as the cause.
	        """
	        # Strip a trailing slash so a base URL like "http://host/" does not
	        # produce "http://host//api/chat".
	        url = self.stack_api_url.rstrip("/") + "/api/chat"
	        try:
	            response = self.session.post(
	                url,
	                json={"prompt": prompt, "model": "stack-2.9"},
	                headers={"Content-Type": "application/json"},
	                timeout=self.request_timeout,
	            )
	            response.raise_for_status()
	            data = response.json()
	        except requests.RequestException as e:
	            raise Exception(f"Stack API request failed: {str(e)}") from e

	        return data.get("response", "")

	    def _get_voice_model(self, voice_name: str) -> Optional[dict]:
	        """Look up a voice by name, consulting the local cache first.

	        Returns:
	            ``{"name": voice_name}`` when ``voice_client.list_voices()``
	            yields an entry equal to ``voice_name``; ``None`` when it does
	            not or when the lookup fails. Only successful lookups are
	            cached, and repeated calls return the same cached dict.
	        """
	        cached = self._voice_cache.get(voice_name)
	        if cached is not None:
	            return cached

	        # Best-effort lookup: any failure while listing or scanning voices
	        # is reported and treated as "voice unknown".
	        try:
	            voices = self.voice_client.list_voices()
	            found = any(voice == voice_name for voice in voices)
	        except Exception as e:
	            print(f"Warning: Failed to get voice models: {e}")
	            return None

	        if not found:
	            return None

	        model = {"name": voice_name}
	        self._voice_cache[voice_name] = model
	        return model

	    def voice_chat(self, prompt_audio_path: str, voice_name: str = "default") -> Optional[bytes]:
	        """Complete voice chat workflow: audio → text → response → audio.

	        Args:
	            prompt_audio_path: Path of the recorded prompt.
	            voice_name: Voice passed to ``voice_client.synthesize``.

	        Returns:
	            Whatever ``voice_client.synthesize`` returns for the reply, or
	            ``None`` when the prompt text or the Stack reply is empty.

	        Raises:
	            Exception: Propagated from ``_get_stack_response``.
	        """
	        # Step 1: Convert audio to text (placeholder speech-to-text)
	        print(f"Converting audio to text: {prompt_audio_path}")
	        prompt_text = self._audio_to_text(prompt_audio_path)
	        if not prompt_text:
	            return None

	        print(f"User prompt: {prompt_text}")

	        # Step 2: Get response from Stack 2.9
	        print("Getting response from Stack 2.9...")
	        response_text = self._get_stack_response(prompt_text)
	        if not response_text:
	            return None

	        print(f"Stack response: {response_text}")

	        # Step 3: Convert response to audio
	        print(f"Generating voice response with voice: {voice_name}")
	        return self.voice_client.synthesize(response_text, voice_name)

	    def _audio_to_text(self, audio_path: str) -> str:
	        """Convert audio to text (placeholder implementation).

	        No speech recognition is performed. If a ``.txt`` file sits next to
	        the audio file with the same base name, its stripped contents are
	        returned; otherwise a fixed test prompt is returned.
	        """
	        # Swap only the final extension. A plain str.replace would also
	        # rewrite ".wav"/".mp3" inside directory names, and would leave
	        # other extensions untouched so the audio itself got read as text.
	        base, _ = os.path.splitext(audio_path)
	        text_path = base + ".txt"

	        if os.path.isfile(text_path):
	            with open(text_path, "r", encoding="utf-8") as f:
	                return f.read().strip()

	        # Fallback: return a generic prompt
	        return "This is a test voice prompt."

	    def voice_command(self, command: str, voice_name: str = "default") -> Optional[bytes]:
	        """Execute a voice command and get a spoken response.

	        The command is not parsed; it is forwarded to Stack 2.9 verbatim.

	        Returns:
	            Whatever ``voice_client.synthesize`` returns for the reply, or
	            ``None`` when Stack 2.9 returns an empty reply.

	        Raises:
	            Exception: Propagated from ``_get_stack_response``.
	        """
	        print(f"Executing voice command: {command}")

	        response_text = self._get_stack_response(command)
	        if not response_text:
	            return None

	        print(f"Command response: {response_text}")

	        # Generate voice response
	        return self.voice_client.synthesize(response_text, voice_name)

	    def streaming_voice_chat(
	        self,
	        prompt_audio_path: str,
	        voice_name: str = "default",
	        output_path: str = "./streaming_response.wav",
	    ) -> None:
	        """Stream voice chat (placeholder implementation).

	        Nothing is actually streamed yet: the full reply is fetched, then
	        synthesized with ``stream=True`` and written out through
	        ``voice_client.download_audio``.

	        Args:
	            prompt_audio_path: Path of the recorded prompt.
	            voice_name: Voice passed to ``voice_client.synthesize``.
	            output_path: Destination handed to ``download_audio``.

	        Raises:
	            Exception: Propagated from ``_get_stack_response``.
	        """
	        print("Starting streaming voice chat...")

	        # Same guard as voice_chat: do not send an empty prompt to the API.
	        prompt_text = self._audio_to_text(prompt_audio_path)
	        if not prompt_text:
	            print("No prompt text available")
	            return

	        response_text = self._get_stack_response(prompt_text)
	        if not response_text:
	            print("No response received")
	            return

	        print("Streaming response:")
	        print(response_text)

	        # In a real streaming implementation, you would:
	        # 1. Stream audio chunks to speech-to-text
	        # 2. Send partial prompts to Stack 2.9
	        # 3. Stream partial responses to TTS
	        # 4. Play audio as it's generated

	        # For now, just generate the complete response
	        audio_data = self.voice_client.synthesize(response_text, voice_name, stream=True)

	        # NOTE(review): assumes synthesize signals failure with None (the
	        # Optional[bytes] return types above suggest so) - confirm against
	        # VoiceClient. Avoids reporting a saved file when nothing was made.
	        if audio_data is None:
	            print("No audio generated")
	            return

	        # Save to file for demonstration
	        self.voice_client.download_audio(audio_data, output_path)
	        print(f"Streaming response saved to: {output_path}")

	# Demo entry point: runs only when this file is executed as a script
	if __name__ == "__main__":
	    # Example endpoints: a local Stack 2.9 API and a local voice service
	    demo_endpoints = {
	        "stack_api_url": "http://localhost:5000",
	        "voice_api_url": "http://localhost:8000",
	    }
	    stack_voice = StackWithVoice(**demo_endpoints)

	    print("Testing Stack with Voice integration...")

	    # The calls below need both services running; uncomment to try them.

	    # Test voice chat
	    # audio_data = stack_voice.voice_chat("test_prompt.wav", "default")
	    # if audio_data:
	    #     stack_voice.voice_client.download_audio(audio_data, "stack_response.wav")
	    #     print("Voice chat response saved to stack_response.wav")

	    # Test voice command
	    # audio_data = stack_voice.voice_command("Write a Python function to calculate factorial", "default")
	    # if audio_data:
	    #     stack_voice.voice_client.download_audio(audio_data, "command_response.wav")
	    #     print("Voice command response saved to command_response.wav")

	    # Test streaming
	    # stack_voice.streaming_voice_chat("test_prompt.wav", "default")