Spaces:

MSGEncrypted
/

lesson-agent-dev

Sleeping

lesson-agent-dev / libs /echocoach /src /echocoach /voiceout.py

MSG

Feat/enhance UI and monorepo (#9)

7a28b9f 12 days ago

3.69 kB

	"""Reusable VoiceOut helpers for TeacherVoice, Chat, and other tabs."""

	from __future__ import annotations

	import re

	from echocoach.config import get_echo_coach_config, outputs_dir
	from echocoach.tts.piper import get_tts_backend


	def strip_references_for_tts(text: str) -> str:
	    """Cut off a trailing references section so it is not read aloud.

	    The text is truncated at the first occurrence of a ``## References``
	    heading, and then of a bare ``References`` heading, where each heading
	    must be preceded by two newlines. The remainder is returned with
	    surrounding whitespace stripped.
	    """
	    spoken = text
	    for heading in ("\n\n## References", "\n\nReferences"):
	        # partition() hands back the whole string when the heading is absent,
	        # so no membership test is needed first.
	        spoken, _, _ = spoken.partition(heading)
	    return spoken.strip()


	def extract_message_text(content: object) -> str:
	    """Normalize Gradio chat content (plain string or message blocks) to text.

	    Accepted shapes:

	    * ``None`` -> ``""``.
	    * ``str`` -> the stripped string.
	    * ``dict`` (a single block) -> ``""`` when it has a truthy ``"path"`` or
	      ``"file"`` key; otherwise the stripped string form of its ``"text"``
	      value, falling back to ``"content"``.
	    * ``list`` / ``tuple`` of blocks -> the non-empty text of each block joined
	      with newlines. String and dict blocks follow the rules above; any other
	      block is converted with ``str()``.
	    * anything else -> ``str(content).strip()``.
	    """
	    if content is None:
	        return ""
	    if isinstance(content, str):
	        return content.strip()
	    if isinstance(content, dict):
	        # A bare dict is treated as one block. Falling through to str() would
	        # return the dict's repr (or a file payload) as if it were message text.
	        if content.get("path") or content.get("file"):
	            return ""
	        return str(content.get("text") or content.get("content") or "").strip()
	    if isinstance(content, (list, tuple)):
	        parts: list[str] = []
	        for block in content:
	            if isinstance(block, (str, dict)):
	                text = extract_message_text(block)
	            else:
	                text = str(block).strip()
	            # File blocks and blank strings yield "" and are dropped here.
	            if text:
	                parts.append(text)
	        return "\n".join(parts)
	    return str(content).strip()


	def last_assistant_message(history: list) -> str | None:
	    """Return the most recent assistant message from Gradio chat history.

	    The history is scanned newest-first and two item shapes are recognized:

	    * a dict whose ``"role"`` is ``"assistant"``: its ``"content"`` is
	      normalized with ``extract_message_text``. The scan stops at this item
	      even if the normalized text is empty, in which case ``None`` is
	      returned.
	    * a two-element list/tuple with a truthy second element: that element is
	      normalized the same way. Pairs with a falsy second element are skipped.

	    Returns ``None`` when *history* is empty or ``None``, or when no item
	    matches.
	    """
	    for item in reversed(history or []):
	        if isinstance(item, dict) and item.get("role") == "assistant":
	            return extract_message_text(item.get("content")) or None
	        if isinstance(item, (list, tuple)) and len(item) == 2 and item[1]:
	            return extract_message_text(item[1]) or None
	    return None


	def split_sentences(text: str) -> list[str]:
	    """Split *text* into sentences.

	    A boundary is any run of whitespace that directly follows ``.``, ``!`` or
	    ``?``; the punctuation stays attached to its sentence. Blank pieces are
	    discarded, so empty or whitespace-only input yields an empty list.
	    """
	    pieces = re.split(r"(?<=[.!?])\s+", text.strip())
	    trimmed = (piece.strip() for piece in pieces)
	    return [sentence for sentence in trimmed if sentence]


	def synthesize_voice_reply(
	    text: str,
	    *,
	    language: str,
	    tts_preset: str | None = None,
	    chunk_first: bool = True,
	    out_subdir: str = "voiceout",
	) -> tuple[str | None, str | None, str | None]:
	    """Return (full_wav, first_sentence_wav, warning).

	    Args:
	        text: Text to synthesize. ``None``, empty, or whitespace-only text
	            short-circuits with a warning and no backend call.
	        language: Passed through to the TTS backend's ``synthesize``.
	        tts_preset: Backend preset; when falsy, ``config.tts_preset`` from
	            ``get_echo_coach_config()`` is used.
	        chunk_first: When true, also produce audio for the first sentence
	            alone.
	        out_subdir: Sub-directory of ``outputs_dir()`` handed to the backend
	            as ``out_dir``.

	    Returns:
	        ``(full_path, first_path, warning)``. ``first_path`` is ``None`` when
	        *chunk_first* is false. With a single-sentence text it is the same
	        value as ``full_path`` (no second synthesis). The warning from the
	        full synthesis takes precedence over the first-sentence warning.
	    """
	    # Guard against None as well as blank text so callers get the warning
	    # tuple instead of an AttributeError.
	    if not text or not text.strip():
	        return None, None, "No text to synthesize."

	    config = get_echo_coach_config()
	    preset = tts_preset or config.tts_preset
	    tts = get_tts_backend(preset)
	    out_dir = outputs_dir() / out_subdir
	    full_path, warning = tts.synthesize(text, language=language, out_dir=out_dir)

	    first_path = None
	    if chunk_first:
	        sentences = split_sentences(text)
	        if len(sentences) > 1:
	            first_path, first_warning = tts.synthesize(
	                sentences[0],
	                language=language,
	                out_dir=out_dir,
	            )
	            # Only surface the first-sentence warning when the full
	            # synthesis did not already report one.
	            if first_warning and not warning:
	                warning = first_warning
	        elif full_path:
	            # Zero or one sentence: the full audio doubles as the first chunk.
	            first_path = full_path

	    return full_path, first_path, warning


	def speak_assistant_text(
	    text: str,
	    *,
	    language: str = "en",
	    tts_preset: str | None = None,
	    first_sentence_only: bool = False,
	) -> tuple[str | None, str | None, str | None]:
	    """Synthesize assistant reply audio. Returns (playback_path, alt_path, warning).

	    The text is first passed through ``strip_references_for_tts`` and then
	    through ``synthesize_voice_reply`` with ``chunk_first=True``, so both the
	    full audio and the first-sentence audio are requested in either mode.

	    Args:
	        text: Assistant reply text.
	        language: Forwarded to ``synthesize_voice_reply``.
	        tts_preset: Forwarded to ``synthesize_voice_reply``.
	        first_sentence_only: Selects which audio is returned for playback.

	    Returns:
	        When *first_sentence_only* is true:
	        ``(first_path or full_path, full_path, warning)``.
	        Otherwise: ``(full_path or first_path, first_path, warning)``.
	    """
	    clean = strip_references_for_tts(text)
	    full_path, first_path, warning = synthesize_voice_reply(
	        clean,
	        language=language,
	        tts_preset=tts_preset,
	        chunk_first=True,
	    )
	    if first_sentence_only:
	        # Prefer the short clip, falling back to the full audio if it is missing.
	        return first_path or full_path, full_path, warning
	    return full_path or first_path, first_path, warning