Prevent acoustic feedback loops in voice chat by ignoring mic input during playback, and terminate the WebSocket loop immediately when the connection drops
Browse files
backend/main.py
CHANGED
|
@@ -247,7 +247,6 @@ def chat_endpoint(request: ChatRequest):
|
|
| 247 |
detail=f"An error occurred: {str(e)}"
|
| 248 |
)
|
| 249 |
|
| 250 |
-
# WebSocket Endpoint for Socratic voice dialogue via Gemini Multimodal Live API
|
| 251 |
@app.websocket("/api/live-ws")
|
| 252 |
async def websocket_live_endpoint(websocket: WebSocket):
|
| 253 |
await websocket.accept()
|
|
@@ -258,10 +257,21 @@ async def websocket_live_endpoint(websocket: WebSocket):
|
|
| 258 |
await websocket.close(code=4000, reason="GEMINI_API_KEY is missing.")
|
| 259 |
return
|
| 260 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
try:
|
| 262 |
from google import genai
|
| 263 |
from google.genai import types
|
| 264 |
-
except ImportError:
|
|
|
|
| 265 |
await websocket.close(code=4001, reason="google-genai SDK not installed.")
|
| 266 |
return
|
| 267 |
|
|
@@ -285,15 +295,20 @@ async def websocket_live_endpoint(websocket: WebSocket):
|
|
| 285 |
try:
|
| 286 |
# Establish async WebSocket connection to Gemini Live using the Gemini 3.1 Flash Live model
|
| 287 |
async with client.aio.live.connect(model="gemini-3.1-flash-live-preview", config=config) as session:
|
|
|
|
| 288 |
|
| 289 |
async def receive_from_client():
|
| 290 |
try:
|
|
|
|
| 291 |
while True:
|
| 292 |
# Receive JSON from browser client
|
| 293 |
message = await websocket.receive_json()
|
| 294 |
msg_type = message.get("type")
|
| 295 |
|
| 296 |
if msg_type == "audio":
|
|
|
|
|
|
|
|
|
|
| 297 |
# Decode base64 PCM audio chunk sent from frontend
|
| 298 |
audio_bytes = base64.b64decode(message["data"])
|
| 299 |
# Stream real-time audio (using 'audio' instead of deprecated 'media')
|
|
@@ -301,12 +316,13 @@ async def websocket_live_endpoint(websocket: WebSocket):
|
|
| 301 |
audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
|
| 302 |
)
|
| 303 |
elif msg_type == "text":
|
|
|
|
| 304 |
# Send real-time text input
|
| 305 |
await session.send_realtime_input(text=message["data"])
|
| 306 |
except WebSocketDisconnect:
|
| 307 |
-
|
| 308 |
except Exception as e:
|
| 309 |
-
|
| 310 |
|
| 311 |
async def send_to_client():
|
| 312 |
try:
|
|
@@ -325,6 +341,7 @@ async def websocket_live_endpoint(websocket: WebSocket):
|
|
| 325 |
})
|
| 326 |
elif part.text is not None:
|
| 327 |
# Stream text transcription back to client
|
|
|
|
| 328 |
await websocket.send_json({
|
| 329 |
"type": "text",
|
| 330 |
"data": part.text
|
|
@@ -332,16 +349,27 @@ async def websocket_live_endpoint(websocket: WebSocket):
|
|
| 332 |
|
| 333 |
# Handle turn completion (model finished speaking)
|
| 334 |
if server_content.turn_complete:
|
|
|
|
| 335 |
await websocket.send_json({"type": "turn_complete"})
|
| 336 |
except Exception as e:
|
| 337 |
-
|
| 338 |
|
| 339 |
-
# Run both tasks concurrently
|
| 340 |
-
await asyncio.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
|
| 342 |
except Exception as e:
|
| 343 |
-
|
| 344 |
finally:
|
|
|
|
| 345 |
try:
|
| 346 |
await websocket.close()
|
| 347 |
except Exception:
|
|
|
|
| 247 |
detail=f"An error occurred: {str(e)}"
|
| 248 |
)
|
| 249 |
|
|
|
|
| 250 |
@app.websocket("/api/live-ws")
|
| 251 |
async def websocket_live_endpoint(websocket: WebSocket):
|
| 252 |
await websocket.accept()
|
|
|
|
| 257 |
await websocket.close(code=4000, reason="GEMINI_API_KEY is missing.")
|
| 258 |
return
|
| 259 |
|
| 260 |
+
def ws_log(msg: str):
    """Append one timestamped debug line to ``ws_debug.log`` beside this module.

    Each line has the form ``[HH:MM:SS.mmm] <msg>``. Logging is best-effort:
    any exception raised while opening or writing the file is swallowed, so a
    logging failure never propagates to the caller.

    Args:
        msg: Text to record after the timestamp.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    log_path = os.path.join(module_dir, "ws_debug.log")
    try:
        with open(log_path, "a", encoding="utf-8") as handle:
            # %f yields microseconds; trimming three digits leaves milliseconds.
            stamp = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            handle.write(f"[{stamp}] {msg}\n")
    except Exception:
        pass
|
| 267 |
+
|
| 268 |
+
ws_log("Client WebSocket connected. Initializing Live connection...")
|
| 269 |
+
|
| 270 |
try:
|
| 271 |
from google import genai
|
| 272 |
from google.genai import types
|
| 273 |
+
except ImportError as e:
|
| 274 |
+
ws_log(f"ImportError: google-genai not installed. {e}")
|
| 275 |
await websocket.close(code=4001, reason="google-genai SDK not installed.")
|
| 276 |
return
|
| 277 |
|
|
|
|
| 295 |
try:
|
| 296 |
# Establish async WebSocket connection to Gemini Live using the Gemini 3.1 Flash Live model
|
| 297 |
async with client.aio.live.connect(model="gemini-3.1-flash-live-preview", config=config) as session:
|
| 298 |
+
ws_log("Successfully connected to Gemini Live session.")
|
| 299 |
|
| 300 |
async def receive_from_client():
|
| 301 |
try:
|
| 302 |
+
audio_chunk_count = 0
|
| 303 |
while True:
|
| 304 |
# Receive JSON from browser client
|
| 305 |
message = await websocket.receive_json()
|
| 306 |
msg_type = message.get("type")
|
| 307 |
|
| 308 |
if msg_type == "audio":
|
| 309 |
+
audio_chunk_count += 1
|
| 310 |
+
if audio_chunk_count % 50 == 1:
|
| 311 |
+
ws_log(f"Received audio chunk {audio_chunk_count} from client.")
|
| 312 |
# Decode base64 PCM audio chunk sent from frontend
|
| 313 |
audio_bytes = base64.b64decode(message["data"])
|
| 314 |
# Stream real-time audio (using 'audio' instead of deprecated 'media')
|
|
|
|
| 316 |
audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
|
| 317 |
)
|
| 318 |
elif msg_type == "text":
|
| 319 |
+
ws_log(f"Received text query from client: {message['data']}")
|
| 320 |
# Send real-time text input
|
| 321 |
await session.send_realtime_input(text=message["data"])
|
| 322 |
except WebSocketDisconnect:
|
| 323 |
+
ws_log("Client WebSocket disconnected (WebSocketDisconnect).")
|
| 324 |
except Exception as e:
|
| 325 |
+
ws_log(f"[WebSocket Proxy Client -> Gemini] Error: {e}")
|
| 326 |
|
| 327 |
async def send_to_client():
|
| 328 |
try:
|
|
|
|
| 341 |
})
|
| 342 |
elif part.text is not None:
|
| 343 |
# Stream text transcription back to client
|
| 344 |
+
ws_log(f"Streaming text chunk from Gemini: {part.text}")
|
| 345 |
await websocket.send_json({
|
| 346 |
"type": "text",
|
| 347 |
"data": part.text
|
|
|
|
| 349 |
|
| 350 |
# Handle turn completion (model finished speaking)
|
| 351 |
if server_content.turn_complete:
|
| 352 |
+
ws_log("Gemini sent turn_complete.")
|
| 353 |
await websocket.send_json({"type": "turn_complete"})
|
| 354 |
except Exception as e:
|
| 355 |
+
ws_log(f"[WebSocket Proxy Gemini -> Client] Error: {e}")
|
| 356 |
|
| 357 |
+
# Run both tasks concurrently and terminate when the first one finishes
|
| 358 |
+
done, pending = await asyncio.wait(
|
| 359 |
+
[
|
| 360 |
+
asyncio.create_task(receive_from_client()),
|
| 361 |
+
asyncio.create_task(send_to_client())
|
| 362 |
+
],
|
| 363 |
+
return_when=asyncio.FIRST_COMPLETED
|
| 364 |
+
)
|
| 365 |
+
|
| 366 |
+
for task in pending:
|
| 367 |
+
task.cancel()
|
| 368 |
|
| 369 |
except Exception as e:
|
| 370 |
+
ws_log(f"WebSocket Gemini Live connection failed: {e}")
|
| 371 |
finally:
|
| 372 |
+
ws_log("Closing WebSocket and cleaning up.")
|
| 373 |
try:
|
| 374 |
await websocket.close()
|
| 375 |
except Exception:
|
frontend/src/components/VoiceSessionModal.jsx
CHANGED
|
@@ -72,6 +72,11 @@ export default function VoiceSessionModal({ isOpen, onClose, apiKey }) {
|
|
| 72 |
if (isMutedRef.current) return;
|
| 73 |
if (statusRef.current === 'speaking' || statusRef.current === 'connecting') return;
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
const inputData = e.inputBuffer.getChannelData(0);
|
| 76 |
// Convert Float32 to Int16 PCM
|
| 77 |
const pcmData = new Int16Array(inputData.length);
|
|
|
|
| 72 |
if (isMutedRef.current) return;
|
| 73 |
if (statusRef.current === 'speaking' || statusRef.current === 'connecting') return;
|
| 74 |
|
| 75 |
+
const pContext = playbackContextRef.current;
|
| 76 |
+
if (pContext && pContext.currentTime < nextPlayTimeRef.current) {
|
| 77 |
+
return;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
const inputData = e.inputBuffer.getChannelData(0);
|
| 81 |
// Convert Float32 to Int16 PCM
|
| 82 |
const pcmData = new Int16Array(inputData.length);
|