3v324v23 committed on
Commit
34529d9
·
1 Parent(s): 6f6528a

Prevent acoustic feedback loops in voice chat by ignoring mic input during playback, and terminate the WebSocket loop immediately when the connection drops

Browse files
backend/main.py CHANGED
@@ -247,7 +247,6 @@ def chat_endpoint(request: ChatRequest):
247
  detail=f"An error occurred: {str(e)}"
248
  )
249
 
250
- # WebSocket Endpoint for Socratic voice dialogue via Gemini Multimodal Live API
251
  @app.websocket("/api/live-ws")
252
  async def websocket_live_endpoint(websocket: WebSocket):
253
  await websocket.accept()
@@ -258,10 +257,21 @@ async def websocket_live_endpoint(websocket: WebSocket):
258
  await websocket.close(code=4000, reason="GEMINI_API_KEY is missing.")
259
  return
260
 
 
 
 
 
 
 
 
 
 
 
261
  try:
262
  from google import genai
263
  from google.genai import types
264
- except ImportError:
 
265
  await websocket.close(code=4001, reason="google-genai SDK not installed.")
266
  return
267
 
@@ -285,15 +295,20 @@ async def websocket_live_endpoint(websocket: WebSocket):
285
  try:
286
  # Establish async WebSocket connection to Gemini Live using the Gemini 3.1 Flash Live model
287
  async with client.aio.live.connect(model="gemini-3.1-flash-live-preview", config=config) as session:
 
288
 
289
  async def receive_from_client():
290
  try:
 
291
  while True:
292
  # Receive JSON from browser client
293
  message = await websocket.receive_json()
294
  msg_type = message.get("type")
295
 
296
  if msg_type == "audio":
 
 
 
297
  # Decode base64 PCM audio chunk sent from frontend
298
  audio_bytes = base64.b64decode(message["data"])
299
  # Stream real-time audio (using 'audio' instead of deprecated 'media')
@@ -301,12 +316,13 @@ async def websocket_live_endpoint(websocket: WebSocket):
301
  audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
302
  )
303
  elif msg_type == "text":
 
304
  # Send real-time text input
305
  await session.send_realtime_input(text=message["data"])
306
  except WebSocketDisconnect:
307
- pass
308
  except Exception as e:
309
- print(f"[WebSocket Proxy Client -> Gemini] Error: {e}")
310
 
311
  async def send_to_client():
312
  try:
@@ -325,6 +341,7 @@ async def websocket_live_endpoint(websocket: WebSocket):
325
  })
326
  elif part.text is not None:
327
  # Stream text transcription back to client
 
328
  await websocket.send_json({
329
  "type": "text",
330
  "data": part.text
@@ -332,16 +349,27 @@ async def websocket_live_endpoint(websocket: WebSocket):
332
 
333
  # Handle turn completion (model finished speaking)
334
  if server_content.turn_complete:
 
335
  await websocket.send_json({"type": "turn_complete"})
336
  except Exception as e:
337
- print(f"[WebSocket Proxy Gemini -> Client] Error: {e}")
338
 
339
- # Run both tasks concurrently
340
- await asyncio.gather(receive_from_client(), send_to_client())
 
 
 
 
 
 
 
 
 
341
 
342
  except Exception as e:
343
- print(f"WebSocket Gemini Live connection failed: {e}")
344
  finally:
 
345
  try:
346
  await websocket.close()
347
  except Exception:
 
247
  detail=f"An error occurred: {str(e)}"
248
  )
249
 
 
250
  @app.websocket("/api/live-ws")
251
  async def websocket_live_endpoint(websocket: WebSocket):
252
  await websocket.accept()
 
257
  await websocket.close(code=4000, reason="GEMINI_API_KEY is missing.")
258
  return
259
 
260
+ def ws_log(msg: str):
261
+ log_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ws_debug.log")
262
+ try:
263
+ with open(log_file, "a", encoding="utf-8") as f:
264
+ f.write(f"[{datetime.now().strftime('%H:%M:%S.%f')[:-3]}] {msg}\n")
265
+ except Exception:
266
+ pass
267
+
268
+ ws_log("Client WebSocket connected. Initializing Live connection...")
269
+
270
  try:
271
  from google import genai
272
  from google.genai import types
273
+ except ImportError as e:
274
+ ws_log(f"ImportError: google-genai not installed. {e}")
275
  await websocket.close(code=4001, reason="google-genai SDK not installed.")
276
  return
277
 
 
295
  try:
296
  # Establish async WebSocket connection to Gemini Live using the Gemini 3.1 Flash Live model
297
  async with client.aio.live.connect(model="gemini-3.1-flash-live-preview", config=config) as session:
298
+ ws_log("Successfully connected to Gemini Live session.")
299
 
300
  async def receive_from_client():
301
  try:
302
+ audio_chunk_count = 0
303
  while True:
304
  # Receive JSON from browser client
305
  message = await websocket.receive_json()
306
  msg_type = message.get("type")
307
 
308
  if msg_type == "audio":
309
+ audio_chunk_count += 1
310
+ if audio_chunk_count % 50 == 1:
311
+ ws_log(f"Received audio chunk {audio_chunk_count} from client.")
312
  # Decode base64 PCM audio chunk sent from frontend
313
  audio_bytes = base64.b64decode(message["data"])
314
  # Stream real-time audio (using 'audio' instead of deprecated 'media')
 
316
  audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
317
  )
318
  elif msg_type == "text":
319
+ ws_log(f"Received text query from client: {message['data']}")
320
  # Send real-time text input
321
  await session.send_realtime_input(text=message["data"])
322
  except WebSocketDisconnect:
323
+ ws_log("Client WebSocket disconnected (WebSocketDisconnect).")
324
  except Exception as e:
325
+ ws_log(f"[WebSocket Proxy Client -> Gemini] Error: {e}")
326
 
327
  async def send_to_client():
328
  try:
 
341
  })
342
  elif part.text is not None:
343
  # Stream text transcription back to client
344
+ ws_log(f"Streaming text chunk from Gemini: {part.text}")
345
  await websocket.send_json({
346
  "type": "text",
347
  "data": part.text
 
349
 
350
  # Handle turn completion (model finished speaking)
351
  if server_content.turn_complete:
352
+ ws_log("Gemini sent turn_complete.")
353
  await websocket.send_json({"type": "turn_complete"})
354
  except Exception as e:
355
+ ws_log(f"[WebSocket Proxy Gemini -> Client] Error: {e}")
356
 
357
+ # Run both tasks concurrently and terminate when the first one finishes
358
+ done, pending = await asyncio.wait(
359
+ [
360
+ asyncio.create_task(receive_from_client()),
361
+ asyncio.create_task(send_to_client())
362
+ ],
363
+ return_when=asyncio.FIRST_COMPLETED
364
+ )
365
+
366
+ for task in pending:
367
+ task.cancel()
368
 
369
  except Exception as e:
370
+ ws_log(f"WebSocket Gemini Live connection failed: {e}")
371
  finally:
372
+ ws_log("Closing WebSocket and cleaning up.")
373
  try:
374
  await websocket.close()
375
  except Exception:
frontend/src/components/VoiceSessionModal.jsx CHANGED
@@ -72,6 +72,11 @@ export default function VoiceSessionModal({ isOpen, onClose, apiKey }) {
72
  if (isMutedRef.current) return;
73
  if (statusRef.current === 'speaking' || statusRef.current === 'connecting') return;
74
 
 
 
 
 
 
75
  const inputData = e.inputBuffer.getChannelData(0);
76
  // Convert Float32 to Int16 PCM
77
  const pcmData = new Int16Array(inputData.length);
 
72
  if (isMutedRef.current) return;
73
  if (statusRef.current === 'speaking' || statusRef.current === 'connecting') return;
74
 
75
+ const pContext = playbackContextRef.current;
76
+ if (pContext && pContext.currentTime < nextPlayTimeRef.current) {
77
+ return;
78
+ }
79
+
80
  const inputData = e.inputBuffer.getChannelData(0);
81
  // Convert Float32 to Int16 PCM
82
  const pcmData = new Int16Array(inputData.length);