Spaces:

Scribbler310
/

SentimentDetectiontest

Running

App Files Community

3v324v23 committed about 3 hours ago

Commit

34529d9

1 Parent(s): 6f6528a

Prevent acoustic feedback loops in voice chat by ignoring mic input during playback, and terminate the WebSocket proxy loop immediately when the client connection drops

Browse files

Files changed (2) hide show

backend/main.py +36 -8
frontend/src/components/VoiceSessionModal.jsx +5 -0

backend/main.py CHANGED Viewed

@@ -247,7 +247,6 @@ def chat_endpoint(request: ChatRequest):
             detail=f"An error occurred: {str(e)}"
         )
-# WebSocket Endpoint for Socratic voice dialogue via Gemini Multimodal Live API
 @app.websocket("/api/live-ws")
 async def websocket_live_endpoint(websocket: WebSocket):
     await websocket.accept()
@@ -258,10 +257,21 @@ async def websocket_live_endpoint(websocket: WebSocket):
         await websocket.close(code=4000, reason="GEMINI_API_KEY is missing.")
         return
     try:
         from google import genai
         from google.genai import types
-    except ImportError:
         await websocket.close(code=4001, reason="google-genai SDK not installed.")
         return
@@ -285,15 +295,20 @@ async def websocket_live_endpoint(websocket: WebSocket):
     try:
         # Establish async WebSocket connection to Gemini Live using the Gemini 3.1 Flash Live model
         async with client.aio.live.connect(model="gemini-3.1-flash-live-preview", config=config) as session:
             async def receive_from_client():
                 try:
                     while True:
                         # Receive JSON from browser client
                         message = await websocket.receive_json()
                         msg_type = message.get("type")
                         if msg_type == "audio":
                             # Decode base64 PCM audio chunk sent from frontend
                             audio_bytes = base64.b64decode(message["data"])
                             # Stream real-time audio (using 'audio' instead of deprecated 'media')
@@ -301,12 +316,13 @@ async def websocket_live_endpoint(websocket: WebSocket):
                                 audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
                             )
                         elif msg_type == "text":
                             # Send real-time text input
                             await session.send_realtime_input(text=message["data"])
                 except WebSocketDisconnect:
-                    pass
                 except Exception as e:
-                    print(f"[WebSocket Proxy Client -> Gemini] Error: {e}")
             async def send_to_client():
                 try:
@@ -325,6 +341,7 @@ async def websocket_live_endpoint(websocket: WebSocket):
                                         })
                                     elif part.text is not None:
                                         # Stream text transcription back to client
                                         await websocket.send_json({
                                             "type": "text",
                                             "data": part.text
@@ -332,16 +349,27 @@ async def websocket_live_endpoint(websocket: WebSocket):
                             # Handle turn completion (model finished speaking)
                             if server_content.turn_complete:
                                 await websocket.send_json({"type": "turn_complete"})
                 except Exception as e:
-                    print(f"[WebSocket Proxy Gemini -> Client] Error: {e}")
-            # Run both tasks concurrently
-            await asyncio.gather(receive_from_client(), send_to_client())
     except Exception as e:
-        print(f"WebSocket Gemini Live connection failed: {e}")
     finally:
         try:
             await websocket.close()
         except Exception:

             detail=f"An error occurred: {str(e)}"
         )
 @app.websocket("/api/live-ws")
 async def websocket_live_endpoint(websocket: WebSocket):
     await websocket.accept()
         await websocket.close(code=4000, reason="GEMINI_API_KEY is missing.")
         return
+    def ws_log(msg: str):
+        log_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ws_debug.log")
+        try:
+            with open(log_file, "a", encoding="utf-8") as f:
+                f.write(f"[{datetime.now().strftime('%H:%M:%S.%f')[:-3]}] {msg}\n")
+        except Exception:
+            pass
+    ws_log("Client WebSocket connected. Initializing Live connection...")
     try:
         from google import genai
         from google.genai import types
+    except ImportError as e:
+        ws_log(f"ImportError: google-genai not installed. {e}")
         await websocket.close(code=4001, reason="google-genai SDK not installed.")
         return
     try:
         # Establish async WebSocket connection to Gemini Live using the Gemini 3.1 Flash Live model
         async with client.aio.live.connect(model="gemini-3.1-flash-live-preview", config=config) as session:
+            ws_log("Successfully connected to Gemini Live session.")
             async def receive_from_client():
                 try:
+                    audio_chunk_count = 0
                     while True:
                         # Receive JSON from browser client
                         message = await websocket.receive_json()
                         msg_type = message.get("type")
                         if msg_type == "audio":
+                            audio_chunk_count += 1
+                            if audio_chunk_count % 50 == 1:
+                                ws_log(f"Received audio chunk {audio_chunk_count} from client.")
                             # Decode base64 PCM audio chunk sent from frontend
                             audio_bytes = base64.b64decode(message["data"])
                             # Stream real-time audio (using 'audio' instead of deprecated 'media')
                                 audio=types.Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
                             )
                         elif msg_type == "text":
+                            ws_log(f"Received text query from client: {message['data']}")
                             # Send real-time text input
                             await session.send_realtime_input(text=message["data"])
                 except WebSocketDisconnect:
+                    ws_log("Client WebSocket disconnected (WebSocketDisconnect).")
                 except Exception as e:
+                    ws_log(f"[WebSocket Proxy Client -> Gemini] Error: {e}")
             async def send_to_client():
                 try:
                                         })
                                     elif part.text is not None:
                                         # Stream text transcription back to client
+                                        ws_log(f"Streaming text chunk from Gemini: {part.text}")
                                         await websocket.send_json({
                                             "type": "text",
                                             "data": part.text
                             # Handle turn completion (model finished speaking)
                             if server_content.turn_complete:
+                                ws_log("Gemini sent turn_complete.")
                                 await websocket.send_json({"type": "turn_complete"})
                 except Exception as e:
+                    ws_log(f"[WebSocket Proxy Gemini -> Client] Error: {e}")
+            # Run both tasks concurrently and terminate when the first one finishes
+            done, pending = await asyncio.wait(
+                [
+                    asyncio.create_task(receive_from_client()),
+                    asyncio.create_task(send_to_client())
+                ],
+                return_when=asyncio.FIRST_COMPLETED
+            )
+            for task in pending:
+                task.cancel()
     except Exception as e:
+        ws_log(f"WebSocket Gemini Live connection failed: {e}")
     finally:
+        ws_log("Closing WebSocket and cleaning up.")
         try:
             await websocket.close()
         except Exception:

frontend/src/components/VoiceSessionModal.jsx CHANGED Viewed

@@ -72,6 +72,11 @@ export default function VoiceSessionModal({ isOpen, onClose, apiKey }) {
           if (isMutedRef.current) return;
           if (statusRef.current === 'speaking' || statusRef.current === 'connecting') return;
           const inputData = e.inputBuffer.getChannelData(0);
           // Convert Float32 to Int16 PCM
           const pcmData = new Int16Array(inputData.length);

           if (isMutedRef.current) return;
           if (statusRef.current === 'speaking' || statusRef.current === 'connecting') return;
+          const pContext = playbackContextRef.current;
+          if (pContext && pContext.currentTime < nextPlayTimeRef.current) {
+            return;
+          }
           const inputData = e.inputBuffer.getChannelData(0);
           // Convert Float32 to Int16 PCM
           const pcmData = new Int16Array(inputData.length);