Voice-AI-Agent / frontend /script.js
rakib72642's picture
checkpoint 3 stable
496a69a
'use strict';
// ─── DOM refs ─────────────────────────────────────────────────────────────────
// Every handle is resolved once, when the script is evaluated. getElementById
// yields null for a missing id, so the two chained lookups below
// (micBtn.querySelector and voiceViz.querySelectorAll) throw at load time if
// #mic-btn or #voice-viz is absent from the page.

// Chat transcript, text input and the main action buttons.
const chatBox = document.getElementById('chat-box');
const sendBtn = document.getElementById('send-btn');
const textInput = document.getElementById('text-input');
const micBtn = document.getElementById('mic-btn');
const micLabel = micBtn.querySelector('.mic-label');
const stopBtn = document.getElementById('stop-btn');
// State indicator: setState() writes the label text and the dot's class.
const stateLabel = document.getElementById('state-label');
const stateDot = document.getElementById('state-dot');
const clearBtn = document.getElementById('clear-btn');
// Brain-mode toggle, caption line, stage and its STT/TTS bubbles.
const brainBtn = document.getElementById('brain-mode-btn');
const voiceCaption = document.getElementById('voice-caption');
const brainStage = document.getElementById('brain-stage');
const brainBubbleStt = document.getElementById('brain-bubble-stt');
const brainBubbleTts = document.getElementById('brain-bubble-tts');
const brainBubbleSttText = document.getElementById('brain-bubble-stt-text');
const brainBubbleTtsText = document.getElementById('brain-bubble-tts-text');
// Visualisers: vizBars are driven by vizTick(), queueBars/chunksCount by _vizQ().
const voiceViz = document.getElementById('voice-viz');
const vizBars = Array.from(voiceViz.querySelectorAll('.viz-bar'));
const queueBars = Array.from(document.querySelectorAll('.queue-bar'));
const chunksCount = document.getElementById('chunks-count');
// Layout chrome.
const sidebarEl = document.getElementById('sidebar');
const sidebarToggle = document.getElementById('sidebar-toggle');
const mobileMenuBtn = document.getElementById('mobile-menu-btn');
const appEl = document.getElementById('app');
// Settings controls — presumably bound to SILENCE_DB / SILENCE_MS and the TTS
// voice further down the file; TODO confirm, the wiring is not in this section.
const sThreshold = document.getElementById('s-threshold');
const sThresholdVal = document.getElementById('s-threshold-val');
const sTimeout = document.getElementById('s-timeout');
const sTimeoutVal = document.getElementById('s-timeout-val');
const sVoice = document.getElementById('s-voice');
// Latency read-outs: the message handlers and enqueueAudio() write "<n> ms" here.
const mStt = document.getElementById('m-stt');
const mLlm = document.getElementById('m-llm');
const mTts = document.getElementById('m-tts');
const mTotal = document.getElementById('m-total');
// Connection badge updated by _setSysStatus().
const sysStat = document.getElementById('sys-status');
// ─── Ephemeral user identity ──────────────────────────────────────────────────
// The id is computed once per script evaluation, so every page load (including
// a reload) gets a fresh one. Shape: "u_" + 16 hex chars when crypto.randomUUID
// exists, otherwise "u_<base36 timestamp>_<base36 random>".
const USER_ID = (() => {
  const cryptoApi = window.crypto;
  const hasUuid =
    Boolean(cryptoApi) && typeof cryptoApi.randomUUID === 'function';
  if (!hasUuid) {
    const stamp = Date.now().toString(36);
    const noise = Math.random().toString(36).slice(2, 10);
    return 'u_' + stamp + '_' + noise;
  }
  const hex = cryptoApi.randomUUID().replace(/-/g, '');
  return 'u_' + hex.slice(0, 16);
})();
// ─── WebSocket base URL ────────────────────────────────────────────────────────
// Ordered list of candidate "ws(s)://host" prefixes: the page's own host first
// (when it has one), then fixed local fallbacks. Duplicates and empty entries
// are dropped, keeping the first occurrence.
const WS_BASES = (() => {
  const scheme = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
  const pageHost = window.location.host;
  const candidates = [
    pageHost && pageHost !== 'null' ? `${scheme}//${pageHost}` : '',
    `${scheme}//127.0.0.1:8000`,
    `${scheme}//127.0.0.1:8679`,
    `${scheme}//localhost:8000`,
    `${scheme}//localhost:8679`,
  ];
  return candidates.filter(
    (base, position) => base && candidates.indexOf(base) === position,
  );
})();
// Index into WS_BASES of the base currently in use (see _wsBase/_advanceWsBase).
let _wsBaseIndex = 0;
console.log('[Boot] WS bases:', WS_BASES.join(', '));
// ─── WS handles ───────────────────────────────────────────────────────────────
// Current sockets (null until first connect) plus per-socket reconnect
// bookkeeping: the retry counters feed _backoff(), the timers hold the pending
// reconnect setTimeout handles.
let chatWS = null;
let voiceWS = null;
let _chatRetry = 0;
let _voiceRetry = 0;
let _chatRetryTimer = null;
let _voiceRetryTimer = null;
// ─── VAD / recording settings ─────────────────────────────────────────────────
// SILENCE_MS: how long the level must stay below SILENCE_DB before an
// utterance is finalised. SILENCE_DB: level (dB) above which a VAD sample
// counts as speech. VAD_MS: vadTick polling period.
let SILENCE_MS = 800; // default; user-adjustable in UI
let SILENCE_DB = -30;
const VAD_MS = 60;
const MIN_SPEECH_MS = 320; // discard noise bursts shorter than this
// ─── Playback state ───────────────────────────────────────────────────────────
// _ctx: playback AudioContext, created lazily by _ctxEnsure() and dropped by stopAllAudio().
let _ctx = null;
// _schedEnd: AudioContext time at which the last scheduled TTS buffer ends.
let _schedEnd = 0;
// _endTimer: handle of the timer armed by _scheduleEnd() to call _done().
let _endTimer = null;
// _cancelled: set by stopAllAudio(); enqueueAudio() refuses work while it is true.
let _cancelled = false;
// _inFlight: buffers currently being decoded or played (shown by _vizQ()).
let _inFlight = 0;
// _ttsPlaying: set when TTS audio frames arrive, cleared by _done()/stopAllAudio().
let _ttsPlaying = false;
// _activeSources: started AudioBufferSourceNodes, so they can be stopped early.
let _activeSources = [];
// Barge-in timing (ms since epoch): when loud speech was first seen, and when
// the last interrupt fired (used for the 500 ms debounce in _bargeInNow()).
let _bargeInArmedAt = 0;
let _bargeInFiredAt = 0;
// _dropAudioUntil: binary frames arriving before this timestamp are discarded.
let _dropAudioUntil = 0;
// _audioChain: promise chain that serialises enqueueAudio() calls.
let _audioChain = Promise.resolve();
// _playbackGen: bumped by stopAllAudio(); queued chain steps compare against it.
let _playbackGen = 0;
// Reordering state for framed audio: next sequence number to play, frames that
// arrived early (seq → ArrayBuffer), and the turn id whose audio is accepted.
let _expectedSeq = 0;
let _pendingAudio = new Map();
let _currentTurn = 0;
// Client-side playback speed multiplier.
// This makes speech faster immediately even if the TTS provider speed setting
// is limited/ignored. 1.0 = normal, >1.0 = faster.
// enqueueAudio() clamps the value to the range 0.85–2.0 before applying it.
let TTS_PLAYBACK_RATE = 1.0;
// ─── Brain mode state ─────────────────────────────────────────────────────────
// brainMode: UI mode flag; brainVoiceActive: continuous listening is wanted
// (cleared by the Stop button).
let brainMode = false;
let brainVoiceActive = false;
// NOTE(review): brainRestartTimer and brainPendingAudio are only cleared in
// this section; their producers live further down the file.
let brainRestartTimer = null;
// brainAutoRestartTimer: timer armed by _done() to resume listening in brain mode.
let brainAutoRestartTimer = null;
let brainPendingAudio = null;
// voicePendingPackets: recorded utterances queued while the voice socket is not open.
let voicePendingPackets = [];
// brainLastResponse: latest normalised AI text, shown in the brain TTS bubble.
let brainLastResponse = '';
// _brainWelcomed: ensures the welcome is triggered at most once per page load.
let _brainWelcomed = false;
// Lookup from Bengali (U+09E6–U+09EF) and Arabic-Indic (U+0660–U+0669) digit
// glyphs to their ASCII equivalents. Each script's ten digits are contiguous
// code points, so the table is generated from the code point of its zero.
const VISIBLE_DIGIT_MAP = Object.fromEntries(
  [0x09e6, 0x0660].flatMap((zeroCodePoint) =>
    Array.from({ length: 10 }, (_, value) => [
      String.fromCodePoint(zeroCodePoint + value),
      String(value),
    ]),
  ),
);
/**
 * Replace Bengali and Arabic-Indic digits in `text` with ASCII digits.
 * Falsy input (null, undefined, '', 0) produces an empty string; anything
 * else is stringified first.
 *
 * @param {*} text - Value to convert.
 * @returns {string} The text with digits mapped through VISIBLE_DIGIT_MAP.
 */
function _toAsciiDigits(text) {
  const source = String(text || '');
  return source.replace(/[০-৯٠-٩]/g, (glyph) => {
    const ascii = VISIBLE_DIGIT_MAP[glyph];
    // Keep the glyph untouched if the table has no entry for it.
    return ascii ? ascii : glyph;
  });
}
/**
 * Prepare AI text for display: swap two fixed Bengali words for their
 * preferred spellings, then convert non-ASCII digits via _toAsciiDigits().
 *
 * @param {*} text - Raw text; any falsy value yields ''.
 * @returns {string} Display-ready text.
 */
function _normalizeVisibleAiText(text) {
  if (!text) return '';
  const substitutions = [
    ['উপলব্ধ', 'এভেলেবেল'],
    ['জ্বি', 'আচ্ছা'],
  ];
  let visible = String(text);
  for (const [needle, replacement] of substitutions) {
    visible = visible.split(needle).join(replacement);
  }
  return _toAsciiDigits(visible);
}
// Greeting text for brain mode. NOTE(review): it is not used in this section;
// presumably _brainSendWelcome() sends or speaks it — confirm further down.
const BRAIN_WELCOME_TEXT =
'[calm] হ্যালো, আমি আয়েশা! হাসপাতাল রিসেপশন থেকে বলছি। আপনি কি কোনো অ্যাপয়েন্টমেন্ট বুক করতে চান?';
// ─── Recording state ──────────────────────────────────────────────────────────
// Microphone capture objects, all created in startListening()/startRecorder()
// and released in _teardownMicHardware().
let micStream = null;
let analyserCtx = null;
let analyser = null;
let mediaRecorder = null;
// Blob chunks delivered by the active MediaRecorder.
let audioChunks = [];
// isListening: mic open and VAD running. isSpeaking: VAD currently inside an
// utterance. isProcessing / isRecordingLocked: a turn is in progress and new
// turns are blocked until _done() (or a reset) clears them.
let isListening = false;
let isSpeaking = false;
let isProcessing = false;
let isRecordingLocked = false;
// Timer/interval handles: end-of-utterance timeout, vadTick and vizTick pollers.
let silenceTimer = null;
let vadInt = null;
let vizInt = null;
// Wall-clock ms at which the current utterance started (see MIN_SPEECH_MS).
let _speechStartMs = 0;
// MIME type used for the recorder and for the Blob built from its chunks.
let _recorderMime = 'audio/webm';
// ─── AI streaming bubble state ────────────────────────────────────────────────
// aiEl: chat bubble currently receiving the streamed reply; aiTxt: text
// accumulated for it; thinkingEl: the "three dots" placeholder bubble.
let aiEl = null;
let aiTxt = '';
let thinkingEl = null;
// Pending requestAnimationFrame id (0 = none) and the text it will paint.
let _captionRaf = 0;
let _captionText = '';
// ─── Latency timestamps ───────────────────────────────────────────────────────
// Date.now() values, 0 meaning "not reached yet": request sent, transcript
// received, first LLM token, first TTS buffer decoded.
let tSend = 0,
tStt = 0,
tLlm = 0,
tTts = 0;
/**
 * Start the app: open both WebSockets, reveal the root element and put the
 * state indicator into 'ready'.
 */
function boot() {
  initWebSockets();
  const { classList } = appEl;
  classList.add('visible');
  setState('ready');
}
// ═══════════════════════════════════════════════════════════════════════════════
// WEBSOCKETS
// ═══════════════════════════════════════════════════════════════════════════════
/**
 * Exponential reconnect delay: 1 s doubled per retry, capped at 16 s.
 *
 * @param {number} r - Number of retries already made.
 * @returns {number} Delay in milliseconds.
 */
function _backoff(r) {
  const CAP_MS = 16000;
  const BASE_MS = 1000;
  return Math.min(BASE_MS * 2 ** r, CAP_MS);
}
/**
 * Currently selected WebSocket base URL.
 *
 * @returns {string|undefined} WS_BASES[_wsBaseIndex], with the index clamped
 *   to the last entry; falls back to the first entry when that slot is empty.
 */
function _wsBase() {
  const [firstBase] = WS_BASES;
  const clampedIndex = Math.min(_wsBaseIndex, WS_BASES.length - 1);
  return WS_BASES[clampedIndex] || firstBase;
}
/**
 * Move to the next candidate base URL, wrapping around at the end of the
 * list. With zero or one candidate nothing changes.
 *
 * @returns {string|undefined} The base URL now in effect.
 */
function _advanceWsBase() {
  const total = WS_BASES.length;
  if (total > 1) {
    _wsBaseIndex = (_wsBaseIndex + 1) % total;
    console.log('[WS] Switching base to:', _wsBase());
  }
  return _wsBase();
}
/**
 * Update the connection badge text and colour. Does nothing when the badge
 * element is missing.
 *
 * @param {boolean} online - Truthy shows "Ready" (green), falsy "Reconnecting" (yellow).
 */
function _setSysStatus(online) {
  if (!sysStat) return;
  let text = 'Reconnecting';
  let badge = 'badge-yellow';
  if (online) {
    text = 'Ready';
    badge = 'badge-green';
  }
  sysStat.textContent = text;
  sysStat.className = 'status-badge ' + badge;
}
/**
 * Open the chat WebSocket unless one is already connecting or open, and keep
 * it alive: every close schedules a reconnect after _backoff(_chatRetry).
 *
 * The base URL is rotated (_advanceWsBase) only when the socket closed
 * without ever opening. A base that has worked is retried first, so a short
 * server restart does not send the client through dead fallback hosts.
 * Handlers close over their own socket instance, so a late event from an old
 * socket never acts on its replacement.
 */
function _connectChat() {
  if (chatWS && chatWS.readyState <= WebSocket.OPEN) return;
  const ws = new WebSocket(`${_wsBase()}/ws/chat`);
  let everOpened = false;
  chatWS = ws;
  ws.onopen = () => {
    everOpened = true;
    _chatRetry = 0;
    console.log('[Chat WS] connected');
    ws.send(JSON.stringify({ type: 'init', user_id: USER_ID }));
  };
  ws.onerror = (e) => console.error('[Chat WS] error:', e);
  ws.onclose = (ev) => {
    console.log(`[Chat WS] closed (${ev.code})`);
    // Only give up on this base if it never accepted the connection.
    if (!everOpened) _advanceWsBase();
    clearTimeout(_chatRetryTimer);
    _chatRetryTimer = setTimeout(() => {
      _chatRetry++;
      _connectChat();
    }, _backoff(_chatRetry));
  };
  ws.onmessage = onChatMsg;
}
/**
 * Open the voice WebSocket (binary frames as ArrayBuffer) unless one is
 * already connecting or open, and keep it alive with backoff reconnects.
 *
 * On open: announce the user id, mark the system online and flush anything
 * queued while offline. On close: mark the system offline, abort any voice
 * turn in progress, schedule a reconnect and, in active brain mode, queue the
 * brain reconnect.
 *
 * The base URL is rotated only when the socket closed without ever opening,
 * so a base that has worked is retried first after a transient drop.
 */
function _connectVoice() {
  if (voiceWS && voiceWS.readyState <= WebSocket.OPEN) return;
  const ws = new WebSocket(`${_wsBase()}/ws/voice`);
  let everOpened = false;
  voiceWS = ws;
  ws.binaryType = 'arraybuffer';
  ws.onopen = () => {
    everOpened = true;
    _voiceRetry = 0;
    console.log('[Voice WS] connected, uid:', USER_ID);
    ws.send(JSON.stringify({ type: 'init', user_id: USER_ID }));
    _setSysStatus(true);
    _flushVoicePendingPackets();
    _flushBrainPendingAudio();
  };
  ws.onerror = (e) => console.error('[Voice WS] error:', e);
  ws.onclose = (ev) => {
    console.log(`[Voice WS] closed (${ev.code})`);
    _setSysStatus(false);
    // A turn cannot complete without the socket: release mic and locks.
    if (isListening || isSpeaking || isProcessing) {
      _teardownMicHardware();
      _resetVoiceState();
      setState('ready');
      setMic('off');
      micBtn.disabled = false;
    }
    clearTimeout(_voiceRetryTimer);
    // Only give up on this base if it never accepted the connection.
    if (!everOpened) _advanceWsBase();
    _voiceRetryTimer = setTimeout(() => {
      _voiceRetry++;
      _connectVoice();
    }, _backoff(_voiceRetry));
    if (brainMode && brainVoiceActive) {
      _queueBrainReconnect();
    }
  };
  ws.onmessage = onVoiceMsg;
}
/** Open the chat socket, then the voice socket. */
function initWebSockets() {
  for (const connect of [_connectChat, _connectVoice]) {
    connect();
  }
}
// ── Chat WS handler ───────────────────────────────────────────────────────────
/**
 * Handle one message from the chat WebSocket. Payloads that are not valid
 * JSON are ignored.
 *
 * Message types:
 *   llm_token — append `token` to the streaming AI bubble (created on demand);
 *               the first token also records the LLM latency.
 *   chat      — replace the bubble's text with `text`.
 *   end       — finish the turn: final render, clear bubble state, show the
 *               total latency, return to 'ready'.
 *   error     — show `text` as a system message and return to 'ready'.
 *
 * @param {MessageEvent} ev - WebSocket message event carrying a JSON string.
 */
function onChatMsg(ev) {
  let msg;
  try {
    msg = JSON.parse(ev.data);
  } catch {
    return;
  }
  console.log('[Chat WS]', msg.type);

  // Create the AI bubble the first time text arrives for this turn.
  const ensureBubble = () => {
    if (aiEl) return;
    aiEl = document.createElement('div');
    aiEl.className = 'message ai';
    chatBox.appendChild(aiEl);
  };
  // Forget the current bubble, clear the caption and unlock the UI.
  const finishTurn = () => {
    isProcessing = false;
    setState('ready');
  };
  const dropBubble = () => {
    aiEl = null;
    aiTxt = '';
    _setCaption('');
  };

  const kind = msg.type;
  if (kind === 'llm_token') {
    if (!msg.token) return;
    if (tLlm === 0) {
      tLlm = Date.now();
      if (tSend > 0) mLlm.textContent = tLlm - tSend + ' ms';
    }
    _removeThinking();
    ensureBubble();
    aiTxt += msg.token;
    _renderAiText();
  } else if (kind === 'chat') {
    if (!msg.text) return;
    _removeThinking();
    ensureBubble();
    aiTxt = msg.text;
    _renderAiText();
  } else if (kind === 'end') {
    _removeThinking();
    _renderAiText(true);
    dropBubble();
    if (tSend > 0) mTotal.textContent = Date.now() - tSend + ' ms';
    tSend = tStt = tLlm = tTts = 0;
    finishTurn();
  } else if (kind === 'error') {
    _removeThinking();
    appendMsg('⚠️ ' + msg.text, 'system');
    dropBubble();
    finishTurn();
  }
}
// ── Voice WS handler ──────────────────────────────────────────────────────────
/**
 * Handle one message from the voice WebSocket.
 *
 * Binary frames carry TTS audio: a 4-byte big-endian turn id, a 4-byte
 * big-endian sequence number, then the encoded audio. Frames are dropped
 * while the post-cancel drop window is active, when they have no payload, or
 * when they belong to a turn other than _currentTurn. Accepted frames are
 * buffered by sequence number and handed to enqueueAudio() strictly in order
 * through the _audioChain promise chain.
 *
 * Text frames are JSON control messages: init_ack, stt, llm_token, llm_full,
 * end, error, pong. Invalid JSON and non-object payloads are ignored.
 *
 * Differences from the previous revision:
 *  - _ttsPlaying is set only after a frame has been accepted, so a stale or
 *    short frame can no longer leave it stuck at true.
 *  - llm_full resets the audio ordering state only when it announces a turn
 *    different from the current one. Resetting for the current turn threw
 *    away buffered frames and rewound the expected sequence number, which
 *    stalled the remaining audio of that turn.
 *
 * @param {MessageEvent} ev - WebSocket message event (ArrayBuffer or JSON string).
 */
function onVoiceMsg(ev) {
  if (ev.data instanceof ArrayBuffer) {
    if (Date.now() < _dropAudioUntil) return; // drop late packets after cancel
    if (ev.data.byteLength <= 8) return; // header only, nothing to play
    const header = new DataView(ev.data, 0, 8);
    const turn = header.getUint32(0); // big-endian by default
    const seq = header.getUint32(4);
    if (turn !== (_currentTurn >>> 0)) return; // late packet from an older turn
    _ttsPlaying = true;
    _pendingAudio.set(seq, ev.data.slice(8));
    const gen = _playbackGen;
    // Release every frame that is now contiguous with what has been played.
    while (_pendingAudio.has(_expectedSeq)) {
      const playBuf = _pendingAudio.get(_expectedSeq);
      _pendingAudio.delete(_expectedSeq);
      _audioChain = _audioChain
        .catch(() => {})
        .then(() => {
          // Skip work queued before a stop/barge-in.
          if (gen !== _playbackGen || _cancelled) return undefined;
          return enqueueAudio(playBuf);
        });
      _expectedSeq++;
    }
    return;
  }

  let msg;
  try {
    msg = JSON.parse(ev.data);
  } catch {
    return;
  }
  if (msg === null || typeof msg !== 'object') return;
  console.log('[Voice WS]', msg.type);

  // Create the chat bubble for the streamed reply on first use.
  const ensureAiBubble = () => {
    if (aiEl) return;
    aiEl = document.createElement('div');
    aiEl.className = 'message ai';
    chatBox.appendChild(aiEl);
  };
  // Forget buffered audio and restart sequence numbering.
  const resetAudioOrdering = () => {
    _expectedSeq = 0;
    _pendingAudio.clear();
  };

  switch (msg.type) {
    case 'init_ack':
      console.log('[Voice WS] ack uid:', msg.user_id);
      break;
    case 'stt':
      // New turn: reset audio ordering/buffers.
      if (typeof msg.turn === 'number') _currentTurn = msg.turn >>> 0;
      resetAudioOrdering();
      tStt = Date.now();
      if (tSend > 0) mStt.textContent = tStt - tSend + ' ms';
      _removeThinking();
      if (!brainMode) appendMsg('🎤 ' + msg.text, 'user');
      aiEl = null;
      aiTxt = '';
      _setCaption('');
      _brainSetSttBubble(msg.text);
      if (brainMode) _brainSetTtsBubble(brainLastResponse || '', false);
      _brainModeSetSearch(true);
      appendThinking();
      setState('processing');
      break;
    case 'llm_token': {
      if (!msg.token) break;
      const tokenText = _normalizeVisibleAiText(msg.token);
      if (tLlm === 0) {
        tLlm = Date.now();
        if (tStt > 0) mLlm.textContent = tLlm - tStt + ' ms';
      }
      _removeThinking();
      // Re-normalise the whole text so words split across tokens are caught.
      aiTxt = _normalizeVisibleAiText(aiTxt + tokenText);
      _setCaption(aiTxt);
      brainLastResponse = aiTxt;
      _brainSetTtsBubble(brainLastResponse);
      _brainModeSetSearch(true);
      if (!brainMode) {
        ensureAiBubble();
        _renderAiText();
      }
      break;
    }
    case 'llm_full':
      if (!msg.text) break;
      // Best-effort recovery path: if any streamed tokens were dropped, the
      // server sends the final full text once at turn end.
      if (typeof msg.turn === 'number') {
        const announcedTurn = msg.turn >>> 0;
        if (announcedTurn !== (_currentTurn >>> 0)) {
          _currentTurn = announcedTurn;
          resetAudioOrdering();
        }
      }
      brainLastResponse = _normalizeVisibleAiText(msg.text);
      aiTxt = brainLastResponse;
      _brainSetTtsBubble(brainLastResponse);
      if (!brainMode) {
        ensureAiBubble();
        _renderAiText();
      }
      break;
    case 'end':
      // In brain mode we don't stream tokens into chat UI, so append a final
      // transcript line at turn end.
      if (brainMode && aiTxt) appendMsg(aiTxt, 'ai');
      _renderAiText(true);
      _removeThinking();
      if (brainMode) brainLastResponse = aiTxt || brainLastResponse;
      aiEl = null;
      aiTxt = '';
      _setCaption('');
      resetAudioOrdering();
      if (tSend > 0) mTotal.textContent = Date.now() - tSend + ' ms';
      tSend = tStt = tLlm = tTts = 0;
      isProcessing = false;
      // _done() must run only after the scheduled TTS audio has drained;
      // _scheduleEnd() arms that timer (short delay when nothing is queued).
      _scheduleEnd();
      break;
    case 'error':
      _removeThinking();
      appendMsg('⚠️ ' + msg.text, 'system');
      aiEl = null;
      aiTxt = '';
      _setCaption('');
      resetAudioOrdering();
      _brainSetTtsBubble('', false);
      _brainModeSetSearch(false);
      isProcessing = false;
      // Always release the locks on error.
      _done();
      break;
    case 'pong':
      break;
    default:
      console.log('[Voice WS] unknown:', msg.type);
  }
}
// ─── Thinking bubble ──────────────────────────────────────────────────────────
/**
 * Show the animated "thinking" placeholder at the bottom of the chat.
 * No-op in brain mode or when a placeholder is already shown.
 */
function appendThinking() {
  if (brainMode || thinkingEl) return;
  const bubble = document.createElement('div');
  bubble.className = 'message ai thinking';
  bubble.innerHTML =
    '<span class="dot"></span><span class="dot"></span><span class="dot"></span>';
  thinkingEl = bubble;
  chatBox.appendChild(bubble);
  chatBox.scrollTop = chatBox.scrollHeight;
}
/** Remove the "thinking" placeholder from the chat, if one is shown. */
function _removeThinking() {
  if (!thinkingEl) return;
  const bubble = thinkingEl;
  bubble.remove();
  thinkingEl = null;
}
/**
 * Render the accumulated AI text (aiTxt) into the current AI bubble (aiEl)
 * and scroll the chat to the bottom.
 *
 * With the `marked` library loaded the text is rendered as Markdown;
 * otherwise it is shown as plain text with newlines turned into <br>. The
 * plain-text path HTML-escapes the text first: it comes from the model and
 * was previously written into innerHTML unescaped.
 *
 * NOTE(review): marked.parse() output is not sanitised here. If the model
 * output can contain attacker-influenced HTML, sanitise it before assigning
 * to innerHTML.
 *
 * @param {boolean} [force=false] - When true and there is no text, clear the bubble.
 */
function _renderAiText(force = false) {
  if (!aiEl) return;
  if (!aiTxt) {
    if (force) aiEl.innerHTML = '';
    return;
  }
  if (typeof marked !== 'undefined') {
    aiEl.innerHTML = marked.parse(aiTxt);
  } else {
    const HTML_ESCAPES = {
      '&': '&amp;',
      '<': '&lt;',
      '>': '&gt;',
      '"': '&quot;',
      "'": '&#39;',
    };
    aiEl.innerHTML = aiTxt
      .replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch])
      .replace(/\n/g, '<br>');
  }
  chatBox.scrollTop = chatBox.scrollHeight;
}
/**
 * Set the voice caption. Updates are coalesced: the normalised text is stored
 * immediately, but the DOM is written at most once per animation frame, using
 * whatever text is current when the frame fires. In brain mode the caption is
 * always blanked.
 *
 * @param {string} text - Caption text ('' clears it).
 */
function _setCaption(text) {
  _captionText = _normalizeVisibleAiText(text);
  const paint = () => {
    _captionRaf = 0;
    if (voiceCaption) {
      voiceCaption.textContent = brainMode ? '' : _captionText;
    }
  };
  // A frame is already pending; it will pick up the new text.
  if (!_captionRaf) {
    _captionRaf = requestAnimationFrame(paint);
  }
}
// ═══════════════════════════════════════════════════════════════════════════════
// AUDIO PLAYBACK
// ═══════════════════════════════════════════════════════════════════════════════
/**
 * Return a usable playback AudioContext, creating one when none exists or the
 * previous one was closed (which also resets the playback schedule). A
 * suspended context is asked to resume.
 *
 * @returns {AudioContext} The shared playback context.
 */
function _ctxEnsure() {
  const needsFreshContext = !_ctx || _ctx.state === 'closed';
  if (needsFreshContext) {
    const ContextCtor = window.AudioContext || window.webkitAudioContext;
    _ctx = new ContextCtor();
    _schedEnd = 0;
  }
  if (_ctx.state === 'suspended') {
    _ctx.resume();
  }
  return _ctx;
}
/**
 * Stop and disconnect every active TTS source node and empty the list.
 * Each node's onended handler is detached first so stopping it does not run
 * the normal end-of-buffer bookkeeping. Errors from nodes that are already
 * stopped or disconnected are ignored.
 */
function _stopAllSources() {
  const drained = _activeSources.splice(0, _activeSources.length);
  drained.forEach((node) => {
    try {
      node.onended = null;
      node.stop(0);
    } catch {}
    try {
      node.disconnect();
    } catch {}
  });
}
/**
 * Decode one encoded TTS chunk and schedule it to play directly after the
 * previously scheduled chunk (gapless queue on the shared AudioContext).
 *
 * The playback generation is captured on entry and re-checked after the
 * asynchronous decode. stopAllAudio() bumps the generation, and a barge-in
 * clears `_cancelled` again immediately afterwards, so checking `_cancelled`
 * alone let a chunk that was mid-decode during the interruption play on top
 * of the next turn. A stale chunk now returns without touching `_inFlight`,
 * which stopAllAudio() has already reset.
 *
 * @param {ArrayBuffer} buf - Encoded audio; copied before decoding because
 *   decodeAudioData detaches the buffer it is given.
 * @returns {Promise<void>} Resolves once the chunk is scheduled or skipped.
 */
async function enqueueAudio(buf) {
  if (_cancelled) return;
  const gen = _playbackGen;
  const releaseSlot = () => {
    _inFlight = Math.max(0, _inFlight - 1);
    _vizQ();
  };
  _inFlight++;
  _vizQ();
  const ctx = _ctxEnsure();
  let decoded;
  try {
    decoded = await ctx.decodeAudioData(buf.slice(0));
  } catch (e) {
    if (gen !== _playbackGen) return; // interrupted while decoding
    console.warn('[Audio] decode error:', e.message);
    releaseSlot();
    return;
  }
  if (gen !== _playbackGen) return; // interrupted while decoding
  if (!decoded || decoded.duration < 0.001 || _cancelled) {
    releaseSlot();
    return;
  }
  // First audible chunk of the turn: record TTS latency relative to the LLM.
  if (tTts === 0 && tLlm > 0) {
    tTts = Date.now();
    mTts.textContent = tTts - tLlm + ' ms';
  }
  const src = ctx.createBufferSource();
  src.buffer = decoded;
  try {
    src.playbackRate.value = Math.max(0.85, Math.min(2.0, TTS_PLAYBACK_RATE));
  } catch {}
  src.connect(ctx.destination);
  const GAP_S = 0.001;
  // Never schedule in the past; otherwise append right after the queue's end.
  const start = Math.max(ctx.currentTime + 0.01, _schedEnd + GAP_S);
  _activeSources.push(src);
  src.start(start);
  let rate = 1.0;
  try {
    rate = src.playbackRate.value || 1.0;
  } catch {}
  // Wall-clock duration shrinks as the playback rate grows.
  _schedEnd = start + decoded.duration / Math.max(0.01, rate);
  src.onended = () => {
    releaseSlot();
    const idx = _activeSources.indexOf(src);
    if (idx >= 0) _activeSources.splice(idx, 1);
  };
  setState('speaking');
}
/**
 * Refresh the playback-queue visualiser: the chunk counter shows _inFlight,
 * and the first _inFlight bars are marked active with a random height of
 * 12–42 px while the remaining bars collapse to 4 px.
 */
function _vizQ() {
  if (chunksCount) chunksCount.textContent = _inFlight;
  for (const [slot, bar] of queueBars.entries()) {
    const busy = slot < _inFlight;
    bar.classList.toggle('active', busy);
    const px = busy ? 12 + Math.random() * 30 : 4;
    bar.style.height = px + 'px';
  }
}
/**
 * Arm the timer that calls _done() once the scheduled TTS audio has drained.
 *
 * The delay is the audio still queued on the playback context plus a 300 ms
 * margin; with no live context (or nothing queued) it is just the margin.
 * _done() is always scheduled, regardless of `_cancelled`, because it is what
 * releases the recording lock.
 *
 * The handle is stored in `_endTimer` on every path. The no-context path
 * used to leave its timer untracked, so stopAllAudio() or a later
 * _scheduleEnd() could not cancel it and a stray _done() could unlock the UI
 * in the middle of the next turn.
 */
function _scheduleEnd() {
  clearTimeout(_endTimer);
  const ctx = _ctx;
  const hasLiveContext = Boolean(ctx) && ctx.state !== 'closed';
  const remainingMs = hasLiveContext
    ? Math.max(0, (_schedEnd - ctx.currentTime) * 1000)
    : 0;
  _endTimer = setTimeout(_done, remainingMs + 300);
}
/**
 * _done — return the voice pipeline to idle.
 *
 * Clears the TTS/processing/lock flags, resets the brain-mode search and TTS
 * bubble indicators, zeroes the playback-queue display, re-enables the mic
 * button and shows the 'ready' / mic 'off' state.
 *
 * Outside brain mode recording is never restarted from here. While brain mode
 * with continuous voice is active, listening is resumed on the next
 * macrotask, provided nothing else has started or locked in the meantime.
 */
function _done() {
  _ttsPlaying = isProcessing = isRecordingLocked = false;
  _brainModeSetSearch(false);
  _brainSetTtsBubble(brainLastResponse || '', false);
  _inFlight = 0;
  _vizQ();
  micBtn.disabled = false;
  setState('ready');
  setMic('off');
  if (brainMode && brainVoiceActive) {
    clearTimeout(brainAutoRestartTimer);
    brainAutoRestartTimer = setTimeout(() => {
      // Re-check at fire time: the user may have left brain mode or a new
      // turn may already be running.
      const stillWanted = brainMode && brainVoiceActive;
      const busy = isListening || isProcessing || isRecordingLocked;
      if (stillWanted && !busy) _brainResumeListening();
    }, 0);
  }
  console.log('[Voice] Idle — ready for next manual press');
}
/**
 * Hard-stop TTS playback and tell the server to cancel the current turn.
 *
 * Marks playback cancelled, opens a 120 ms window in which incoming audio
 * frames are dropped (later stale frames are rejected by their turn id),
 * bumps the playback generation so queued chain steps become no-ops, clears
 * the reorder buffer, stops all source nodes, cancels the end-of-turn timer,
 * resets the schedule and queue display, closes the AudioContext (a new one
 * is created on demand) and sends {type:'cancel'} if the voice socket is open.
 */
function stopAllAudio() {
  _cancelled = true;
  _ttsPlaying = false;
  _dropAudioUntil = Date.now() + 120;
  _playbackGen += 1;
  _audioChain = Promise.resolve();
  _expectedSeq = 0;
  _pendingAudio.clear();
  _stopAllSources();

  clearTimeout(_endTimer);
  _endTimer = null;
  _schedEnd = 0;
  _inFlight = 0;
  _vizQ();

  // Closing the context releases anything still scheduled on it.
  const liveCtx = _ctx && _ctx.state !== 'closed' ? _ctx : null;
  if (liveCtx) {
    liveCtx.close().catch(() => {});
  }
  _ctx = null;

  const socketOpen = voiceWS && voiceWS.readyState === WebSocket.OPEN;
  if (socketOpen) {
    voiceWS.send(JSON.stringify({ type: 'cancel' }));
  }
}
/**
 * Interrupt the AI because the user started talking.
 *
 * Debounced to one interrupt per 500 ms. Stops all audio, releases the
 * processing/recording locks straight away (and clears the cancelled flag so
 * the next turn's audio can play), drops the current AI bubble and caption,
 * and re-enables the mic button. In continuous brain mode the already-open
 * mic is simply re-armed; otherwise a fresh listening session is started.
 *
 * @param {string} [reason='speech'] - Label used in the log line.
 */
function _bargeInNow(reason = 'speech') {
  const firedAt = Date.now();
  const sinceLastMs = firedAt - _bargeInFiredAt;
  if (sinceLastMs < 500) return; // debounce
  _bargeInFiredAt = firedAt;
  console.log('[BargeIn] interrupt:', reason);

  stopAllAudio();

  // Unlock immediately so the user can speak right away.
  isProcessing = isRecordingLocked = _cancelled = false;
  aiEl = null;
  aiTxt = '';
  _setCaption('');
  _removeThinking();
  micBtn.disabled = false;

  const micAlreadyWarm = brainMode && brainVoiceActive;
  if (!micAlreadyWarm) {
    // Cold start: the user initiated the turn by speaking.
    startListening().catch(() => {});
    return;
  }
  // Mic and analyser are still running; the next VAD tick with speech starts
  // the recording.
  _brainModeSetSearch(false);
  _brainResumeListening();
}
// ═══════════════════════════════════════════════════════════════════════════════
// TEXT CHAT
// ═══════════════════════════════════════════════════════════════════════════════
// Send on button click, or on Enter in the text input (Shift+Enter is left
// alone so it can insert a newline).
sendBtn.onclick = sendText;
textInput.addEventListener('keydown', (e) => {
  if (e.key !== 'Enter' || e.shiftKey) return;
  // An Enter that confirms an IME composition (common with Bengali input
  // methods) must not submit the half-typed message. keyCode 229 covers
  // browsers that report the composition key without setting isComposing.
  if (e.isComposing || e.keyCode === 229) return;
  // Without this, the Enter keystroke is still applied after sendText() has
  // cleared the field: a stray newline in a textarea, or a form submit.
  e.preventDefault();
  sendText();
});
/**
 * Send the trimmed content of the text input as a chat query.
 * Ignored when the input is blank or a turn is already being processed.
 * Otherwise the message is echoed into the chat, the input is cleared, the
 * per-turn bubble and latency state is reset, the UI enters 'processing'
 * with a thinking placeholder, and the text is handed to _sendViaChat().
 */
function sendText() {
  const query = textInput.value.trim();
  if (isProcessing || !query) return;

  appendMsg(query, 'user');
  textInput.value = '';

  _cancelled = false;
  isProcessing = true;
  tSend = Date.now();
  tLlm = 0;
  tTts = 0;
  aiEl = null;
  aiTxt = '';

  setState('processing');
  appendThinking();
  _sendViaChat(query);
}
/**
 * Send a user query over the chat WebSocket. If the socket is not open yet,
 * poll every 300 ms until it is and send then; the payload is built once, up
 * front.
 *
 * @param {string} text - The user's query.
 */
function _sendViaChat(text) {
  const payload = JSON.stringify({ user_id: USER_ID, user_query: text });
  const attempt = () => {
    const socketOpen = chatWS && chatWS.readyState === WebSocket.OPEN;
    if (socketOpen) {
      chatWS.send(payload);
      return;
    }
    setTimeout(attempt, 300);
  };
  attempt();
}
// ═══════════════════════════════════════════════════════════════════════════════
// MICROPHONE / VAD
// ═══════════════════════════════════════════════════════════════════════════════
// Mic button: toggles listening.
//  - Ignored while a turn is processing or the recorder is locked.
//  - While listening in continuous brain mode it does nothing (Stop exits).
//  - While listening otherwise, it releases the mic and returns to idle.
//  - When idle, it starts listening.
micBtn.onclick = async () => {
  const busy = isRecordingLocked || isProcessing;
  if (busy) {
    console.log('[Mic] Ignored — system busy');
    return;
  }
  if (!isListening) {
    await startListening();
    return;
  }
  if (brainMode && brainVoiceActive) {
    console.log('[Brain] Continuous mode active — use Stop to exit');
    return;
  }
  _teardownMicHardware();
  _resetVoiceState();
  setState('ready');
  setMic('off');
};
// Stop button: leave continuous voice mode and halt everything. Cancels the
// brain restart timers and pending brain audio, stops playback, releases the
// mic if it is in use, resets the voice flags and returns the UI to idle.
stopBtn.onclick = () => {
  brainVoiceActive = false;
  for (const timer of [brainAutoRestartTimer, brainRestartTimer]) {
    clearTimeout(timer);
  }
  brainPendingAudio = null;

  stopAllAudio();
  const micInUse = isListening || isSpeaking;
  if (micInUse) _teardownMicHardware();
  _resetVoiceState();

  setState('ready');
  setMic('off');
  micBtn.disabled = false;
};
// ── startListening ────────────────────────────────────────────────────────────
/**
 * Open the microphone and start voice-activity detection.
 *
 * Requests a mono 16 kHz stream with echo cancellation, noise suppression and
 * auto gain, connects it to an AnalyserNode on a dedicated AudioContext, and
 * starts the VAD and visualiser pollers. Does nothing when already listening,
 * processing or locked. A permission/device failure is reported in the chat.
 *
 * Fixes over the previous revision:
 *  - Re-entrancy: the guard is checked before `await getUserMedia`, so two
 *    quick calls both passed it. The second call overwrote micStream and the
 *    interval handles, leaking a live mic stream and two pollers that nothing
 *    could stop. The listening flag is now re-checked after the await and the
 *    redundant stream is released.
 *  - If the AudioContext/analyser setup throws, the freshly opened stream is
 *    released instead of being left running with no owner.
 *
 * @returns {Promise<void>}
 */
async function startListening() {
  if (isListening || isProcessing || isRecordingLocked) return;
  _ctxEnsure();
  let stream;
  try {
    stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
        channelCount: 1,
        sampleRate: 16000,
      },
    });
  } catch (err) {
    console.error('[Mic] getUserMedia failed:', err);
    appendMsg('⚠️ মাইক্রোফোন অ্যাক্সেস দেওয়া হয়নি।', 'system');
    return;
  }
  const releaseStream = () => stream.getTracks().forEach((t) => t.stop());

  // Another call finished opening the mic while this one was waiting.
  if (isListening) {
    releaseStream();
    return;
  }

  let ctx = null;
  let node = null;
  try {
    ctx = new AudioContext({ sampleRate: 16000 });
    const src = ctx.createMediaStreamSource(stream);
    node = ctx.createAnalyser();
    node.fftSize = 512;
    node.smoothingTimeConstant = 0.6;
    src.connect(node);
  } catch (err) {
    console.error('[Mic] analyser setup failed:', err);
    releaseStream();
    if (ctx && ctx.state !== 'closed') ctx.close().catch(() => {});
    return;
  }

  micStream = stream;
  analyserCtx = ctx;
  analyser = node;
  isListening = true;
  audioChunks = [];
  setMic('listening');
  setState('listening');
  voiceViz.classList.add('active');
  vadInt = setInterval(vadTick, VAD_MS);
  vizInt = setInterval(vizTick, 60);
  console.log('[Mic] Listening started');
}
// ── _teardownMicHardware ──────────────────────────────────────────────────────
/**
 * Release everything attached to the microphone: the VAD/visualiser pollers,
 * the silence timer, the MediaRecorder, the stream's tracks and the analyser
 * AudioContext. The visualiser bars are collapsed to their resting height.
 *
 * A recorder that is still running has its callbacks replaced with no-ops
 * before being stopped, so no onstop logic runs after a forced teardown.
 * audioChunks is deliberately left untouched: an onstop handler that called
 * this function has already copied the chunks it needs.
 */
function _teardownMicHardware() {
  for (const poller of [vadInt, vizInt]) {
    clearInterval(poller);
  }
  clearTimeout(silenceTimer);
  vadInt = null;
  vizInt = null;
  silenceTimer = null;

  const recorderRunning = mediaRecorder && mediaRecorder.state !== 'inactive';
  if (recorderRunning) {
    const noop = () => {};
    mediaRecorder.ondataavailable = noop;
    mediaRecorder.onstop = () => {};
    mediaRecorder.stop();
  }
  mediaRecorder = null;

  if (micStream !== null && micStream !== undefined) {
    micStream.getTracks().forEach((track) => track.stop());
  }
  micStream = null;

  const contextOpen = analyserCtx && analyserCtx.state !== 'closed';
  if (contextOpen) {
    analyserCtx.close().catch(() => {});
  }
  analyserCtx = null;
  analyser = null;

  voiceViz.classList.remove('active');
  for (const bar of vizBars) {
    bar.style.height = '4px';
  }
  console.log('[Mic] Hardware torn down');
}
// ── _resetVoiceState ──────────────────────────────────────────────────────────
/**
 * Clear every voice-turn flag (listening, speaking, processing, lock, TTS
 * playing), the speech start time and the recorded chunks. Touches no
 * hardware.
 */
function _resetVoiceState() {
  isListening = isSpeaking = isProcessing = isRecordingLocked = _ttsPlaying = false;
  _speechStartMs = 0;
  audioChunks = [];
}
// ── VAD tick ──────────────────────────────────────────────────────────────────
/**
 * One voice-activity-detection sample; run every VAD_MS by a setInterval.
 *
 * Measures the RMS level of the analyser's current time-domain window and
 * treats anything above SILENCE_DB as speech.
 *  - Speech while idle starts a recording.
 *  - Speech while the AI is busy (brain mode only) must be 4 dB louder than
 *    the normal threshold and last at least 90 ms before it interrupts the AI
 *    via _bargeInNow(); the same tick then falls through and starts recording.
 *  - Silence while recording arms the SILENCE_MS end-of-utterance timer; any
 *    further speech cancels it.
 */
function vadTick() {
if (!analyser) return;
// In brain mode we allow "barge-in": user speech interrupts TTS playback.
// In non-brain mode we still keep the hard lock to prevent overlapping turns.
if (!brainMode && (isProcessing || isRecordingLocked)) return;
const buf = new Float32Array(analyser.frequencyBinCount);
analyser.getFloatTimeDomainData(buf);
// RMS of the window, expressed in dB relative to a sample amplitude of 1.0.
let sum = 0;
for (let i = 0; i < buf.length; i++) sum += buf[i] * buf[i];
// `|| 1e-10` keeps log10 finite when the window is pure silence (RMS 0).
const db = 20 * Math.log10(Math.sqrt(sum / buf.length) || 1e-10);
const speech = db > SILENCE_DB;
if (speech) {
// ── Barge-in detector ────────────────────────────────────────────────
if (
brainMode &&
brainVoiceActive &&
(_ttsPlaying || isProcessing || isRecordingLocked)
) {
// Stricter threshold reduces false triggers from echo + noise.
const loud = db > SILENCE_DB + 4;
if (loud) {
// Arm on the first loud sample; fire once it has stayed loud for 90 ms.
if (!_bargeInArmedAt) _bargeInArmedAt = Date.now();
if (Date.now() - _bargeInArmedAt >= 90) {
_bargeInArmedAt = 0;
_bargeInNow(_ttsPlaying ? 'vad_tts' : 'vad_thinking');
// After barge-in unlock, continue into the normal recording start
// path in this same tick.
} else {
// Don't start recording until we confirm it is real barge-in speech.
return;
}
} else {
// Above the speech threshold but not loud enough: disarm and ignore.
_bargeInArmedAt = 0;
return;
}
}
// Speech is ongoing, so any pending end-of-utterance timeout is void.
clearTimeout(silenceTimer);
silenceTimer = null;
if (!isSpeaking) {
if (mediaRecorder && mediaRecorder.state !== 'inactive') return; // duplicate guard
isSpeaking = true;
_speechStartMs = Date.now();
_cancelled = false;
_ctxEnsure();
startRecorder();
setMic('recording');
setState('recording');
console.log('[VAD] Speech detected — recording');
}
} else {
_bargeInArmedAt = 0;
// First silent sample after speech: start the end-of-utterance countdown.
if (isSpeaking && !silenceTimer) {
silenceTimer = setTimeout(_onSilenceTimeout, SILENCE_MS);
}
}
}
// ── _onSilenceTimeout ─────────────────────────────────────────────────────────
/**
 * Fired when the speaker has been silent for SILENCE_MS.
 *
 * Utterances shorter than MIN_SPEECH_MS are treated as noise: the recording
 * is discarded and listening continues (the pollers are restarted if they
 * were stopped). Otherwise the utterance is finalised: the turn is locked,
 * the send timestamp is taken and the recorder is stopped, whose onstop
 * handler ships the audio. In continuous brain mode the VAD keeps running and
 * the UI stays in 'listening' so a barge-in can still be detected; outside
 * brain mode the pollers stop and the mic button is disabled while
 * processing.
 */
function _onSilenceTimeout() {
  silenceTimer = null;
  const spokenMs = Date.now() - _speechStartMs;

  if (spokenMs < MIN_SPEECH_MS) {
    console.log(`[VAD] Too short (${spokenMs} ms) — discard & resume listening`);
    isSpeaking = false;
    discardRecorder();
    // Make sure the pollers are running so listening actually continues.
    if (isListening && !vadInt) {
      vadInt = setInterval(vadTick, VAD_MS);
      vizInt = setInterval(vizTick, 60);
    }
    setMic('listening');
    setState('listening');
    return;
  }

  console.log(`[VAD] Silence after ${spokenMs} ms — finalising utterance`);
  const keepBrainMicWarm = brainMode && brainVoiceActive;
  if (!keepBrainMicWarm) {
    for (const poller of [vadInt, vizInt]) clearInterval(poller);
    vadInt = null;
    vizInt = null;
  }

  // All flags must be in place before stopRecorder(): its onstop handler can
  // run almost immediately.
  isSpeaking = false;
  isListening = keepBrainMicWarm; // mic stays "hot" in brain mode
  isProcessing = true;
  isRecordingLocked = true;
  _cancelled = false;
  tSend = Date.now();
  tLlm = 0;
  tTts = 0;

  const uiMode = keepBrainMicWarm ? 'listening' : 'processing';
  micBtn.disabled = !keepBrainMicWarm;
  setMic(uiMode);
  setState(uiMode);
  stopRecorder();
}
// ── Viz tick ──────────────────────────────────────────────────────────────────
/**
 * Refresh the microphone visualiser from the analyser's current spectrum.
 * The bars sample the frequency bins at an even stride; a bar's height scales
 * to at most 48 px while the user is speaking and 18 px otherwise, and never
 * drops below 4 px. No-op without an analyser.
 */
function vizTick() {
  if (!analyser) return;
  const spectrum = new Uint8Array(analyser.frequencyBinCount);
  analyser.getByteFrequencyData(spectrum);
  const stride = Math.floor(spectrum.length / vizBars.length);
  for (let i = 0; i < vizBars.length; i++) {
    const level = spectrum[i * stride] / 255;
    const ceilingPx = isSpeaking ? 48 : 18;
    vizBars[i].style.height = Math.max(4, level * ceilingPx) + 'px';
  }
}
// ── MediaRecorder ─────────────────────────────────────────────────────────────
/**
 * Start a MediaRecorder on the open mic stream for one utterance.
 *
 * The container/codec is the first supported entry of a preference list
 * (WebM/Opus, WebM, MP4, Ogg/Opus). If none is reported as supported, the
 * recorder is created without a mimeType so the browser picks its default.
 * The previous code fell back to 'audio/webm' unconditionally; on browsers
 * without WebM recording the constructor threw, the catch reset isSpeaking,
 * and vadTick retried on every tick without ever recording.
 * `_recorderMime` always reflects the type actually in use.
 *
 * No-op without a mic stream or while a recorder is already active. If the
 * recorder cannot be created, the UI returns to 'listening'.
 *
 * When the recorder stops (see stopRecorder), its onstop handler:
 *  1. copies the collected chunks into a local array and clears the shared
 *     one. This must happen first so nothing that runs afterwards can empty
 *     the chunks before the Blob is built;
 *  2. releases the mic hardware, unless continuous brain mode keeps it open;
 *  3. aborts the turn when no audio was captured or it cannot be read;
 *  4. sends the audio over the voice socket, or queues it and reconnects if
 *     the socket is not open. After a successful send, isProcessing and
 *     isRecordingLocked stay set until _done() runs.
 */
function startRecorder() {
  if (!micStream) return;
  if (mediaRecorder && mediaRecorder.state !== 'inactive') {
    console.warn('[Recorder] Duplicate startRecorder() — ignored');
    return;
  }
  audioChunks = [];

  const preferredTypes = [
    'audio/webm;codecs=opus',
    'audio/webm',
    'audio/mp4',
    'audio/ogg;codecs=opus',
  ];
  const supportedType = preferredTypes.find((type) =>
    MediaRecorder.isTypeSupported(type),
  );
  try {
    mediaRecorder = supportedType
      ? new MediaRecorder(micStream, { mimeType: supportedType })
      : new MediaRecorder(micStream);
  } catch (err) {
    console.error('[Recorder] Creation failed:', err);
    isSpeaking = false;
    setMic('listening');
    setState('listening');
    return;
  }
  _recorderMime = supportedType || mediaRecorder.mimeType || 'audio/webm';

  mediaRecorder.ondataavailable = (e) => {
    if (e.data && e.data.size > 0) audioChunks.push(e.data);
  };

  mediaRecorder.onstop = async () => {
    // ── 1. Capture chunks locally (MUST be first) ──────────────────────────
    const captured = audioChunks.slice();
    audioChunks = [];
    const keepBrainMicWarm = brainMode && brainVoiceActive;

    // Shared exit for every path that cannot hand the utterance to the
    // server right now: unlock, restore the idle UI and, in brain mode,
    // go straight back to listening.
    const abortTurn = () => {
      _resetVoiceState();
      setState(keepBrainMicWarm ? 'listening' : 'ready');
      setMic('off');
      micBtn.disabled = false;
      if (keepBrainMicWarm) _brainResumeListening();
    };

    // ── 2. Tear down mic hardware unless brain mode wants a live loop ─────
    if (keepBrainMicWarm) {
      mediaRecorder = null;
    } else {
      _teardownMicHardware();
    }
    setMic('off');
    console.log(
      `[Recorder] onstop: ${captured.length} chunk(s), ${captured
        .reduce((s, c) => s + c.size, 0)
        .toLocaleString()} bytes total`,
    );

    // ── 3. Validate ────────────────────────────────────────────────────────
    if (!captured.length) {
      console.warn('[Recorder] No audio chunks — possible threshold issue');
      appendMsg(
        '⚠️ কোনো অডিও রেকর্ড হয়নি। Silence threshold কমিয়ে দেখুন।',
        'system',
      );
      abortTurn();
      return;
    }

    // ── 4. Build ArrayBuffer ───────────────────────────────────────────────
    const blob = new Blob(captured, { type: _recorderMime });
    let buf;
    try {
      buf = await blob.arrayBuffer();
    } catch (err) {
      console.error('[Recorder] arrayBuffer() error:', err);
      abortTurn();
      return;
    }
    console.log(`[VAD] → voice WS: ${buf.byteLength.toLocaleString()} bytes`);

    // ── 5. Send to backend ─────────────────────────────────────────────────
    if (voiceWS && voiceWS.readyState === WebSocket.OPEN) {
      appendThinking();
      voiceWS.send(buf);
      // isProcessing + isRecordingLocked stay true until _done() fires
    } else {
      console.warn('[VAD] Voice WS not open — queueing utterance');
      voicePendingPackets.push(buf);
      _connectVoice();
      abortTurn();
    }
  };

  mediaRecorder.start();
  console.log('[Recorder] Started, mime:', _recorderMime);
}
/**
 * Stop the active MediaRecorder, if any. Its onstop handler then runs
 * asynchronously and processes the recorded audio.
 */
function stopRecorder() {
  const recorder = mediaRecorder;
  if (!recorder || recorder.state === 'inactive') return;
  recorder.stop();
}
/**
 * Throw away the current recording. A running recorder has its data callback
 * silenced and its onstop replaced by one that only empties the chunk list,
 * then it is stopped and forgotten. The chunk list is emptied in every case.
 */
function discardRecorder() {
  const recorder = mediaRecorder;
  const running = Boolean(recorder) && recorder.state !== 'inactive';
  if (running) {
    recorder.ondataavailable = () => {};
    recorder.onstop = () => {
      audioChunks = [];
    };
    recorder.stop();
    mediaRecorder = null;
  }
  audioChunks = [];
}
// ═══════════════════════════════════════════════════════════════════════════════
// UI HELPERS
// ═══════════════════════════════════════════════════════════════════════════════
// UI state → status label text and the modifier class for the state dot
// ('' means no modifier).
const STATE_MAP = {
  ready: { label: 'প্রস্তুত', cls: '' },
  listening: { label: 'শুনছি…', cls: 'listening' },
  recording: { label: 'রেকর্ড হচ্ছে…', cls: 'recording' },
  processing: { label: 'প্রক্রিয়া করছে…', cls: 'processing' },
  speaking: { label: 'AI বলছে…', cls: 'speaking' },
};
/**
 * Show a pipeline state in the status label and dot, and mirror the raw state
 * name on the brain stage's data-state attribute when that element exists.
 *
 * @param {string} s - State key; unknown keys are displayed as 'ready' (the
 *   brain stage still receives the raw value).
 */
function setState(s) {
  const { label, cls } = STATE_MAP[s] || STATE_MAP.ready;
  stateLabel.textContent = label;
  stateDot.className = cls ? 'state-dot' + ' ' + cls : 'state-dot';
  if (brainStage) {
    brainStage.dataset.state = s;
  }
}
// Mic button appearance per mode: CSS modifier class, caption and icon.
const MIC_MAP = {
  off: { cls: 'mic-off', label: 'Press to Start talking', icon: '🎤' },
  listening: { cls: 'mic-listening', label: 'Listening...', icon: '🟢' },
  recording: { cls: 'mic-recording', label: 'Listening..', icon: '🔴' },
  processing: { cls: 'mic-processing', label: 'Please wait !!!', icon: '⏳' },
};
/**
 * Restyle the mic button for the given mode: class, caption and icon.
 *
 * @param {string} s - Mode key; unknown keys are shown as 'off'.
 */
function setMic(s) {
  const { cls, label, icon } = MIC_MAP[s] || MIC_MAP.off;
  micBtn.className = `mic-btn ${cls}`;
  micLabel.textContent = label;
  const iconEl = micBtn.querySelector('.mic-icon');
  iconEl.textContent = icon;
}
/**
 * Append a message bubble to the chat and scroll to the bottom.
 *
 * In brain mode user messages are not shown (the brain panel is the UI), but
 * AI and system messages still are. AI messages are rendered as Markdown when
 * the `marked` library is loaded; everything else is inserted as plain text.
 *
 * @param {string} text - Message text; normalised for display first.
 * @param {string} who - Sender class: 'user', 'ai' or 'system'.
 * @returns {HTMLElement|null} The new bubble, or null when the message was suppressed.
 */
function appendMsg(text, who) {
  const suppressed = brainMode && who === 'user';
  if (suppressed) return null;

  const bubble = document.createElement('div');
  bubble.className = 'message ' + who;
  const visibleText = _normalizeVisibleAiText(text);
  const asMarkdown = who === 'ai' && typeof marked !== 'undefined';
  if (asMarkdown) {
    bubble.innerHTML = marked.parse(visibleText || '');
  } else {
    bubble.textContent = visibleText;
  }
  chatBox.appendChild(bubble);
  chatBox.scrollTop = chatBox.scrollHeight;
  return bubble;
}
// Clear button: empty the transcript and forget the thinking placeholder.
// Outside brain mode a system note ("Chat has been cleared.") confirms it.
clearBtn.onclick = () => {
  chatBox.innerHTML = '';
  thinkingEl = null;
  if (brainMode) return;
  appendMsg('চ্যাট পরিষ্কার করা হয়েছে।', 'system');
};

// Brain-mode button: flip the current mode.
brainBtn.onclick = () => {
  const next = !brainMode;
  setBrainMode(next);
};

// Sidebar toggle: collapse/expand and point the chevron the matching way.
sidebarToggle.onclick = () => {
  const nowCollapsed = sidebarEl.classList.toggle('collapsed');
  sidebarToggle.textContent = nowCollapsed ? '›' : '‹';
};

// Mobile menu button: slide the sidebar in or out.
mobileMenuBtn.onclick = () => sidebarEl.classList.toggle('mobile-open');
/**
 * Switch brain mode on or off and bring the UI, the voice socket and the
 * microphone loop in line with it.
 *
 * Always: stores the flag, toggles the `brain-mode` body class and the
 * button's `active` / `aria-pressed` state, updates the stage's
 * `aria-hidden`, clears the voice caption and sends a `brain_mode` control
 * packet.
 *
 * Entering: primes both bubbles, collapses the sidebar, scrolls the chat to
 * the bottom, blurs the text input, shows the "searching" state while the
 * agent is busy, schedules the welcome at most once per page load, and
 * schedules a mic resume when the mic is idle.
 *
 * Leaving: deactivates brain voice, cancels the restart timers, drops any
 * pending audio, expands the sidebar and resets both bubbles.
 *
 * @param {*} on - Truthy to enable brain mode, falsy to disable it.
 */
function setBrainMode(on) {
  brainMode = Boolean(on);

  document.body.classList.toggle('brain-mode', brainMode);
  brainBtn.classList.toggle('active', brainMode);
  brainBtn.setAttribute('aria-pressed', `${brainMode}`);
  if (brainStage) brainStage.setAttribute('aria-hidden', `${!brainMode}`);
  if (voiceCaption) voiceCaption.textContent = '';
  _sendVoiceControl({ type: 'brain_mode', enabled: brainMode });

  if (!brainMode) {
    // ── Leaving brain mode ──
    brainVoiceActive = false;
    clearTimeout(brainAutoRestartTimer);
    clearTimeout(brainRestartTimer);
    brainPendingAudio = null;
    sidebarEl.classList.remove('collapsed');
    sidebarToggle.textContent = '‹';
    _brainModeSetSearch(false);
    _brainSetSttBubble('');
    _brainSetTtsBubble('', false);
    return;
  }

  // ── Entering brain mode ──
  brainBubbleSttText.textContent = 'Listening…';
  brainBubbleTtsText.textContent =
    _normalizeVisibleAiText(brainLastResponse) || 'Waiting…';
  brainVoiceActive = true;
  sidebarEl.classList.add('collapsed');
  sidebarToggle.textContent = '›';
  chatBox.scrollTop = chatBox.scrollHeight;
  textInput.blur();
  const agentActive = isProcessing || isListening || isSpeaking || _ttsPlaying;
  _brainModeSetSearch(agentActive);

  // One-time welcome per page load. The flag is set before the deferred
  // check, so a welcome skipped because the agent was busy is not retried.
  if (!_brainWelcomed) {
    _brainWelcomed = true;
    setTimeout(() => {
      const stillActive = brainMode && brainVoiceActive;
      const agentBusy = isProcessing || isSpeaking || _ttsPlaying;
      if (stillActive && !agentBusy) _brainSendWelcome();
    }, 0);
  }

  // Re-arm the mic on the next tick; conditions are re-checked there because
  // state may change in between.
  const micIdle = () => !isListening && !isProcessing && !isRecordingLocked;
  if (micIdle()) {
    setTimeout(() => {
      if (brainMode && brainVoiceActive && micIdle()) _brainResumeListening();
    }, 0);
  }
}
/**
 * Send a control message over the voice WebSocket, or queue it when the
 * socket cannot take it right now.
 *
 * The payload is serialised to JSON. If the socket is open the packet is sent
 * immediately; if the socket is missing, not open, or the send throws, the
 * packet is pushed onto `voicePendingPackets` and `_connectVoice()` is called.
 *
 * @param {object} payload - JSON-serialisable control message.
 */
function _sendVoiceControl(payload) {
  const packet = JSON.stringify(payload);
  if (voiceWS && voiceWS.readyState === WebSocket.OPEN) {
    try {
      voiceWS.send(packet);
      return;
    } catch (err) {
      // Still best-effort (the packet is queued below), but record the
      // failure like the other senders do instead of hiding it.
      console.error('[Voice] control send failed:', err);
    }
  }
  voicePendingPackets.push(packet);
  _connectVoice();
}
/**
 * Ask the server to speak the brain-mode welcome text.
 *
 * Builds a `speak` packet from BRAIN_WELCOME_TEXT. With an open socket the
 * thinking indicator is shown and the packet sent; if the socket is not open
 * or the send throws, the packet is queued on `voicePendingPackets` and
 * `_connectVoice()` is called so it can go out later.
 */
function _brainSendWelcome() {
  const welcomePacket = JSON.stringify({
    type: 'speak',
    text: BRAIN_WELCOME_TEXT,
  });
  const queueForLater = () => {
    voicePendingPackets.push(welcomePacket);
    _connectVoice();
  };

  const socketOpen = Boolean(voiceWS) && voiceWS.readyState === WebSocket.OPEN;
  if (!socketOpen) {
    // Socket missing or still (re)connecting: defer the welcome.
    queueForLater();
    return;
  }

  try {
    appendThinking();
    voiceWS.send(welcomePacket);
    console.log('[Brain] welcome sent');
  } catch (err) {
    console.error('[Brain] welcome send failed:', err);
    queueForLater();
  }
}
/**
 * Turn the brain stage's `searching` class on or off.
 *
 * No-op when the brain stage element is absent.
 *
 * @param {*} active - Truthy adds the class, falsy removes it.
 */
function _brainModeSetSearch(active) {
  if (brainStage) {
    brainStage.classList.toggle('searching', Boolean(active));
  }
}
/**
 * Show recognised speech in the brain-mode STT bubble.
 *
 * The text is normalised and trimmed. An empty result shows the
 * 'Listening…' placeholder and clears the bubble's `active` class; otherwise
 * the text is shown and `active` is set. No-op when either bubble element is
 * missing.
 *
 * @param {string} text - Transcript text to display.
 */
function _brainSetSttBubble(text) {
  if (!(brainBubbleStt && brainBubbleSttText)) return;
  const shown = _normalizeVisibleAiText(text).trim();
  const hasText = Boolean(shown);
  brainBubbleSttText.textContent = hasText ? shown : 'Listening…';
  brainBubbleStt.classList.toggle('active', hasText);
}
/**
 * Show the AI response in the brain-mode TTS bubble.
 *
 * The text is normalised and trimmed; an empty result shows the 'Waiting…'
 * placeholder. The bubble is `active` when there is text or `active` is
 * truthy, and `speaking` only when `active` is truthy. No-op when either
 * bubble element is missing.
 *
 * @param {string} text - Response text to display.
 * @param {*} [active=true] - Whether speech output is currently in progress.
 */
function _brainSetTtsBubble(text, active = true) {
  if (!(brainBubbleTts && brainBubbleTtsText)) return;
  const shown = _normalizeVisibleAiText(text).trim();
  const isSpeakingNow = Boolean(active);
  brainBubbleTtsText.textContent = shown ? shown : 'Waiting…';
  const { classList } = brainBubbleTts;
  classList.toggle('active', Boolean(shown) || isSpeakingNow);
  classList.toggle('speaking', isSpeakingNow);
}
/**
 * Re-arm the microphone while brain mode is active.
 *
 * Bails out unless brain mode and brain voice are both on and the mic is
 * neither listening, processing nor recording-locked. When the mic stream,
 * analyser context and analyser all exist, listening restarts in place: the
 * flags and UI are updated and the VAD / visualiser intervals started.
 * Otherwise `startListening()` is called, with failures logged.
 */
function _brainResumeListening() {
  const brainLive = brainMode && brainVoiceActive;
  const micBusy = isListening || isProcessing || isRecordingLocked;
  if (!brainLive || micBusy) return;

  const pipelineReady = micStream && analyserCtx && analyser;
  if (!pipelineReady) {
    // No reusable audio pipeline: go through the full start-up path.
    startListening().catch((err) => {
      console.error('[Brain] resume failed:', err);
    });
    return;
  }

  // Warm path: reuse the existing stream/analyser and restart the pollers.
  isListening = true;
  setMic('listening');
  setState('listening');
  voiceViz.classList.add('active');
  vadInt = setInterval(vadTick, VAD_MS);
  vizInt = setInterval(vizTick, 60);
  _brainModeSetSearch(false);
  console.log('[Brain] Mic re-armed');
}
/**
 * Schedule another attempt to flush the pending brain-mode utterance.
 *
 * Only acts while brain mode and brain voice are active. Any previously
 * scheduled retry is cancelled, then `_flushBrainPendingAudio()` is scheduled
 * for 700 ms later; the activity check is repeated when the timer fires.
 */
function _queueBrainReconnect() {
  const brainLive = () => brainMode && brainVoiceActive;
  if (!brainLive()) return;

  clearTimeout(brainRestartTimer);
  brainRestartTimer = setTimeout(() => {
    if (brainLive()) _flushBrainPendingAudio();
  }, 700);
}
/**
 * Send every queued voice packet now that the socket is open.
 *
 * Does nothing when the socket is missing or not open, or the queue is empty.
 * Packets go out in queue order, and the thinking indicator is shown for each
 * packet sent. If a send fails, that packet and every packet after it are put
 * back at the front of the queue in their original order, `_connectVoice()`
 * is called and flushing stops.
 *
 * NOTE(review): the thinking indicator is also shown for queued control
 * packets (e.g. `brain_mode`), which presumably get no spoken reply —
 * confirm this is intended.
 */
function _flushVoicePendingPackets() {
  if (
    !voiceWS ||
    voiceWS.readyState !== WebSocket.OPEN ||
    !voicePendingPackets.length
  ) {
    return;
  }
  // Take ownership of the whole queue so anything enqueued while flushing
  // lands behind whatever gets re-queued on failure.
  const packets = voicePendingPackets.splice(0);
  for (const [index, packet] of packets.entries()) {
    try {
      voiceWS.send(packet);
      appendThinking();
      console.log('[Voice] queued packet flushed');
    } catch (err) {
      console.error('[Voice] flush failed:', err);
      // Re-queue the failed packet AND the unsent remainder; restoring only
      // the failed one would silently drop everything after it.
      voicePendingPackets.unshift(...packets.slice(index));
      _connectVoice();
      break;
    }
  }
}
/**
 * Send the utterance buffered in `brainPendingAudio`, if there is one.
 *
 * With no pending audio this is a no-op. If the socket is not open, a retry
 * is scheduled via `_queueBrainReconnect()` and the buffer kept. Otherwise
 * the buffer is cleared, the thinking indicator shown and the audio sent; on
 * a send error the buffer is restored and a retry scheduled.
 *
 * NOTE(review): unlike the other senders nearby, this path never calls
 * `_connectVoice()` itself; presumably reconnection is triggered elsewhere —
 * confirm.
 */
function _flushBrainPendingAudio() {
  const pending = brainPendingAudio;
  if (!pending) return;

  const socketOpen = voiceWS && voiceWS.readyState === WebSocket.OPEN;
  if (!socketOpen) {
    _queueBrainReconnect();
    return;
  }

  // Clear before sending so the same utterance cannot be flushed twice.
  brainPendingAudio = null;
  try {
    appendThinking();
    voiceWS.send(pending);
    console.log('[Brain] queued utterance flushed');
  } catch (err) {
    console.error('[Brain] flush failed:', err);
    brainPendingAudio = pending;
    _queueBrainReconnect();
  }
}
// Silence-threshold slider: seed it from SILENCE_DB, then write every change
// back to SILENCE_DB and mirror it in the read-out (shown in dB).
sThreshold.value = SILENCE_DB;
sThresholdVal.textContent = `${SILENCE_DB} dB`;
sThreshold.oninput = () => {
  SILENCE_DB = Number(sThreshold.value);
  sThresholdVal.textContent = `${SILENCE_DB} dB`;
};

// Silence-timeout slider: same wiring for SILENCE_MS (shown in ms).
sTimeout.value = SILENCE_MS;
sTimeoutVal.textContent = `${SILENCE_MS} ms`;
sTimeout.oninput = () => {
  SILENCE_MS = Number(sTimeout.value);
  sTimeoutVal.textContent = `${SILENCE_MS} ms`;
};

// Voice selector: announce the newly chosen TTS voice as a system message.
sVoice.onchange = () => appendMsg(`🔊 TTS voice: ${sVoice.value}`, 'system');

// Every 140 ms, call _vizQ() while at least one request is in flight
// (presumably to refresh the queue visualisation — confirm against _vizQ).
setInterval(() => {
  if (_inFlight > 0) {
    _vizQ();
  }
}, 140);
// ═══════════════════════════════════════════════════════════════════════════════
// BOOT
// ═══════════════════════════════════════════════════════════════════════════════
// Run the start-up routine once the wiring above has executed; boot() is
// defined outside this section of the file.
boot();