/**
 * WindsurfClient talks to the local language server binary via gRPC (HTTP/2).
 *
 * Two flows:
 *   Legacy:  RawGetChatMessage (streaming, for enum-only models)
 *   Cascade: StartCascade → SendUserCascadeMessage → poll (for modelUid models)
 */
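/*
 * Usage sketch. The enum values and model UID below are placeholders, not
 * values verified against this repo's model table; onChunk receives the
 * `{ text, thinking, isError }` chunks the methods below construct.
 *
 *   const client = new WindsurfClient(apiKey, port, csrfToken);
 *   // enum-only model:
 *   await client.rawGetChatMessage(messages, 210, undefined, {
 *     onChunk: (c) => process.stdout.write(c.text),
 *   });
 *   // modelUid model:
 *   await client.cascadeChat(messages, 300, 'some-model-uid', {
 *     onChunk: (c) => process.stdout.write(c.thinking || c.text),
 *   });
 */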
import https from 'https';
import { randomUUID } from 'crypto';
import { log } from './config.js';
import { grpcFrame, grpcUnary, grpcStream } from './grpc.js';
import { getLsEntryByPort } from './langserver.js';
import {
  buildRawGetChatMessageRequest, parseRawResponse,
  buildInitializePanelStateRequest,
  buildAddTrackedWorkspaceRequest,
  buildUpdateWorkspaceTrustRequest,
  buildStartCascadeRequest, parseStartCascadeResponse,
  buildSendCascadeMessageRequest,
  buildGetTrajectoryRequest, parseTrajectoryStatus,
  buildGetTrajectoryStepsRequest, parseTrajectorySteps,
  buildGetGeneratorMetadataRequest, parseGeneratorMetadata,
  buildGetUserStatusRequest, parseGetUserStatusResponse,
} from './windsurf.js';
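
// A minimal sketch of what `grpcFrame` presumably produces: the standard
// gRPC length-prefixed message framing (1 compression-flag byte + 4-byte
// big-endian payload length + the serialized protobuf). Illustration only;
// see grpc.js for the actual implementation.
//
//   function grpcFrameSketch(protoBytes) {
//     const header = Buffer.alloc(5);
//     header.writeUInt8(0, 0);                    // 0 = uncompressed
//     header.writeUInt32BE(protoBytes.length, 1); // payload byte length
//     return Buffer.concat([header, protoBytes]);
//   }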

const LS_SERVICE = '/exa.language_server_pb.LanguageServerService';

function contentToString(content) {
  if (typeof content === 'string') return content;
  if (Array.isArray(content)) {
    return content.map(p => (typeof p?.text === 'string' ? p.text : JSON.stringify(p))).join('');
  }
  return content == null ? '' : JSON.stringify(content);
}
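// Example: OpenAI message content may be a plain string or a parts array, so
//   contentToString([{ type: 'text', text: 'hi ' }, { type: 'text', text: 'there' }])
// returns 'hi there', and any part without a string `text` field falls back
// to its JSON form rather than being dropped.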

// ─── WindsurfClient ────────────────────────────────────────
export class WindsurfClient {
  /**
   * @param {string} apiKey - Codeium API key
   * @param {number} port - Language server gRPC port
   * @param {string} csrfToken - CSRF token for auth
   */
  constructor(apiKey, port, csrfToken) {
    this.apiKey = apiKey;
    this.port = port;
    this.csrfToken = csrfToken;
  }

  // ─── Legacy: RawGetChatMessage (streaming) ─────────────────
  /**
   * Stream chat via RawGetChatMessage.
   * Used for models without a string UID (generally enum < 280).
   *
   * @param {Array} messages - OpenAI-format messages
   * @param {number} modelEnum - Model enum value
   * @param {string} [modelName] - Optional model name
   * @param {object} opts - { onChunk, onEnd, onError }
   */
  rawGetChatMessage(messages, modelEnum, modelName, opts = {}) {
    const { onChunk, onEnd, onError } = opts;
    const proto = buildRawGetChatMessageRequest(this.apiKey, messages, modelEnum, modelName);
    const body = grpcFrame(proto);
    log.debug(`RawGetChatMessage: enum=${modelEnum} msgs=${messages.length}`);
    return new Promise((resolve, reject) => {
      const chunks = [];
      grpcStream(this.port, this.csrfToken, `${LS_SERVICE}/RawGetChatMessage`, body, {
        onData: (payload) => {
          try {
            const parsed = parseRawResponse(payload);
            if (parsed.text) {
              // Detect server-side errors returned as text
              const errMatch = /^(permission_denied|failed_precondition|not_found|unauthenticated):/.test(parsed.text.trim());
              if (parsed.isError || errMatch) {
                const err = new Error(parsed.text.trim());
                // Mark model-level errors so they don't count against the account
                err.isModelError = /permission_denied|failed_precondition/.test(parsed.text);
                reject(err);
                return;
              }
              chunks.push(parsed);
              onChunk?.(parsed);
            }
          } catch (e) {
            log.error('RawGetChatMessage parse error:', e.message);
          }
        },
        onEnd: () => {
          onEnd?.(chunks);
          resolve(chunks);
        },
        onError: (err) => {
          onError?.(err);
          reject(err);
        },
      });
    });
  }
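
  // Note: upstream failures on this RPC arrive as plain text on the stream
  // rather than as gRPC status codes, e.g. (illustrative wording only)
  // "permission_denied: model not available on this plan". The onData
  // handler above converts those into rejected Errors, with
  // isModelError=true for permission/precondition failures so callers can
  // rotate models instead of blaming the account.
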
  /**
   * Run (or wait for) the one-shot Cascade workspace init for this LS.
   * Idempotent: the LS entry caches the in-flight Promise so concurrent
   * callers share one init round. Safe to call from a startup warmup path
   * so the first real chat request skips these 3 gRPC round-trips.
   */
  warmupCascade(force = false) {
    const lsEntry = getLsEntryByPort(this.port);
    if (!lsEntry) return Promise.resolve();
    if (force) {
      lsEntry.workspaceInit = null;
      lsEntry.sessionId = randomUUID();
    }
    if (!lsEntry.sessionId) lsEntry.sessionId = randomUUID();
    if (lsEntry.workspaceInit) return lsEntry.workspaceInit;
    const sessionId = lsEntry.sessionId;
    const workspacePath = '/tmp/windsurf-workspace';
    const workspaceUri = 'file:///tmp/windsurf-workspace';
    lsEntry.workspaceInit = (async () => {
      try {
        const initProto = buildInitializePanelStateRequest(this.apiKey, sessionId);
        await grpcUnary(this.port, this.csrfToken,
          `${LS_SERVICE}/InitializeCascadePanelState`, grpcFrame(initProto), 5000);
      } catch (e) { log.warn(`InitializeCascadePanelState: ${e.message}`); }
      try {
        const addWsProto = buildAddTrackedWorkspaceRequest(this.apiKey, workspacePath, sessionId);
        await grpcUnary(this.port, this.csrfToken,
          `${LS_SERVICE}/AddTrackedWorkspace`, grpcFrame(addWsProto), 5000);
      } catch (e) { log.warn(`AddTrackedWorkspace: ${e.message}`); }
      try {
        const trustProto = buildUpdateWorkspaceTrustRequest(this.apiKey, workspaceUri, true, sessionId);
        await grpcUnary(this.port, this.csrfToken,
          `${LS_SERVICE}/UpdateWorkspaceTrust`, grpcFrame(trustProto), 5000);
      } catch (e) { log.warn(`UpdateWorkspaceTrust: ${e.message}`); }
      log.info(`Cascade workspace init complete for LS port=${this.port}`);
    })().catch(e => {
      lsEntry.workspaceInit = null;
      throw e;
    });
    return lsEntry.workspaceInit;
  }
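
  // The caching shape above, in isolation (a sketch of the idempotency
  // pattern, not repo API; `entry`, `inflight`, and `run` are illustrative
  // names): the first caller stores the in-flight Promise, concurrent
  // callers get the same Promise, and a failure clears the slot so the
  // next call retries.
  //
  //   function shared(entry, run) {
  //     if (!entry.inflight) {
  //       entry.inflight = run().catch((err) => {
  //         entry.inflight = null; // allow retry after failure
  //         throw err;
  //       });
  //     }
  //     return entry.inflight;
  //   }
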
  // ─── Cascade flow ──────────────────────────────────────────
  /**
   * Chat via Cascade flow (for premium models with string UIDs).
   *
   * 1. StartCascade → cascade_id
   * 2. SendUserCascadeMessage (with model config)
   * 3. Poll GetCascadeTrajectorySteps until IDLE
   *
   * @param {Array} messages
   * @param {number} modelEnum
   * @param {string} modelUid
   * @param {object} opts - { onChunk, onEnd, onError }
   */
  async cascadeChat(messages, modelEnum, modelUid, opts = {}) {
    const { onChunk, onEnd, onError, signal, reuseEntry, toolPreamble } = opts;
    const aborted = () => signal?.aborted;
    const inputChars = messages.reduce((n, m) => n + contentToString(m?.content).length, 0);
    log.debug(`CascadeChat: uid=${modelUid} enum=${modelEnum} msgs=${messages.length} reuse=${!!reuseEntry}`);
    // One-shot per-LS workspace init (idempotent; typically pre-warmed at
    // LS startup). Falls back to a local session id if the LS entry is gone.
    const lsEntry = getLsEntryByPort(this.port);
    await this.warmupCascade().catch(() => {});
    let sessionId = reuseEntry?.sessionId || lsEntry?.sessionId || randomUUID();
    // "panel state not found" means the LS forgot the panel for our sessionId
    // (LS restarted, TTL expired, etc.). Re-run warmupCascade with a fresh
    // sessionId and retry the handshake once.
    const isPanelMissing = (e) => /panel state not found|not_found.*panel/i.test(e?.message || '');
    try {
      // Step 1: Start cascade, with retry on panel-state-not-found
      let cascadeId;
      const openCascade = async () => {
        if (reuseEntry?.cascadeId) {
          log.debug(`Cascade resumed: ${reuseEntry.cascadeId}`);
          return reuseEntry.cascadeId;
        }
        const startProto = buildStartCascadeRequest(this.apiKey, sessionId);
        const startResp = await grpcUnary(
          this.port, this.csrfToken, `${LS_SERVICE}/StartCascade`, grpcFrame(startProto)
        );
        const id = parseStartCascadeResponse(startResp);
        if (!id) throw new Error('StartCascade returned empty cascade_id');
        log.debug(`Cascade started: ${id}`);
        return id;
      };
      try {
        cascadeId = await openCascade();
      } catch (e) {
        if (!isPanelMissing(e)) throw e;
        log.warn(`Panel state missing, re-warming LS port=${this.port}`);
        await this.warmupCascade(true).catch(() => {});
        sessionId = getLsEntryByPort(this.port)?.sessionId || randomUUID();
        if (reuseEntry) reuseEntry.cascadeId = null; // force StartCascade
        cascadeId = await openCascade();
      }
      // Build the text payload. Two cases:
      //  - Resuming an existing cascade: the backend already has the prior
      //    turns cached, so we only send the newest user message.
      //  - Fresh cascade: we have to pack the entire history into one shot
      //    (Cascade doesn't accept a messages array). System blocks go on
      //    top, then we render user/assistant turns as a labeled transcript
      //    so the model can see its own prior replies; previously we dropped
      //    assistant turns entirely and multi-turn context was broken.
      //    (See the worked payload example after this block.)
      //
      // The caller (handlers/chat.js) is responsible for any tool-protocol
      // preamble that needs to sit in front of the user text (client-defined
      // OpenAI tools are serialized into a '<tool_call>{...}</tool_call>'
      // emission contract there). This function just stitches system and
      // user/assistant turns into the single text payload Cascade accepts.
      let text;
      if (reuseEntry?.cascadeId) {
        const lastUser = [...messages].reverse().find(m => m.role === 'user');
        text = lastUser ? contentToString(lastUser.content) : '';
      } else {
        const systemMsgs = messages.filter(m => m.role === 'system');
        const convo = messages.filter(m => m.role === 'user' || m.role === 'assistant');
        const sysText = systemMsgs.map(m => contentToString(m.content)).join('\n').trim();
        if (convo.length <= 1) {
          const last = convo[convo.length - 1];
          text = last ? contentToString(last.content) : '';
        } else {
          const lines = [];
          for (let i = 0; i < convo.length - 1; i++) {
            const m = convo[i];
            const label = m.role === 'user' ? 'User' : 'Assistant';
            lines.push(`${label}: ${contentToString(m.content)}`);
          }
          const latest = convo[convo.length - 1];
          const latestText = latest ? contentToString(latest.content) : '';
          text = `[Conversation so far]\n${lines.join('\n\n')}\n\n[Current user message]\n${latestText}`;
        }
        if (sysText) text = sysText + '\n\n' + text;
      }
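      // Worked example of the fresh-cascade payload for a history of
      // [system, user, assistant, user] (contents abbreviated):
      //
      //   <system text>
      //
      //   [Conversation so far]
      //   User: <first user message>
      //
      //   Assistant: <assistant reply>
      //
      //   [Current user message]
      //   <latest user message>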
      // Step 2: Send message (retry once on panel-state-not-found)
      const sendMessage = async () => {
        const sendProto = buildSendCascadeMessageRequest(this.apiKey, cascadeId, text, modelEnum, modelUid, sessionId, { toolPreamble });
        await grpcUnary(
          this.port, this.csrfToken, `${LS_SERVICE}/SendUserCascadeMessage`, grpcFrame(sendProto)
        );
      };
      try {
        await sendMessage();
      } catch (e) {
        if (!isPanelMissing(e)) throw e;
        log.warn(`Panel state missing on Send, re-warming + restarting cascade port=${this.port}`);
        await this.warmupCascade(true).catch(() => {});
        sessionId = getLsEntryByPort(this.port)?.sessionId || randomUUID();
        const startProto = buildStartCascadeRequest(this.apiKey, sessionId);
        const startResp = await grpcUnary(
          this.port, this.csrfToken, `${LS_SERVICE}/StartCascade`, grpcFrame(startProto)
        );
        cascadeId = parseStartCascadeResponse(startResp);
        if (!cascadeId) throw new Error('StartCascade returned empty cascade_id after re-warm');
        await sendMessage();
      }
      // Step 3: Poll for response.
      // Track per-step text cursors instead of a single global `lastYielded`.
      // The cascade trajectory can contain MULTIPLE PLANNER_RESPONSE steps
      // (thinking step + final response, or multi-turn). The old single-cursor
      // code silently dropped any step whose text was shorter than the longest
      // step seen so far, which showed up as "30k in / 200 out" where the real
      // answer was split across two steps and only one was emitted.
      const chunks = [];
      const yieldedByStep = new Map();  // stepIndex → emitted text length
      const thinkingByStep = new Map(); // stepIndex → emitted thinking length
      // Server-reported token usage, one entry per step keyed by step index.
      // Each value is the latest {inputTokens, outputTokens, cacheReadTokens,
      // cacheWriteTokens} observed on that step's CortexStepMetadata.model_usage.
      // Summed across all steps at return time; that sum is the response's
      // real usage.
      const usageByStep = new Map();
      const seenToolCallIds = new Set();
      const toolCalls = [];
      let totalYielded = 0;
      let totalThinking = 0;
      let idleCount = 0;
      let pollCount = 0;
      let sawActive = false; // true once we've seen a non-IDLE status
      let sawText = false;   // true once at least one PLANNER_RESPONSE with text arrived
      let lastStatus = -1;
      // "Progress" is ANY forward motion on the trajectory: text, thinking,
      // new tool call, or a new step appearing. Using this (instead of text
      // alone) for stall detection fixes the false-positive warm stalls where
      // Cascade is legitimately mid-thinking but `responseText` hasn't moved.
      let lastGrowthAt = Date.now();
      let lastStepCount = 0;
      const maxWait = 180_000;
      const pollInterval = 250;
      const IDLE_GRACE_MS = 8_000; // minimum time before idle-break allowed
      // 25s without progress on any signal = genuine stall. Was 15s + text-only,
      // which misfired on long thinking phases and returned tiny "Let me…"
      // preambles as if they were complete replies.
      const NO_GROWTH_STALL_MS = 25_000;
      const STALL_RETRY_MIN_TEXT = 300; // stalls shorter than this are a retryable error, not partial success
      const startTime = Date.now();
      let endReason = 'unknown';
      while (Date.now() - startTime < maxWait) {
        if (aborted()) { endReason = 'aborted'; break; }
        await new Promise(r => setTimeout(r, pollInterval));
        if (aborted()) { endReason = 'aborted'; break; }
        pollCount++;
        // Get steps
        const stepsProto = buildGetTrajectoryStepsRequest(cascadeId, 0);
        const stepsResp = await grpcUnary(
          this.port, this.csrfToken, `${LS_SERVICE}/GetCascadeTrajectorySteps`, grpcFrame(stepsProto)
        );
        const steps = parseTrajectorySteps(stepsResp);
        // CORTEX_STEP_TYPE_ERROR_MESSAGE = 17. An error step means the cascade
        // refused the request (permission denied, model unavailable, etc.), so
        // raise it as a model-level error so the account isn't blamed.
        for (const step of steps) {
          if (step.type === 17 && step.errorText) {
            // Log the full trajectory context so we can see WHICH tool call
            // (if any) the error refers to. "invalid tool call" without
            // context is useless for debugging.
            const trail = steps.map(s => ({
              type: s.type,
              status: s.status,
              textLen: s.text?.length || 0,
              tools: (s.toolCalls || []).map(tc => tc.name).join(','),
            }));
            log.warn('Cascade error step', { errorText: step.errorText.trim(), trail });
            const err = new Error(step.errorText.trim());
            err.isModelError = true;
            throw err;
          }
        }
        // Stall detection, two flavors:
        //  (a) "cold stall": ACTIVE past the cold-stall threshold (30s base,
        //      scaled up for long inputs; see coldStallMs below) without ever
        //      seeing any text or tool call. The planner is deadlocked before
        //      even starting to produce output. Rotate account, don't make
        //      the user wait.
        //  (b) "warm stall": we already streamed some text, but it hasn't
        //      grown for NO_GROWTH_STALL_MS (25s) while status is still
        //      non-IDLE. The planner is stuck in a tool round-trip or an
        //      upstream throttle. Accept what we have as a complete response
        //      rather than waiting out the full 180s maxWait with the client
        //      hanging.
        const elapsed = Date.now() - startTime;
        // Cap at maxWait (180s): long-context requests can legitimately take
        // that long to emit the first token from Cascade. Was 90s, which
        // still tripped on very long prompts (issue #5).
        const coldStallMs = Math.min(maxWait, 30_000 + Math.floor(inputChars / 1500) * 5_000);
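        // Worked examples of the threshold: inputChars=30_000 gives
        // 30s + (30_000/1500)*5s = 130s; inputChars=60_000 gives 230s,
        // capped to maxWait (180s).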
        if (elapsed > coldStallMs && sawActive && !sawText && seenToolCallIds.size === 0) {
          log.warn(`Cascade cold stall: ${elapsed}ms active without any text or tool call (threshold=${coldStallMs}ms, inputChars=${inputChars}), bailing`);
          endReason = 'stall_cold';
          const err = new Error(`Cascade planner stalled: no output after ${Math.round(coldStallMs / 1000)}s`);
          err.isModelError = true;
          throw err;
        }
        if (sawText && lastStatus !== 1 && (Date.now() - lastGrowthAt) > NO_GROWTH_STALL_MS) {
          const diag = {
            msSinceGrowth: Date.now() - lastGrowthAt,
            textLen: totalYielded,
            thinkingLen: totalThinking,
            stepCount: yieldedByStep.size,
            toolCalls: seenToolCallIds.size,
            lastStatus,
          };
          // Short-reply stall: treat as error so handlers/chat.js retries on
          // another account. A 50-char preamble is worse than no reply at all
          // because the client accepts it as "successful" and shows it to the
          // user. Retry only if we haven't streamed anything substantial yet
          // (if we did, partial delivery + idle end is fine).
          if (totalYielded < STALL_RETRY_MIN_TEXT) {
            log.warn('Cascade warm stall (short, retrying on next account)', diag);
            endReason = 'stall_warm_retry';
            const err = new Error('Cascade planner stalled after preamble: no progress for 25s');
            err.isModelError = true;
            throw err;
          }
          log.warn('Cascade warm stall (accepting partial)', diag);
          endReason = 'stall_warm';
          break; // return what we have as a successful response
        }
        // Any trajectory change counts as forward progress. A new step, a new
        // tool call proposal, or thinking growth all reset the stall timer so
        // Cascade's slow silent planning phases don't get cut off mid-think.
        if (steps.length > lastStepCount) {
          lastStepCount = steps.length;
          lastGrowthAt = Date.now();
        }
        for (let i = 0; i < steps.length; i++) {
          const step = steps[i];
          // Per-step token usage. Overwrite on every poll so the map always
          // holds the latest reported numbers (they grow monotonically as
          // the generator emits more output). We sum across steps at the
          // end to compute the response's total usage.
          if (step.usage) usageByStep.set(i, step.usage);
          // Collect tool calls, deduped by id so the same step seen across
          // polls only emits once. A tool call with an existing `result`
          // means the LS already executed it (built-in Cascade tool); we
          // pass it through to the client for visibility.
          if (step.toolCalls && step.toolCalls.length) {
            for (const tc of step.toolCalls) {
              const key = tc.id || `${tc.name}:${tc.argumentsJson}`;
              if (seenToolCallIds.has(key)) continue;
              seenToolCallIds.add(key);
              toolCalls.push(tc);
              lastGrowthAt = Date.now();
            }
          }
          // Thinking delta: the LS keeps `thinking` as the cumulative
          // reasoning text for the step. Track a per-step cursor and emit
          // only the tail as reasoning_content. Crucially, thinking growth
          // *also* resets lastGrowthAt; prior code only watched response
          // text, so long silent thinking phases got falsely flagged as
          // stalls and 20% of Cascade requests came back as 50-char
          // preambles (`/tmp/...` style "let me analyze" stubs).
          const liveThink = step.thinking || '';
          if (liveThink) {
            const prevThink = thinkingByStep.get(i) || 0;
            if (liveThink.length > prevThink) {
              const thinkDelta = liveThink.slice(prevThink);
              thinkingByStep.set(i, liveThink.length);
              totalThinking += thinkDelta.length;
              lastGrowthAt = Date.now();
              const tchunk = { text: '', thinking: thinkDelta, isError: false };
              chunks.push(tchunk);
              onChunk?.(tchunk);
            }
          }
          // Text delta rule: prefer `responseText` (append-only stream) over
          // `modifiedText` (LS post-pass rewrite) while we're streaming. The
          // LS periodically swaps `response` → `modified_response` mid-turn
          // with slightly different wording; if we blindly `entry.text =
          // modifiedText || responseText` and take a length-based slice, the
          // rewritten middle bytes vanish because we already advanced the
          // cursor past them in an earlier poll. Using responseText keeps the
          // slice monotonic. At turn end we top up with `modifiedText` (see
          // below) so the final accumulated text is still the LS's polished
          // version when one exists.
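          // Illustrative failure mode with the naive approach: poll 1 sees
          // responseText="The answer" (cursor=10); poll 2 swaps in
          // modifiedText="The polished answer". Slicing that from cursor 10
          // emits "ed answer", so the client sees "The answered answer".
          // Slicing responseText keeps the stream append-only and correct.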
          const liveText = step.responseText || step.text || '';
          if (!liveText) continue;
          const prev = yieldedByStep.get(i) || 0;
          if (liveText.length > prev) {
            const delta = liveText.slice(prev);
            yieldedByStep.set(i, liveText.length);
            totalYielded += delta.length;
            lastGrowthAt = Date.now();
            sawText = true;
            const chunk = { text: delta, thinking: '', isError: false };
            chunks.push(chunk);
            onChunk?.(chunk);
          }
        }
        // Check status
        const statusProto = buildGetTrajectoryRequest(cascadeId);
        const statusResp = await grpcUnary(
          this.port, this.csrfToken, `${LS_SERVICE}/GetCascadeTrajectory`, grpcFrame(statusProto)
        );
        const status = parseTrajectoryStatus(statusResp);
        lastStatus = status;
        if (status !== 1) sawActive = true;
        if (status === 1) { // IDLE
          // Don't allow idle-break during the warmup window unless we've
          // already seen the planner go non-IDLE at least once. Without this
          // guard, cascades whose trajectory hasn't kicked off yet (status
          // stuck at 1 for the first ~600ms) terminate after only 2 polls
          // and the client sees a near-empty reply.
          const elapsed = Date.now() - startTime;
          const graceOver = elapsed > IDLE_GRACE_MS;
          if (!sawActive && !graceOver) {
            continue; // still warming up, don't count this as idle
          }
          idleCount++;
          // Require at least a little text OR a long idle streak before
          // accepting "done", so we don't race the first visible chunk.
          const canBreak = sawText ? idleCount >= 2 : idleCount >= 4;
          if (canBreak) {
            // Final sweep
            const finalResp = await grpcUnary(
              this.port, this.csrfToken, `${LS_SERVICE}/GetCascadeTrajectorySteps`, grpcFrame(stepsProto)
            );
            const finalSteps = parseTrajectorySteps(finalResp);
            for (let i = 0; i < finalSteps.length; i++) {
              const step = finalSteps[i];
              const responseText = step.responseText || '';
              const modifiedText = step.modifiedText || '';
              const prev = yieldedByStep.get(i) || 0;
              // Normal top-up: responseText grew past what we streamed.
              if (responseText.length > prev) {
                const delta = responseText.slice(prev);
                yieldedByStep.set(i, responseText.length);
                totalYielded += delta.length;
                chunks.push({ text: delta, thinking: '', isError: false });
                onChunk?.({ text: delta, thinking: '', isError: false });
              }
              // Modified-response top-up: only if it's a strict extension of
              // what we already emitted. If modifiedText rewrites the prefix
              // (common when LS polishes), emitting the tail would splice
              // wrong content onto the stream, so we skip it and keep the
              // raw responseText we already showed.
              const cursor = yieldedByStep.get(i) || 0;
              if (modifiedText.length > cursor && modifiedText.startsWith(responseText)) {
                const delta = modifiedText.slice(cursor);
                yieldedByStep.set(i, modifiedText.length);
                totalYielded += delta.length;
                chunks.push({ text: delta, thinking: '', isError: false });
                onChunk?.({ text: delta, thinking: '', isError: false });
              }
            }
            endReason = sawText ? 'idle_done' : 'idle_empty';
            break;
          }
        } else {
          idleCount = 0;
        }
      }
      if (endReason === 'unknown') endReason = 'max_wait';
      // Structured summary so we can diagnose short/empty completions after
      // the fact. sawActive=false + sawText=false + idle_empty = the planner
      // never actually ran on this cascade; likely an upstream starvation.
      const summary = {
        cascadeId: cascadeId.slice(0, 8),
        reason: endReason,
        polls: pollCount,
        textLen: totalYielded,
        thinkingLen: totalThinking,
        stepCount: Math.max(yieldedByStep.size, thinkingByStep.size, lastStepCount),
        toolCalls: seenToolCallIds.size,
        sawActive,
        sawText,
        lastStatus,
        ms: Date.now() - startTime,
      };
      if (totalYielded < 20 && endReason !== 'aborted') {
        log.warn('Cascade short reply', summary);
      } else {
        log.info('Cascade done', summary);
      }
      onEnd?.(chunks);
      // ── Real token usage via GetCascadeTrajectoryGeneratorMetadata ──
      // CortexStepMetadata.model_usage (the per-step field) is usually empty
      // in the step trajectory response; the LS only populates the real
      // token counts in a separate RPC keyed off cascade_id. We fire this
      // once after the polling loop ends. Keep it non-fatal: a network blip
      // here just drops usage back to the chars/4 estimator, the response
      // itself is already formed.
      let serverUsage = null;
      try {
        const metaReq = buildGetGeneratorMetadataRequest(cascadeId, 0);
        const metaResp = await grpcUnary(
          this.port, this.csrfToken,
          `${LS_SERVICE}/GetCascadeTrajectoryGeneratorMetadata`,
          grpcFrame(metaReq), 5000
        );
        serverUsage = parseGeneratorMetadata(metaResp);
      } catch (e) {
        log.debug(`GetCascadeTrajectoryGeneratorMetadata failed: ${e.message}`);
      }
      // Fallback: if the generator metadata RPC didn't give us anything,
      // check the per-step metadata we collected during polling (some LS
      // versions do populate CortexStepMetadata.model_usage directly).
      if (!serverUsage && usageByStep.size > 0) {
        let inT = 0, outT = 0, cacheR = 0, cacheW = 0;
        for (const u of usageByStep.values()) {
          inT += u.inputTokens || 0;
          outT += u.outputTokens || 0;
          cacheR += u.cacheReadTokens || 0;
          cacheW += u.cacheWriteTokens || 0;
        }
        if (inT || outT || cacheR || cacheW) {
          serverUsage = {
            inputTokens: inT,
            outputTokens: outT,
            cacheReadTokens: cacheR,
            cacheWriteTokens: cacheW,
          };
        }
      }
      // Attach cascade metadata so the caller can check it back into the
      // conversation pool. We still return the array so existing callers
      // that iterate over it keep working.
      chunks.cascadeId = cascadeId;
      chunks.sessionId = sessionId;
      chunks.toolCalls = toolCalls;
      chunks.usage = serverUsage;
      if (serverUsage) {
        log.info(`Cascade usage: in=${serverUsage.inputTokens} out=${serverUsage.outputTokens} cache_r=${serverUsage.cacheReadTokens} cache_w=${serverUsage.cacheWriteTokens}`);
      }
      if (toolCalls.length) log.info(`Cascade tool calls: ${toolCalls.length}`, { names: toolCalls.map(t => t.name) });
      return chunks;
    } catch (err) {
      onError?.(err);
      throw err;
    }
  }
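
  // Conversation-reuse sketch (hypothetical pool entry; the field names
  // match what cascadeChat attaches to its return value above):
  //
  //   let entry = null;
  //   const chunks = await client.cascadeChat(messages, modelEnum, modelUid, {
  //     reuseEntry: entry,
  //   });
  //   entry = { cascadeId: chunks.cascadeId, sessionId: chunks.sessionId };
  //   // Next turn: pass `entry` back in and only the newest user message
  //   // is sent, since the backend already has the prior turns cached.
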
  // ─── Register user (JSON REST, unchanged) ──────────────────
  async registerUser(firebaseToken) {
    return new Promise((resolve, reject) => {
      const postData = JSON.stringify({ firebase_id_token: firebaseToken });
      const req = https.request({
        hostname: 'api.codeium.com',
        port: 443,
        path: '/register_user/',
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(postData),
        },
      }, (res) => {
        let raw = '';
        res.on('data', d => raw += d);
        res.on('end', () => {
          try {
            const json = JSON.parse(raw);
            if (res.statusCode >= 400) {
              reject(new Error(`RegisterUser failed (${res.statusCode}): ${raw}`));
              return;
            }
            if (!json.api_key) {
              reject(new Error(`RegisterUser response missing api_key: ${raw}`));
              return;
            }
            resolve({ apiKey: json.api_key, name: json.name, apiServerUrl: json.api_server_url });
          } catch {
            reject(new Error(`RegisterUser parse error: ${raw}`));
          }
        });
        res.on('error', reject);
      });
      req.on('error', reject);
      req.write(postData);
      req.end();
    });
  }
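
  // For manual testing, the request above is equivalent to this global-fetch
  // form (Node >= 18; an illustration of the same endpoint and body, not
  // code this repo ships):
  //
  //   const json = await fetch('https://api.codeium.com/register_user/', {
  //     method: 'POST',
  //     headers: { 'Content-Type': 'application/json' },
  //     body: JSON.stringify({ firebase_id_token: token }),
  //   }).then(r => r.json());
  //   // success responses carry api_key, name, api_server_url
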
  // ── GetUserStatus ──────────────────────────────────────────
  //
  // One-shot RPC that returns the account's canonical tier + cascade
  // model allowlist + credit usage + trial end time. Replaces the
  // probe-based tier inference for accounts where this call succeeds.
  async getUserStatus() {
    const proto = buildGetUserStatusRequest(this.apiKey);
    const resp = await grpcUnary(
      this.port, this.csrfToken,
      `${LS_SERVICE}/GetUserStatus`, grpcFrame(proto), 10000,
    );
    return parseGetUserStatusResponse(resp);
  }
}