/**
 * WindsurfClient — talks to the local language server binary via gRPC (HTTP/2).
 *
 * Two flows:
 *   Legacy  → RawGetChatMessage (streaming, for enum-only models)
 *   Cascade → StartCascade → SendUserCascadeMessage → poll (for modelUid models)
 */
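//
// Minimal construction sketch (illustrative only; the port and CSRF token come
// from the language-server registry in langserver.js, and the `account`/`ls`
// names here are placeholders, not real exports):
//
//   const client = new WindsurfClient(account.apiKey, ls.port, ls.csrfToken);
//   await client.warmupCascade().catch(() => {});   // optional pre-warm
//
// Per-flow usage examples live on rawGetChatMessage() and cascadeChat() below.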

import https from 'https';
import { randomUUID } from 'crypto';
import { log } from './config.js';
import { grpcFrame, grpcUnary, grpcStream } from './grpc.js';
import { getLsEntryByPort } from './langserver.js';
import {
  buildRawGetChatMessageRequest, parseRawResponse,
  buildInitializePanelStateRequest,
  buildAddTrackedWorkspaceRequest,
  buildUpdateWorkspaceTrustRequest,
  buildStartCascadeRequest, parseStartCascadeResponse,
  buildSendCascadeMessageRequest,
  buildGetTrajectoryRequest, parseTrajectoryStatus,
  buildGetTrajectoryStepsRequest, parseTrajectorySteps,
  buildGetGeneratorMetadataRequest, parseGeneratorMetadata,
  buildGetUserStatusRequest, parseGetUserStatusResponse,
} from './windsurf.js';

const LS_SERVICE = '/exa.language_server_pb.LanguageServerService';

function contentToString(content) {
  if (typeof content === 'string') return content;
  if (Array.isArray(content)) {
    return content.map(p => (typeof p?.text === 'string' ? p.text : JSON.stringify(p))).join('');
  }
  return content == null ? '' : JSON.stringify(content);
}

// ─── WindsurfClient ────────────────────────────────────────

export class WindsurfClient {
  /**
   * @param {string} apiKey - Codeium API key
   * @param {number} port - Language server gRPC port
   * @param {string} csrfToken - CSRF token for auth
   */
  constructor(apiKey, port, csrfToken) {
    this.apiKey = apiKey;
    this.port = port;
    this.csrfToken = csrfToken;
  }

  // ─── Legacy: RawGetChatMessage (streaming) ───────────────

  /**
   * Stream chat via RawGetChatMessage.
   * Used for models without a string UID (enum < 280 generally).
   *
   * @param {Array} messages - OpenAI-format messages
   * @param {number} modelEnum - Model enum value
   * @param {string} [modelName] - Optional model name
   * @param {object} opts - { onChunk, onEnd, onError }
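   *
   * @example
   * // Hedged sketch: stream one enum-only chat turn. The enum value 231 is
   * // a placeholder, not a real catalogue entry.
   * const chunks = await client.rawGetChatMessage(
   *   [{ role: 'user', content: 'hello' }], 231, undefined,
   *   { onChunk: (c) => process.stdout.write(c.text) },
   * );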
   */
  rawGetChatMessage(messages, modelEnum, modelName, opts = {}) {
    const { onChunk, onEnd, onError } = opts;
    const proto = buildRawGetChatMessageRequest(this.apiKey, messages, modelEnum, modelName);
    const body = grpcFrame(proto);

    log.debug(`RawGetChatMessage: enum=${modelEnum} msgs=${messages.length}`);

    return new Promise((resolve, reject) => {
      const chunks = [];

      grpcStream(this.port, this.csrfToken, `${LS_SERVICE}/RawGetChatMessage`, body, {
        onData: (payload) => {
          try {
            const parsed = parseRawResponse(payload);
            if (parsed.text) {
              // Detect server-side errors returned as text
              const errMatch = /^(permission_denied|failed_precondition|not_found|unauthenticated):/.test(parsed.text.trim());
              if (parsed.isError || errMatch) {
                const err = new Error(parsed.text.trim());
                // Mark model-level errors so they don't count against the account
                err.isModelError = /permission_denied|failed_precondition/.test(parsed.text);
                reject(err);
                return;
              }
              chunks.push(parsed);
              onChunk?.(parsed);
            }
          } catch (e) {
            log.error('RawGetChatMessage parse error:', e.message);
          }
        },
        onEnd: () => {
          onEnd?.(chunks);
          resolve(chunks);
        },
        onError: (err) => {
          onError?.(err);
          reject(err);
        },
      });
    });
  }

  /**
   * Run (or wait for) the one-shot Cascade workspace init for this LS.
   * Idempotent — the LS entry caches the in-flight Promise so concurrent
   * callers share one init round. Safe to call from a startup warmup path
   * so the first real chat request skips these 3 gRPC round-trips.
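   *
   * @example
   * // Hedged sketch: force a re-init after the LS loses the panel state
   * // (fresh sessionId, workspace re-tracked and re-trusted), mirroring the
   * // recovery path cascadeChat() takes on "panel state not found".
   * await client.warmupCascade(true).catch(() => {});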
   */
  warmupCascade(force = false) {
    const lsEntry = getLsEntryByPort(this.port);
    if (!lsEntry) return Promise.resolve();
    if (force) {
      lsEntry.workspaceInit = null;
      lsEntry.sessionId = randomUUID();
    }
    if (!lsEntry.sessionId) lsEntry.sessionId = randomUUID();
    if (lsEntry.workspaceInit) return lsEntry.workspaceInit;

    const sessionId = lsEntry.sessionId;
    const workspacePath = '/tmp/windsurf-workspace';
    const workspaceUri = 'file:///tmp/windsurf-workspace';

    lsEntry.workspaceInit = (async () => {
      try {
        const initProto = buildInitializePanelStateRequest(this.apiKey, sessionId);
        await grpcUnary(this.port, this.csrfToken,
          `${LS_SERVICE}/InitializeCascadePanelState`, grpcFrame(initProto), 5000);
      } catch (e) { log.warn(`InitializeCascadePanelState: ${e.message}`); }
      try {
        const addWsProto = buildAddTrackedWorkspaceRequest(this.apiKey, workspacePath, sessionId);
        await grpcUnary(this.port, this.csrfToken,
          `${LS_SERVICE}/AddTrackedWorkspace`, grpcFrame(addWsProto), 5000);
      } catch (e) { log.warn(`AddTrackedWorkspace: ${e.message}`); }
      try {
        const trustProto = buildUpdateWorkspaceTrustRequest(this.apiKey, workspaceUri, true, sessionId);
        await grpcUnary(this.port, this.csrfToken,
          `${LS_SERVICE}/UpdateWorkspaceTrust`, grpcFrame(trustProto), 5000);
      } catch (e) { log.warn(`UpdateWorkspaceTrust: ${e.message}`); }
      log.info(`Cascade workspace init complete for LS port=${this.port}`);
    })().catch(e => {
      lsEntry.workspaceInit = null;
      throw e;
    });
    return lsEntry.workspaceInit;
  }

  // ─── Cascade flow ────────────────────────────────────────

  /**
   * Chat via Cascade flow (for premium models with string UIDs).
   *
   * 1. StartCascade → cascade_id
   * 2. SendUserCascadeMessage (with model config)
   * 3. Poll GetCascadeTrajectorySteps until IDLE
   *
   * @param {Array} messages
   * @param {number} modelEnum
   * @param {string} modelUid
   * @param {object} opts - { onChunk, onEnd, onError, signal, reuseEntry, toolPreamble }
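   *
   * @example
   * // Hedged sketch: one abortable Cascade turn. The enum and uid values
   * // are placeholders; real ones come from the caller's model catalogue.
   * const ac = new AbortController();
   * const chunks = await client.cascadeChat(messages, 284, 'placeholder-uid', {
   *   signal: ac.signal,
   *   onChunk: (c) => process.stdout.write(c.text || c.thinking),
   * });
   * console.log(chunks.usage, chunks.toolCalls.length);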
   */
  async cascadeChat(messages, modelEnum, modelUid, opts = {}) {
    const { onChunk, onEnd, onError, signal, reuseEntry, toolPreamble } = opts;
    const aborted = () => signal?.aborted;
    const inputChars = messages.reduce((n, m) => n + contentToString(m?.content).length, 0);

    log.debug(`CascadeChat: uid=${modelUid} enum=${modelEnum} msgs=${messages.length} reuse=${!!reuseEntry}`);

    // One-shot per-LS workspace init (idempotent; typically pre-warmed at
    // LS startup). Falls back to a local session id if the LS entry is gone.
    const lsEntry = getLsEntryByPort(this.port);
    await this.warmupCascade().catch(() => {});
    let sessionId = reuseEntry?.sessionId || lsEntry?.sessionId || randomUUID();

    // "panel state not found" means the LS forgot the panel for our sessionId
    // (LS restarted, TTL expired, etc.). Re-run warmupCascade with a fresh
    // sessionId and retry the handshake once.
    const isPanelMissing = (e) => /panel state not found|not_found.*panel/i.test(e?.message || '');

    try {
      // Step 1: Start cascade — with retry on panel-state-not-found
      let cascadeId;
      const openCascade = async () => {
        if (reuseEntry?.cascadeId) {
          log.debug(`Cascade resumed: ${reuseEntry.cascadeId}`);
          return reuseEntry.cascadeId;
        }
        const startProto = buildStartCascadeRequest(this.apiKey, sessionId);
        const startResp = await grpcUnary(
          this.port, this.csrfToken, `${LS_SERVICE}/StartCascade`, grpcFrame(startProto)
        );
        const id = parseStartCascadeResponse(startResp);
        if (!id) throw new Error('StartCascade returned empty cascade_id');
        log.debug(`Cascade started: ${id}`);
        return id;
      };
      try {
        cascadeId = await openCascade();
      } catch (e) {
        if (!isPanelMissing(e)) throw e;
        log.warn(`Panel state missing, re-warming LS port=${this.port}`);
        await this.warmupCascade(true).catch(() => {});
        sessionId = getLsEntryByPort(this.port)?.sessionId || randomUUID();
        if (reuseEntry) reuseEntry.cascadeId = null; // force StartCascade
        cascadeId = await openCascade();
      }

      // Build the text payload. Two cases:
      //   - Resuming an existing cascade: the backend already has the prior
      //     turns cached, so we only send the newest user message.
      //   - Fresh cascade: we have to pack the entire history into one shot
      //     (Cascade doesn't accept a messages array). System blocks go on
      //     top, then we render u/a turns as a labeled transcript so the
      //     model can see its own prior replies — previously we dropped
      //     assistant turns entirely and multi-turn context was broken.
      //
      // The caller (handlers/chat.js) is responsible for any tool-protocol
      // preamble that needs to sit in front of the user text (client-defined
      // OpenAI tools are serialized into a '<tool_call>{...}</tool_call>'
      // emission contract there). This function just stitches system + u/a
      // turns into the single text payload Cascade accepts.
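      //
      // Illustrative packed form for a fresh cascade with one prior exchange
      // (whitespace approximate):
      //
      //   <system text>
      //
      //   [Conversation so far]
      //   User: first question
      //
      //   Assistant: first answer
      //
      //   [Current user message]
      //   <latest user text>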
      let text;
      if (reuseEntry?.cascadeId) {
        const lastUser = [...messages].reverse().find(m => m.role === 'user');
        text = lastUser ? contentToString(lastUser.content) : '';
      } else {
        const systemMsgs = messages.filter(m => m.role === 'system');
        const convo = messages.filter(m => m.role === 'user' || m.role === 'assistant');
        const sysText = systemMsgs.map(m => contentToString(m.content)).join('\n').trim();

        if (convo.length <= 1) {
          const last = convo[convo.length - 1];
          text = last ? contentToString(last.content) : '';
        } else {
          const lines = [];
          for (let i = 0; i < convo.length - 1; i++) {
            const m = convo[i];
            const label = m.role === 'user' ? 'User' : 'Assistant';
            lines.push(`${label}: ${contentToString(m.content)}`);
          }
          const latest = convo[convo.length - 1];
          const latestText = latest ? contentToString(latest.content) : '';
          text = `[Conversation so far]\n${lines.join('\n\n')}\n\n[Current user message]\n${latestText}`;
        }
        if (sysText) text = sysText + '\n\n' + text;
      }

      // Step 2: Send message (retry once on panel-state-not-found)
      const sendMessage = async () => {
        const sendProto = buildSendCascadeMessageRequest(this.apiKey, cascadeId, text, modelEnum, modelUid, sessionId, { toolPreamble });
        await grpcUnary(
          this.port, this.csrfToken, `${LS_SERVICE}/SendUserCascadeMessage`, grpcFrame(sendProto)
        );
      };
      try {
        await sendMessage();
      } catch (e) {
        if (!isPanelMissing(e)) throw e;
        log.warn(`Panel state missing on Send, re-warming + restarting cascade port=${this.port}`);
        await this.warmupCascade(true).catch(() => {});
        sessionId = getLsEntryByPort(this.port)?.sessionId || randomUUID();
        const startProto = buildStartCascadeRequest(this.apiKey, sessionId);
        const startResp = await grpcUnary(
          this.port, this.csrfToken, `${LS_SERVICE}/StartCascade`, grpcFrame(startProto)
        );
        cascadeId = parseStartCascadeResponse(startResp);
        if (!cascadeId) throw new Error('StartCascade returned empty cascade_id after re-warm');
        await sendMessage();
      }

      // Step 3: Poll for response.
      // Track per-step text cursors instead of a single global `lastYielded`.
      // The cascade trajectory can contain MULTIPLE PLANNER_RESPONSE steps
      // (thinking step + final response, or multi-turn). The old single-cursor
      // code silently dropped any step whose text was shorter than the longest
      // step seen so far — which showed up as "30k in / 200 out" where the real
      // answer was split across two steps and only one was emitted.
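      // Illustrative failure mode: step 0 streams a 1,200-char preamble and
      // step 1 streams the 400-char answer; a single cursor stuck at 1,200
      // slices past the end of step 1 and emits nothing from it.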
      const chunks = [];
      const yieldedByStep = new Map(); // stepIndex → emitted text length
      const thinkingByStep = new Map(); // stepIndex → emitted thinking length
      // Server-reported token usage, one entry per step keyed by step index.
      // Each value is the latest {inputTokens, outputTokens, cacheReadTokens,
      // cacheWriteTokens} observed on that step's CortexStepMetadata.model_usage.
      // Summed across all steps at return time β†’ the response's real usage.
      const usageByStep = new Map();
      const seenToolCallIds = new Set();
      const toolCalls = [];
      let totalYielded = 0;
      let totalThinking = 0;
      let idleCount = 0;
      let pollCount = 0;
      let sawActive = false;   // true once we've seen a non-IDLE status
      let sawText = false;     // true once at least one PLANNER_RESPONSE with text arrived
      let lastStatus = -1;
      // "Progress" is ANY forward motion on the trajectory β€” text, thinking,
      // new tool call, or a new step appearing. Using this (instead of text
      // alone) for stall detection fixes the false-positive warm stalls where
      // Cascade is legitimately mid-thinking but `responseText` hasn't moved.
      let lastGrowthAt = Date.now();
      let lastStepCount = 0;
      const maxWait = 180_000;
      const pollInterval = 250;
      const IDLE_GRACE_MS = 8_000;     // minimum time before idle-break allowed
      // 25s no progress on any signal = genuine stall. Was 15s + text-only,
      // which misfired on long thinking phases and returned tiny "Let me…"
      // preambles as if they were complete replies.
      const NO_GROWTH_STALL_MS = 25_000;
      const STALL_RETRY_MIN_TEXT = 300;  // stalls shorter than this → retryable error, not partial success
      const startTime = Date.now();
      let endReason = 'unknown';

      while (Date.now() - startTime < maxWait) {
        if (aborted()) { endReason = 'aborted'; break; }
        await new Promise(r => setTimeout(r, pollInterval));
        if (aborted()) { endReason = 'aborted'; break; }
        pollCount++;

        // Get steps
        const stepsProto = buildGetTrajectoryStepsRequest(cascadeId, 0);
        const stepsResp = await grpcUnary(
          this.port, this.csrfToken, `${LS_SERVICE}/GetCascadeTrajectorySteps`, grpcFrame(stepsProto)
        );
        const steps = parseTrajectorySteps(stepsResp);

        // CORTEX_STEP_TYPE_ERROR_MESSAGE = 17. An error step means the cascade
        // refused the request (permission denied, model unavailable, etc.) —
        // raise it as a model-level error so the account isn't blamed.
        for (const step of steps) {
          if (step.type === 17 && step.errorText) {
            // Log the full trajectory context so we can see WHICH tool call
            // (if any) the error refers to. "invalid tool call" without
            // context is useless for debugging.
            const trail = steps.map(s => ({
              type: s.type,
              status: s.status,
              textLen: s.text?.length || 0,
              tools: (s.toolCalls || []).map(tc => tc.name).join(','),
            }));
            log.warn('Cascade error step', { errorText: step.errorText.trim(), trail });
            const err = new Error(step.errorText.trim());
            err.isModelError = true;
            throw err;
          }
        }

        // Stall detection — two flavors:
        //   (a) "cold stall": 30s+ ACTIVE but never saw any text or tool
        //       call → planner is deadlocked before even starting to
        //       produce output. Rotate account, don't make the user wait.
        //   (b) "warm stall": we already streamed some text, but nothing has
        //       grown for NO_GROWTH_STALL_MS (25s) while status is still
        //       non-IDLE → planner is stuck in a tool round-trip or upstream
        //       throttle. Accept what we have as a complete response (or, if
        //       it's shorter than STALL_RETRY_MIN_TEXT, raise a retryable
        //       error) rather than waiting out the full 180s maxWait with
        //       the client hanging.
        const elapsed = Date.now() - startTime;
        // Cap at maxWait (180s): long-context requests can legitimately take
        // that long to emit the first token from Cascade. Was 90s which
        // still tripped on very long prompts (issue #5).
        const coldStallMs = Math.min(maxWait, 30_000 + Math.floor(inputChars / 1500) * 5_000);
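        // Illustrative values: inputChars=9,000 gives 30s + 6*5s = 60s;
        // inputChars=45,000 already saturates at the 180s maxWait cap.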
        if (elapsed > coldStallMs && sawActive && !sawText && seenToolCallIds.size === 0) {
          log.warn(`Cascade cold stall: ${elapsed}ms active without any text or tool call (threshold=${coldStallMs}ms, inputChars=${inputChars}), bailing`);
          endReason = 'stall_cold';
          const err = new Error(`Cascade planner stalled — no output after ${Math.round(coldStallMs / 1000)}s`);
          err.isModelError = true;
          throw err;
        }
        if (sawText && lastStatus !== 1 && (Date.now() - lastGrowthAt) > NO_GROWTH_STALL_MS) {
          const diag = {
            msSinceGrowth: Date.now() - lastGrowthAt,
            textLen: totalYielded,
            thinkingLen: totalThinking,
            stepCount: yieldedByStep.size,
            toolCalls: seenToolCallIds.size,
            lastStatus,
          };
          // Short-reply stall → treat as error so handlers/chat.js retries on
          // another account. A 50-char preamble is worse than no reply at all
          // because the client accepts it as "successful" and shows it to the
          // user. Retry only if we haven't streamed anything substantial yet
          // (if we did, partial delivery + idle end is fine).
          if (totalYielded < STALL_RETRY_MIN_TEXT) {
            log.warn('Cascade warm stall (short, retrying on next account)', diag);
            endReason = 'stall_warm_retry';
            const err = new Error('Cascade planner stalled after preamble — no progress for 25s');
            err.isModelError = true;
            throw err;
          }
          log.warn('Cascade warm stall (accepting partial)', diag);
          endReason = 'stall_warm';
          break; // return what we have as a successful response
        }

        // Any trajectory change counts as forward progress. A new step, a new
        // tool call proposal, or thinking growth all reset the stall timer so
        // Cascade's slow silent planning phases don't get cut off mid-think.
        if (steps.length > lastStepCount) {
          lastStepCount = steps.length;
          lastGrowthAt = Date.now();
        }

        for (let i = 0; i < steps.length; i++) {
          const step = steps[i];

          // Per-step token usage. Overwrite on every poll so the map always
          // holds the latest reported numbers (they grow monotonically as
          // the generator emits more output). We sum across steps at the
          // end to compute the response's total usage.
          if (step.usage) usageByStep.set(i, step.usage);

          // Collect tool calls — dedupe by id so the same step seen across
          // polls only emits once. A tool call with an existing `result`
          // means the LS already executed it (built-in Cascade tool); we
          // pass it through to the client for visibility.
          if (step.toolCalls && step.toolCalls.length) {
            for (const tc of step.toolCalls) {
              const key = tc.id || `${tc.name}:${tc.argumentsJson}`;
              if (seenToolCallIds.has(key)) continue;
              seenToolCallIds.add(key);
              toolCalls.push(tc);
              lastGrowthAt = Date.now();
            }
          }

          // Thinking delta: the LS keeps `thinking` as the cumulative
          // reasoning text for the step. Track a per-step cursor and emit
          // only the tail as reasoning_content. Crucially, thinking growth
          // *also* resets lastGrowthAt — prior code only watched response
          // text, so long silent thinking phases got falsely flagged as
          // stalls and 20% of Cascade requests came back as 50-char
          // preambles (`/tmp/...` style "let me analyze" stubs).
          const liveThink = step.thinking || '';
          if (liveThink) {
            const prevThink = thinkingByStep.get(i) || 0;
            if (liveThink.length > prevThink) {
              const thinkDelta = liveThink.slice(prevThink);
              thinkingByStep.set(i, liveThink.length);
              totalThinking += thinkDelta.length;
              lastGrowthAt = Date.now();
              const tchunk = { text: '', thinking: thinkDelta, isError: false };
              chunks.push(tchunk);
              onChunk?.(tchunk);
            }
          }

          // Text delta rule: prefer `responseText` (append-only stream) over
          // `modifiedText` (LS post-pass rewrite) while we're streaming. The
          // LS periodically swaps `response` → `modified_response` mid-turn
          // with slightly different wording; if we blindly `entry.text =
          // modifiedText || responseText` and take a length-based slice, the
          // rewritten middle bytes vanish because we already advanced the
          // cursor past them in an earlier poll. Using responseText keeps the
          // slice monotonic. At turn end we top up with `modifiedText` (see
          // below) so the final accumulated text is still the LS's polished
          // version when one exists.
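          // Illustrative splice hazard: poll N streams responseText
          // "The answer is 42" (cursor = 16); poll N+1 swaps in modifiedText
          // "The answer is forty-two.". Slicing that at 16 would append
          // "rty-two." after text that already ended in "42".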
          const liveText = step.responseText || step.text || '';
          if (!liveText) continue;
          const prev = yieldedByStep.get(i) || 0;
          if (liveText.length > prev) {
            const delta = liveText.slice(prev);
            yieldedByStep.set(i, liveText.length);
            totalYielded += delta.length;
            lastGrowthAt = Date.now();
            sawText = true;
            const chunk = { text: delta, thinking: '', isError: false };
            chunks.push(chunk);
            onChunk?.(chunk);
          }
        }

        // Check status
        const statusProto = buildGetTrajectoryRequest(cascadeId);
        const statusResp = await grpcUnary(
          this.port, this.csrfToken, `${LS_SERVICE}/GetCascadeTrajectory`, grpcFrame(statusProto)
        );
        const status = parseTrajectoryStatus(statusResp);
        lastStatus = status;

        if (status !== 1) sawActive = true;

        if (status === 1) { // IDLE
          // Don't allow idle-break during the warmup window unless we've
          // already seen the planner go non-IDLE at least once. Without this
          // guard, cascades whose trajectory hasn't kicked off yet (status
          // stuck at 1 for the first ~600ms) terminate after only 2 polls
          // and the client sees a near-empty reply.
          const elapsed = Date.now() - startTime;
          const graceOver = elapsed > IDLE_GRACE_MS;
          if (!sawActive && !graceOver) {
            continue; // still warming up — don't count this as idle
          }
          idleCount++;
          // Require at least a little text OR a long idle streak before
          // accepting "done", so we don't race the first visible chunk.
          const canBreak = sawText ? idleCount >= 2 : idleCount >= 4;
          if (canBreak) {
            // Final sweep
            const finalResp = await grpcUnary(
              this.port, this.csrfToken, `${LS_SERVICE}/GetCascadeTrajectorySteps`, grpcFrame(stepsProto)
            );
            const finalSteps = parseTrajectorySteps(finalResp);
            for (let i = 0; i < finalSteps.length; i++) {
              const step = finalSteps[i];
              const responseText = step.responseText || '';
              const modifiedText = step.modifiedText || '';
              const prev = yieldedByStep.get(i) || 0;

              // Normal top-up: responseText grew past what we streamed.
              if (responseText.length > prev) {
                const delta = responseText.slice(prev);
                yieldedByStep.set(i, responseText.length);
                totalYielded += delta.length;
                chunks.push({ text: delta, thinking: '', isError: false });
                onChunk?.({ text: delta, thinking: '', isError: false });
              }

              // Modified-response top-up: only if it's a strict extension of
              // what we already emitted. If modifiedText rewrites the prefix
              // (common when LS polishes), emitting the tail would splice
              // wrong content onto the stream, so we skip it and keep the
              // raw responseText we already showed.
              const cursor = yieldedByStep.get(i) || 0;
              if (modifiedText.length > cursor && modifiedText.startsWith(responseText)) {
                const delta = modifiedText.slice(cursor);
                yieldedByStep.set(i, modifiedText.length);
                totalYielded += delta.length;
                chunks.push({ text: delta, thinking: '', isError: false });
                onChunk?.({ text: delta, thinking: '', isError: false });
              }
            }
            endReason = sawText ? 'idle_done' : 'idle_empty';
            break;
          }
        } else {
          idleCount = 0;
        }
      }
      if (endReason === 'unknown') endReason = 'max_wait';

      // Structured summary so we can diagnose short/empty completions after
      // the fact. sawActive=false + sawText=false + idle_empty = the planner
      // never actually ran on this cascade — likely an upstream starvation.
      const summary = {
        cascadeId: cascadeId.slice(0, 8),
        reason: endReason,
        polls: pollCount,
        textLen: totalYielded,
        thinkingLen: totalThinking,
        stepCount: Math.max(yieldedByStep.size, thinkingByStep.size, lastStepCount),
        toolCalls: seenToolCallIds.size,
        sawActive,
        sawText,
        lastStatus,
        ms: Date.now() - startTime,
      };
      if (totalYielded < 20 && endReason !== 'aborted') {
        log.warn('Cascade short reply', summary);
      } else {
        log.info('Cascade done', summary);
      }

      onEnd?.(chunks);

      // ── Real token usage via GetCascadeTrajectoryGeneratorMetadata ──
      // CortexStepMetadata.model_usage (the per-step field) is usually empty
      // in the step trajectory response — the LS only populates the real
      // token counts in a separate RPC keyed off cascade_id. We fire this
      // once after the polling loop ends. Keep it non-fatal: a network blip
      // here just drops usage back to the chars/4 estimator, the response
      // itself is already formed.
      let serverUsage = null;
      try {
        const metaReq = buildGetGeneratorMetadataRequest(cascadeId, 0);
        const metaResp = await grpcUnary(
          this.port, this.csrfToken,
          `${LS_SERVICE}/GetCascadeTrajectoryGeneratorMetadata`,
          grpcFrame(metaReq), 5000
        );
        serverUsage = parseGeneratorMetadata(metaResp);
      } catch (e) {
        log.debug(`GetCascadeTrajectoryGeneratorMetadata failed: ${e.message}`);
      }
      // Fallback: if the generator metadata RPC didn't give us anything,
      // check the per-step metadata we collected during polling (some LS
      // versions do populate CortexStepMetadata.model_usage directly).
      if (!serverUsage && usageByStep.size > 0) {
        let inT = 0, outT = 0, cacheR = 0, cacheW = 0;
        for (const u of usageByStep.values()) {
          inT += u.inputTokens || 0;
          outT += u.outputTokens || 0;
          cacheR += u.cacheReadTokens || 0;
          cacheW += u.cacheWriteTokens || 0;
        }
        if (inT || outT || cacheR || cacheW) {
          serverUsage = {
            inputTokens: inT,
            outputTokens: outT,
            cacheReadTokens: cacheR,
            cacheWriteTokens: cacheW,
          };
        }
      }

      // Attach cascade metadata so the caller can check it back into the
      // conversation pool. We still return the array so existing callers
      // that iterate over it keep working.
      chunks.cascadeId = cascadeId;
      chunks.sessionId = sessionId;
      chunks.toolCalls = toolCalls;
      chunks.usage = serverUsage;
      if (serverUsage) {
        log.info(`Cascade usage: in=${serverUsage.inputTokens} out=${serverUsage.outputTokens} cache_r=${serverUsage.cacheReadTokens} cache_w=${serverUsage.cacheWriteTokens}`);
      }
      if (toolCalls.length) log.info(`Cascade tool calls: ${toolCalls.length}`, { names: toolCalls.map(t => t.name) });
      return chunks;

    } catch (err) {
      onError?.(err);
      throw err;
    }
  }

  // ─── Register user (JSON REST, unchanged) ────────────────

  async registerUser(firebaseToken) {
    return new Promise((resolve, reject) => {
      const postData = JSON.stringify({ firebase_id_token: firebaseToken });
      const req = https.request({
        hostname: 'api.codeium.com',
        port: 443,
        path: '/register_user/',
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'Content-Length': Buffer.byteLength(postData),
        },
      }, (res) => {
        let raw = '';
        res.on('data', d => raw += d);
        res.on('end', () => {
          // Check the HTTP status before parsing so non-JSON error bodies
          // still surface the status code instead of a generic parse error.
          if (res.statusCode >= 400) {
            reject(new Error(`RegisterUser failed (${res.statusCode}): ${raw}`));
            return;
          }
          try {
            const json = JSON.parse(raw);
            if (!json.api_key) {
              reject(new Error(`RegisterUser response missing api_key: ${raw}`));
              return;
            }
            resolve({ apiKey: json.api_key, name: json.name, apiServerUrl: json.api_server_url });
          } catch {
            reject(new Error(`RegisterUser parse error: ${raw}`));
          }
        });
        res.on('error', reject);
      });
      req.on('error', reject);
      req.write(postData);
      req.end();
    });
  }

  // ── GetUserStatus ────────────────────────────────────────
  //
  // One-shot RPC that returns the account's canonical tier + cascade
  // model allowlist + credit usage + trial end time. Replaces the
  // probe-based tier inference for accounts where this call succeeds.
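  //
  // Illustrative call (the result shape is whatever parseGetUserStatusResponse
  // in windsurf.js returns; the field names below are assumptions, not a
  // contract):
  //
  //   const status = await client.getUserStatus();
  //   log.info(`tier=${status.tier} models=${status.cascadeModels?.length}`);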
  async getUserStatus() {
    const proto = buildGetUserStatusRequest(this.apiKey);
    const resp = await grpcUnary(
      this.port, this.csrfToken,
      `${LS_SERVICE}/GetUserStatus`, grpcFrame(proto), 10000,
    );
    return parseGetUserStatusResponse(resp);
  }
}