File size: 6,314 Bytes
f6678ab
e9c2c73
6b6afea
f6678ab
6b6afea
f6678ab
3afbbdf
 
 
 
 
 
 
f6678ab
6b6afea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f6678ab
 
e9c2c73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f6678ab
 
 
 
 
 
 
 
 
 
 
 
6b6afea
f6678ab
 
 
 
 
 
6b6afea
 
 
 
 
 
 
 
 
 
 
 
 
e9c2c73
f6678ab
 
6b6afea
f6678ab
 
 
 
 
 
 
 
 
 
 
 
df28ae4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f6678ab
df28ae4
 
 
 
 
 
 
 
 
f6678ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import { streamText, convertToModelMessages } from "ai";
import type { UIMessage } from "ai";
import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
import type { Request, Response } from "express";
import { extractToken } from "../auth.js";

/**
 * Model id used when neither the request body nor the
 * `HF_INFERENCE_MODEL` environment variable names one.
 *
 * `openai/gpt-oss-120b` is HF's trending tool-calling model with a
 * strong reputation on the editor's 18-tool agent loop. The `:fastest`
 * policy routes it to Cerebras, which we've validated end-to-end
 * (multi-turn, with the reasoning-strip below).
 *
 * NOTE(review): the literal below carries no `:fastest` suffix, so the
 * routing described above presumably relies on the router's default
 * policy - confirm against the Inference Providers documentation.
 */
export const DEFAULT_MODEL = "openai/gpt-oss-120b";

/**
 * Base URL handed to the OpenAI-compatible provider in `createProvider`.
 *
 * Hugging Face Inference Providers exposes an OpenAI-compatible chat
 * completions endpoint at `https://router.huggingface.co/v1` that routes
 * to a fleet of providers (Cerebras, Together, Fireworks, ...). The
 * upside: any HF user token with the `inference-api` scope can call it,
 * so a forked Space gets AI features for free as soon as the user logs
 * in - no extra API key to wire up.
 *
 * See https://huggingface.co/docs/inference-providers
 */
const HF_INFERENCE_BASE_URL = "https://router.huggingface.co/v1";

/**
 * Pick the Hugging Face token that will authenticate the inference call.
 *
 * Lookup order:
 *   1. The token `extractToken` finds in the request's `Cookie` header -
 *      i.e. the logged-in editor's OAuth token (`hf_access_token`
 *      cookie). This is the production path on a HF Space and needs no
 *      environment secret.
 *   2. The `HF_TOKEN` environment variable - handy for local dev without
 *      OAuth, or as a server-side default while the OAuth scope doesn't
 *      include `inference-api` yet.
 *
 * @param req - Incoming Express request; only its `cookie` header is read.
 * @returns The first non-empty token found, or `undefined` when neither
 *   source provides one.
 */
function resolveHfToken(req: Request): string | undefined {
  // `||` (not `??`) on purpose: an empty string counts as "no token".
  return extractToken(req.headers.cookie) || process.env.HF_TOKEN || undefined;
}

/**
 * Build an OpenAI-compatible provider, registered under the name
 * "huggingface", that targets `HF_INFERENCE_BASE_URL` and authenticates
 * with the given key.
 *
 * @param apiKey - Hugging Face token to send with every request.
 */
function createProvider(apiKey: string) {
  const settings = {
    name: "huggingface",
    baseURL: HF_INFERENCE_BASE_URL,
    apiKey,
  };
  return createOpenAICompatible(settings);
}

/**
 * Remove `reasoning` parts from assistant messages in the history.
 *
 * Reasoning models (gpt-oss, DeepSeek R1, ...) emit a `reasoning` part
 * next to their text. The AI SDK keeps those parts in the UI message
 * history, so on the following turn they are sent back to the provider
 * as `reasoning_content` on the assistant message.
 *
 * Some HF Inference Providers (notably Cerebras for gpt-oss) answer that
 * field with a 400 `wrong_api_format`. The model does not need its own
 * earlier reasoning to continue the conversation, so the parts are
 * dropped before the messages are converted.
 *
 * @param messages - UI message history as received from the client.
 * @returns A new array. Messages that are not from the assistant, have
 *   no `parts` array, or contain no reasoning part are returned by
 *   reference; the others are shallow copies with a filtered `parts`.
 *   The input objects are never mutated.
 */
function stripReasoningParts(messages: UIMessage[]): UIMessage[] {
  const isReasoning = (part: { type?: unknown }): boolean =>
    part.type === "reasoning";

  return messages.map((message) => {
    const { role, parts } = message;
    const needsCleanup =
      role === "assistant" && Array.isArray(parts) && parts.some(isReasoning);

    // Untouched messages keep their identity so callers can compare by reference.
    if (!needsCleanup) return message;

    return { ...message, parts: parts.filter((part) => !isReasoning(part)) };
  });
}

/** Per-endpoint settings supplied by the caller of `streamChatResponse`. */
interface StreamChatOptions {
  /** Passed to `streamText` as the `system` prompt. */
  systemPrompt: string;
  /** Tool set forwarded unchanged to `streamText`. */
  tools: Parameters<typeof streamText>[0]["tools"];
  /** Tag printed in square brackets at the start of each `console.error` line. */
  logPrefix: string;
}

/**
 * Stream a chat completion from Hugging Face Inference Providers to the
 * client as an AI SDK UI message stream.
 *
 * Reads `messages` (required array) and `model` (optional string) from
 * `req.body`. The model id is resolved as: non-empty string `model` from
 * the body, else `HF_INFERENCE_MODEL`, else `DEFAULT_MODEL`.
 *
 * Outcomes:
 *   - 400 JSON when the body is missing or `messages` is not an array;
 *   - 500 JSON when no HF token can be resolved, or when anything throws
 *     before response headers have been sent;
 *   - otherwise the status, headers and body of the response built by
 *     `toUIMessageStreamResponse`, with anti-buffering headers added.
 *
 * Errors are caught and logged with `logPrefix`. If one happens after
 * streaming has started, the response is ended so the client is not left
 * waiting on an open connection.
 *
 * @param req - Express request carrying the chat payload and auth cookie.
 * @param res - Express response the stream is written to.
 * @param options - System prompt, tool set and log prefix for this endpoint.
 */
export async function streamChatResponse(
  req: Request,
  res: Response,
  { systemPrompt, tools, logPrefix }: StreamChatOptions,
) {
  try {
    // `req.body` is undefined when no body parser handled the request;
    // treat that like an empty payload so the caller gets the 400 below
    // instead of a destructuring TypeError surfacing as a 500.
    const { messages, model } = req.body ?? {};

    if (!messages || !Array.isArray(messages)) {
      res.status(400).json({ error: "messages array is required" });
      return;
    }

    const apiKey = resolveHfToken(req);
    if (!apiKey) {
      res.status(500).json({
        error:
          "No Hugging Face token available. Sign in with your HF account " +
          "(the OAuth token is used to call Inference Providers) or set " +
          "HF_TOKEN in the backend environment.",
      });
      return;
    }

    const provider = createProvider(apiKey);
    // Only a non-empty string from the client is accepted as a model id;
    // anything else falls back to the server-side defaults.
    const requestedModel =
      typeof model === "string" && model.length > 0 ? model : undefined;
    const modelId =
      requestedModel || process.env.HF_INFERENCE_MODEL || DEFAULT_MODEL;
    const modelMessages = await convertToModelMessages(
      stripReasoningParts(messages),
    );

    const result = streamText({
      model: provider.chatModel(modelId),
      system: systemPrompt,
      messages: modelMessages,
      tools,
    });

    const webResponse = result.toUIMessageStreamResponse({
      onError: (error) => {
        console.error(`[${logPrefix}] stream error:`, error);
        return error instanceof Error ? error.message : "Stream error";
      },
    });

    // ----- Disable response buffering --------------------------------------
    // The AI SDK already streams tokens chunk-by-chunk, but a few layers
    // between Node and the browser can re-buffer them into one big payload
    // that arrives all at once, killing the "typing" effect the user
    // expects:
    //   - reverse proxies (Nginx on a HF Space) buffer responses unless
    //     `X-Accel-Buffering: no` is set;
    //   - some CDNs / load balancers gzip the body on the fly, which means
    //     they hold chunks until the compression window fills up;
    //   - Node's own socket keeps small writes in TCP buffers until they
    //     reach the MTU, which is fine on localhost but visible over a
    //     real network.
    // The headers below opt out of all of those. We also flush response
    // headers eagerly so the browser commits to streaming mode and starts
    // rendering as soon as the first byte arrives.
    //
    // `Headers` iteration yields lowercase names, so the overrides use
    // lowercase keys too: a mixed-case key would sit next to the lowercase
    // one instead of replacing it.
    const headers: Record<string, string> = Object.fromEntries(
      webResponse.headers.entries(),
    );
    headers["x-accel-buffering"] = "no";
    headers["cache-control"] = "no-cache, no-transform";
    headers["content-encoding"] = "identity";
    res.writeHead(webResponse.status, headers);
    res.flushHeaders?.();
    // Disable Nagle so small chunks ship immediately instead of waiting
    // to coalesce into a packet.
    res.socket?.setNoDelay?.(true);

    const body = webResponse.body;
    if (!body) {
      // Nothing to stream; finish the response rather than dereferencing null.
      res.end();
      return;
    }

    const reader = body.getReader();
    let clientGone = false;
    // If the client disconnects, cancel the upstream stream so the model
    // stops generating; the pending `read()` then resolves with `done`.
    const onClientClose = (): void => {
      clientGone = true;
      reader.cancel().catch(() => undefined);
    };
    res.once("close", onClientClose);

    try {
      for (;;) {
        const { done, value } = await reader.read();
        if (done || clientGone) break;

        // `write` returns false once Node's buffer is full. Wait for it to
        // drain (or for the connection to close) before reading more, so a
        // slow client cannot make the server buffer the whole completion.
        if (!res.write(value)) {
          await new Promise<void>((resolve) => {
            const resume = (): void => {
              res.off("drain", resume);
              res.off("close", resume);
              resolve();
            };
            res.once("drain", resume);
            res.once("close", resume);
          });
        }
      }
    } finally {
      res.off("close", onClientClose);
    }

    if (!clientGone) {
      res.end();
    }
  } catch (error: unknown) {
    const message =
      error instanceof Error ? error.message : "Internal server error";
    console.error(`[${logPrefix}] error:`, message);

    if (!res.headersSent) {
      res.status(500).json({ error: message });
    } else if (!res.writableEnded) {
      // Headers are already out, so no JSON error can be sent. End the
      // response so the client does not hang on a half-finished stream.
      res.end();
    }
  }
}