File size: 6,314 Bytes
f6678ab e9c2c73 6b6afea f6678ab 6b6afea f6678ab 3afbbdf f6678ab 6b6afea f6678ab e9c2c73 f6678ab 6b6afea f6678ab 6b6afea e9c2c73 f6678ab 6b6afea f6678ab df28ae4 f6678ab df28ae4 f6678ab | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 | import { streamText, convertToModelMessages } from "ai";
import type { UIMessage } from "ai";
import { createOpenAICompatible } from "@ai-sdk/openai-compatible";
import type { Request, Response } from "express";
import { extractToken } from "../auth.js";
/**
* `openai/gpt-oss-120b` is HF's trending tool-calling model with a
* strong reputation on the editor's 18-tool agent loop. The `:fastest`
* policy routes it to Cerebras, which we've validated end-to-end
* (multi-turn, with the reasoning-strip below).
*/
export const DEFAULT_MODEL = "openai/gpt-oss-120b";
/**
* Hugging Face Inference Providers exposes an OpenAI-compatible chat
* completions endpoint at `https://router.huggingface.co/v1` that routes
* to a fleet of providers (Cerebras, Together, Fireworks, ...). The
* upside: any HF user token with the `inference-api` scope can call it,
* so a forked Space gets AI features for free as soon as the user logs
* in - no extra API key to wire up.
*
* See https://huggingface.co/docs/inference-providers
*/
const HF_INFERENCE_BASE_URL = "https://router.huggingface.co/v1";
/**
* Resolve the HF token used to authenticate inference calls.
*
* Priority:
* 1. The currently logged-in editor's OAuth token (forwarded from the
* `hf_access_token` cookie). This is the production path on a HF
* Space - no environment secret needed.
* 2. The `HF_TOKEN` env var fallback. Useful for local dev when OAuth
* isn't configured, or as a server-side default when the OAuth
* scope doesn't include `inference-api` yet.
*/
function resolveHfToken(req: Request): string | undefined {
const userToken = extractToken(req.headers.cookie);
if (userToken) return userToken;
const envToken = process.env.HF_TOKEN;
if (envToken) return envToken;
return undefined;
}
function createProvider(apiKey: string) {
return createOpenAICompatible({
name: "huggingface",
baseURL: HF_INFERENCE_BASE_URL,
apiKey,
});
}
/**
* Drop `reasoning` parts from past assistant messages.
*
* Reasoning models (gpt-oss, DeepSeek R1, ...) emit a `reasoning` part
* alongside their text. The AI SDK persists those parts in the UI
* message history, and on the next turn they round-trip back to the
* provider as `reasoning_content` on the assistant message.
*
* Some HF Inference Providers (notably Cerebras for gpt-oss) reject
* that field with a 400 `wrong_api_format`. Since the model doesn't
* need to see its own prior reasoning to keep the conversation going,
* the safest fix is to strip those parts before conversion.
*/
function stripReasoningParts(messages: UIMessage[]): UIMessage[] {
return messages.map((m) => {
if (m.role !== "assistant" || !Array.isArray(m.parts)) return m;
const filtered = m.parts.filter(
(p) => typeof p.type !== "string" || p.type !== "reasoning",
);
if (filtered.length === m.parts.length) return m;
return { ...m, parts: filtered };
});
}
interface StreamChatOptions {
systemPrompt: string;
tools: Parameters<typeof streamText>[0]["tools"];
logPrefix: string;
}
export async function streamChatResponse(
req: Request,
res: Response,
{ systemPrompt, tools, logPrefix }: StreamChatOptions,
) {
try {
const { messages, model } = req.body;
if (!messages || !Array.isArray(messages)) {
res.status(400).json({ error: "messages array is required" });
return;
}
const apiKey = resolveHfToken(req);
if (!apiKey) {
res.status(500).json({
error:
"No Hugging Face token available. Sign in with your HF account " +
"(the OAuth token is used to call Inference Providers) or set " +
"HF_TOKEN in the backend environment.",
});
return;
}
const provider = createProvider(apiKey);
const modelId = model || process.env.HF_INFERENCE_MODEL || DEFAULT_MODEL;
const modelMessages = await convertToModelMessages(stripReasoningParts(messages));
const result = streamText({
model: provider.chatModel(modelId),
system: systemPrompt,
messages: modelMessages,
tools,
});
const webResponse = result.toUIMessageStreamResponse({
onError: (error) => {
console.error(`[${logPrefix}] stream error:`, error);
return error instanceof Error ? error.message : "Stream error";
},
});
// ----- Disable response buffering --------------------------------------
// The AI SDK already streams tokens chunk-by-chunk, but a few layers
// between Node and the browser can re-buffer them into one big payload
// that arrives all at once, killing the "typing" effect the user
// expects:
// - reverse proxies (Nginx on a HF Space) buffer responses unless
// `X-Accel-Buffering: no` is set;
// - some CDNs / loadbalancers gzip the body on the fly, which means
// they hold chunks until the compression window fills up;
// - Node's own socket keeps small writes in TCP buffers until they
// reach the MTU, which is fine on localhost but visible over a
// real network.
// The headers below opt out of all of those. We also flush response
// headers eagerly so the browser commits to streaming mode and starts
// rendering as soon as the first byte arrives.
const headers: Record<string, string> = Object.fromEntries(
webResponse.headers.entries(),
);
headers["X-Accel-Buffering"] = "no";
headers["Cache-Control"] = "no-cache, no-transform";
headers["Content-Encoding"] = "identity";
res.writeHead(webResponse.status, headers);
res.flushHeaders?.();
// Disable Nagle so small chunks ship immediately instead of waiting
// to coalesce into a packet.
res.socket?.setNoDelay?.(true);
const reader = webResponse.body!.getReader();
const pump = async (): Promise<void> => {
const { done, value } = await reader.read();
if (done) {
res.end();
return;
}
res.write(value);
return pump();
};
await pump();
} catch (error: unknown) {
const message =
error instanceof Error ? error.message : "Internal server error";
console.error(`[${logPrefix}] error:`, message);
if (!res.headersSent) {
res.status(500).json({ error: message });
}
}
}
|