import express from "express";
import { createServer } from "http";
import { WebSocketServer } from "ws";
import { Hocuspocus } from "@hocuspocus/server";
import { Database } from "@hocuspocus/extension-database";
import { readFileSync, writeFileSync, mkdirSync, existsSync } from "fs";
import { join } from "path";
import * as Y from "yjs";
import { extractToken, isOAuthEnabled } from "./auth.js";
import { isHfStorageEnabled, setUserToken, pullDocument } from "./hf-storage.js";
import { citationsRouter } from "./citations.js";
import { getDataDir, docPath, sanitizeName } from "./utils.js";
import { debouncedSave, ensurePublishedRestored } from "./persistence.js";
import { createAuthRouter, createRequireEditor } from "./routes/auth.js";
import { createChatRouter } from "./routes/chat.js";
import { createPublishRouter } from "./routes/publish.js";
import { createUploadRouter } from "./routes/upload.js";
import { createDatasetProxyRouter } from "./routes/dataset-proxy.js";
import { createStorageRouter } from "./routes/storage.js";

export { debouncedSave, resetSaveTimers, resetPublishedRestored } from "./persistence.js";

/** Name of the single document whose published output is served at the site root. */
const DEFAULT_DOC_NAME = "default";

/**
 * Full inline SVG of the official Hugging Face brand logo (smiling face
 * with both hugging hands). Drawn from
 * https://huggingface.co/front/assets/huggingface_logo-noborder.svg
 * so the page has zero network dependencies.
 *
 * NOTE(review): the SVG markup is empty in this copy of the file; restore
 * it from the URL above. The login page already interpolates this constant.
 */
const HF_LOGO_SVG = ``;

interface LoginPageOptions {
  title: string;
  description: string;
  /**
   * Optional callout block shown between the description and the
   * sign-in button. Used to surface "you're authenticated but you
   * didn't grant access to the organization" hints. HTML must
   * already be escaped by the caller.
   */
  noteHtml?: string;
  /** Button label override (default: "Sign in with Hugging Face"). */
  buttonLabel?: string;
  /** Button href override (default: "/oauth/authorize"). */
  buttonHref?: string;
}

/**
 * Render the standalone sign-in page as an HTML string.
 *
 * `title`, `description`, `buttonLabel` and `buttonHref` are treated as
 * plain text and HTML-escaped here; `noteHtml` is inserted verbatim, so
 * callers must escape any dynamic values they put into it.
 *
 * NOTE(review): the page markup below is a minimal reconstruction that
 * emits every escaped value; the original layout/styling was not
 * recoverable and should be restored if a richer design is expected.
 *
 * @returns A complete HTML document.
 */
function renderLoginPage({
  title,
  description,
  noteHtml,
  buttonLabel,
  buttonHref,
}: LoginPageOptions): string {
  const safeTitle = escapeHtml(title);
  const safeDescription = escapeHtml(description);
  // `||` (not `??`) on purpose: an empty override also falls back to the default.
  const safeButtonLabel = escapeHtml(buttonLabel || "Sign in with Hugging Face");
  const safeButtonHref = escapeHtml(buttonHref || "/oauth/authorize");
  return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>${safeTitle} - Research Article Template Editor</title>
</head>
<body>
<main>
${HF_LOGO_SVG}
<h1>${safeTitle}</h1>
<p>${safeDescription}</p>
${noteHtml ? `<div class="note">${noteHtml}</div>` : ""}
<a href="${safeButtonHref}">${safeButtonLabel}</a>
</main>
</body>
</html>
`;
}

/**
 * Escape the five HTML-significant characters so `value` can be placed
 * safely in element content or a quoted attribute value.
 *
 * `&` is replaced first so the entities produced by the later
 * replacements are not escaped a second time.
 */
function escapeHtml(value: string): string {
  return value
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/** Send the generic "not yet published" sign-in page with HTTP 200. */
function sendLoginPage(res: express.Response) {
  res.status(200).send(
    renderLoginPage({
      title: "This article is not yet published",
      description: "Sign in with your Hugging Face account to access the editor.",
    }),
  );
}

/**
 * Pick the right "you can't edit" message for the /editor login page
 * based on what we know about the requester:
 *
 * - No user at all (no cookie / invalid token): plain sign-in prompt.
 * - User authenticated, Space owned by an org, but the org isn't in
 *   the OAuth grant scope: explain that they need to tick the org
 *   checkbox on HF's consent screen, and link straight to a
 *   force-consent login URL.
 * - User authenticated, org grant is fine, but they're not in the
 *   write/admin members of the org (or it's a personal Space they
 *   don't own): tell them they don't have permission - no point
 *   re-prompting the consent screen, an admin needs to add them.
 *
 * `description` is returned as plain text because `renderLoginPage`
 * escapes it; only `noteHtml`, which is inserted verbatim, carries
 * pre-escaped values. Escaping both would double-encode the output.
 */
function buildEditorLoginPage(
  user:
    | { name: string; accessIssue?: "no-org-grant" | "not-member"; spaceOrg?: string }
    | null,
): LoginPageOptions {
  if (!user) {
    return {
      title: "Editor access",
      description: "Sign in with your Hugging Face account to start editing.",
    };
  }

  const handle = user.name;
  const org = user.spaceOrg || "";
  const safeOrg = escapeHtml(org);

  if (user.accessIssue === "no-org-grant" && org) {
    return {
      title: "Permission needed",
      description: `You're signed in as @${handle}, but this app can't see your membership of the ${org} organization.`,
      noteHtml: [
        `To unlock editing, re-authorize the app and tick the ${safeOrg} organization on the consent screen:`,
        `<ol>`,
        `<li>Click Authorize again below.</li>`,
        `<li>On the Hugging Face consent screen, scroll to Organizations.</li>`,
        `<li>Toggle access for ${safeOrg} on, then confirm.</li>`,
        `</ol>`,
      ].join(""),
      buttonLabel: "Authorize again",
      buttonHref: "/oauth/authorize?prompt=consent",
    };
  }

  return {
    title: "No write access",
    description:
      `You're signed in as @${handle}, but your account doesn't have write access to this Space` +
      (org ? ` (owned by ${org}).` : "."),
    noteHtml:
      `Ask an admin of ${safeOrg || "the Space"} to add you as a write member. ` +
      `If you have multiple Hugging Face accounts, you can also sign out and try again with the right one.`,
    buttonLabel: "Sign in with another account",
    buttonHref: "/oauth/authorize?prompt=consent",
  };
}

/**
 * Build the Express app, its HTTP server, the Hocuspocus collaboration
 * server and the WebSocket server that feeds it.
 *
 * Side effects: creates the data directory returned by `getDataDir()`
 * and registers an `upgrade` listener on the HTTP server. The server is
 * not started here; the caller decides when to listen.
 *
 * The page routes (`/llms.txt`, `/robots.txt`, `/editor` and the `*`
 * catch-all) are only registered when the frontend build directory exists.
 *
 * @returns `{ app, httpServer, hocuspocus, wss }`.
 */
export function createApp() {
  const DATA_DIR = getDataDir();
  mkdirSync(DATA_DIR, { recursive: true });
  const oauthEnabled = isOAuthEnabled();

  // ---------- Hocuspocus (Y.js collaboration server) ----------
  const hocuspocus = new Hocuspocus({
    async onAuthenticate({ token, context }: { token: string; context: any }) {
      if (!oauthEnabled) return;
      const { resolveUser } = await import("./auth.js");

      // Two-source pattern: the HocuspocusProvider client sends `token`
      // via the WS sub-protocol, but our cookie is httpOnly so the
      // client can't read it and sends "" instead. Fall back to the
      // cookie the upgrade handler stuffed into context.token.
      const authToken = token || context?.token;

      // Surface enough info in the Space logs to triage "Disconnected"
      // reports without leaking the token itself. We log:
      //   - whether each source produced a token (just truthiness)
      //   - the resolved user (or null) + accessIssue when present
      //   - the SPACE_ID owner being checked, to spot org mismatches
      const tokenSource = token ? "client" : context?.token ? "cookie" : "none";
      const tokenLen = authToken ? authToken.length : 0;
      const user = await resolveUser(authToken);
      const spaceOwner = (process.env.SPACE_ID || "").split("/")[0] || "(none)";

      if (!user) {
        console.warn(
          `[ws-auth] reject: no user resolved` +
            ` source=${tokenSource} tokenLen=${tokenLen}` +
            ` spaceOwner=${spaceOwner}`,
        );
        throw new Error("Unauthorized: invalid or missing HF token");
      }

      if (!user.canEdit) {
        console.warn(
          `[ws-auth] reject: ${user.name} can't write to ${spaceOwner}` +
            ` issue=${user.accessIssue ?? "unknown"}` +
            ` source=${tokenSource}`,
        );
        throw new Error(
          `Unauthorized: ${user.name} has no write access to ${spaceOwner}` +
            (user.accessIssue ? ` (${user.accessIssue})` : ""),
        );
      }

      console.log(
        `[ws-auth] accept user=${user.name} spaceOwner=${spaceOwner}` +
          ` source=${tokenSource}`,
      );
      if (authToken) setUserToken(authToken);
      return { user };
    },
    extensions: [
      new Database({
        // Load order: local file first, then (if enabled) the HF copy,
        // which is also cached to disk. Any failure is logged and treated
        // as "no stored document".
        fetch: async ({ documentName }: { documentName: string }) => {
          try {
            const p = docPath(documentName);
            if (existsSync(p)) {
              const buf = readFileSync(p);
              console.log(`[persist] fetch "${documentName}" from disk: ${buf.length} bytes`);
              return buf;
            }
            console.log(`[persist] fetch "${documentName}": no file on disk`);
            if (isHfStorageEnabled()) {
              const data = await pullDocument(documentName);
              if (data) {
                writeFileSync(p, data);
                console.log(`[persist] pulled ${documentName} from HF`);
                return Buffer.from(data);
              }
            }
          } catch (err) {
            console.error(`[persist] fetch "${documentName}" failed:`, (err as Error).message);
          }
          return null;
        },
        // Intentionally a no-op: saving is driven by `onChange` below
        // through `debouncedSave`.
        store: async () => {},
      }),
      {
        async onChange({ documentName, document }: { documentName: string; document: any }) {
          console.log(`[persist] onChange "${documentName}"`);
          debouncedSave(documentName, document);
        },
        async afterLoadDocument({ documentName }: { documentName: string }) {
          console.log(`[persist] loaded "${documentName}"`);
        },
      } as any,
    ],
  });

  // ---------- Express app ----------
  const app = express();
  const httpServer = createServer(app);
  app.use(express.json({ limit: "1mb" }));

  const authCtx = { oauthEnabled };
  const requireEditor = createRequireEditor(authCtx);

  app.use(createAuthRouter(authCtx));
  app.use(createChatRouter(requireEditor));
  app.use("/api/citations", citationsRouter);
  app.use(createPublishRouter({ oauthEnabled, hocuspocus }));
  app.use(createUploadRouter());
  app.use(createStorageRouter({ oauthEnabled }));

  // Reverse proxy for private-dataset assets. Mounted before any
  // static serving so `/d/*` always wins, never falls through to a
  // 404 from express.static.
  app.use(createDatasetProxyRouter());

  // ---------- Collab WebSocket ----------
  const wss = new WebSocketServer({ noServer: true });

  httpServer.on("upgrade", (req, socket, head) => {
    const url = req.url || "";
    const isCollab =
      url === "/collab" || url.startsWith("/collab/") || url.startsWith("/collab?");

    if (!isCollab) {
      console.log(`[ws] rejected upgrade for ${url}`);
      socket.destroy();
      return;
    }

    console.log(`[ws] upgrade request for ${url}`);
    wss.handleUpgrade(req, socket, head, (ws) => {
      ws.setMaxListeners(Infinity);
      ws.on("error", (error) => {
        console.error("[ws] socket error:", error.message);
      });

      if (process.env.NODE_ENV !== "production") {
        ws.on("message", (data: Buffer, isBinary: boolean) => {
          const buf = Buffer.isBuffer(data) ? data : Buffer.from(data as any);
          console.log(
            `[ws-debug] msg ${buf.length}B binary=${isBinary} first20=${buf.subarray(0, 20).toString("hex")}`,
          );
        });
      }

      const token = extractToken(req.headers.cookie);

      // Diagnostic for "Disconnected" reports: confirms whether the
      // browser actually attached our session cookie to the WS
      // upgrade. On HF Spaces, some gating setups occasionally
      // strip cookies on WS upgrades even when they're sent for
      // plain HTTP, which manifests as a working /editor route
      // but a permanently-failing WS auth.
      const cookieHeader = req.headers.cookie || "";
      console.log(
        `[ws] upgrade cookies=${cookieHeader.length}B hasToken=${Boolean(token)}`,
      );

      hocuspocus.handleConnection(ws, req, { token });
    });
  });

  // ---------- Static assets ----------
  app.use("/uploads", express.static(join(DATA_DIR, "uploads")));
  app.use("/published", express.static(join(DATA_DIR, "published")));

  const staticDir =
    process.env.NODE_ENV === "production"
      ? join(DATA_DIR, "..", "frontend-dist")
      : join(DATA_DIR, "..", "..", "frontend", "dist");

  function getPublishedPath(docName: string): string {
    return join(DATA_DIR, "published", docName, "index.html");
  }

  function getPublishedAssetPath(docName: string, filename: string): string {
    return join(DATA_DIR, "published", docName, filename);
  }

  if (existsSync(staticDir)) {
    app.use(express.static(staticDir, { index: false }));

    // ---- LLM-friendly endpoints --------------------------------------
    // The publisher generates a Markdown twin of the article (`llms.txt`)
    // following the https://llmstxt.org/ convention. We expose it at the
    // Space root so external agents/crawlers (Claude, Perplexity, ...) can
    // consume the article without having to parse the heavy HTML page.
    // `robots.txt` advertises this endpoint.
    app.get("/llms.txt", (_req, res) => {
      const llmsPath = getPublishedAssetPath(DEFAULT_DOC_NAME, "llms.txt");
      if (!existsSync(llmsPath)) {
        res.status(404).type("text/plain").send("Not yet published");
        return;
      }
      res.type("text/markdown; charset=utf-8");
      res.sendFile(llmsPath);
    });

    app.get("/robots.txt", (_req, res) => {
      res
        .type("text/plain; charset=utf-8")
        .send(
          [
            "User-agent: *",
            "Allow: /",
            "",
            "LLMs-Txt: /llms.txt",
            "",
          ].join("\n"),
        );
    });

    app.get("/editor", async (req, res, next) => {
      // Forward rejections to Express: an async handler that throws would
      // otherwise leave the request hanging on Express 4.
      try {
        if (oauthEnabled) {
          const { resolveUser } = await import("./auth.js");
          const token = extractToken(req.headers.cookie);
          const user = await resolveUser(token);
          if (!user || !user.canEdit) {
            res.status(200).send(renderLoginPage(buildEditorLoginPage(user)));
            return;
          }
        }
        res.sendFile(join(staticDir, "index.html"));
      } catch (err) {
        next(err);
      }
    });

    app.get("*", async (req, res, next) => {
      try {
        if (!oauthEnabled) {
          res.sendFile(join(staticDir, "index.html"));
          return;
        }

        const visitorToken = extractToken(req.headers.cookie) ?? undefined;
        await ensurePublishedRestored(visitorToken);

        const publishedPath = getPublishedPath(DEFAULT_DOC_NAME);
        if (existsSync(publishedPath)) {
          res.sendFile(publishedPath);
          return;
        }
        sendLoginPage(res);
      } catch (err) {
        next(err);
      }
    });
  }

  return { app, httpServer, hocuspocus, wss };
}