fix(highlight): auto-detect language for code blocks without explicit lang
Browse files

Code blocks authored without a language rendered as plain text in both the
editor and the published output (Shiki falls back to "text"). Add a shared
lowlight-based detector and use it as a fallback in the editor decorations
and the publish transformer, so language-less blocks get highlighted
identically everywhere - including in already-written articles.
Co-authored-by: Cursor <cursoragent@cursor.com>
backend/src/publisher/transformers/highlight-code.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import type { Element as HastElement, ElementContent, Root as HastRoot } from "hast";
|
| 2 |
import { getSharedHighlighter, isSupportedLang, normalizeLang, SHIKI_THEMES } from "../../shared/shiki-config.js";
|
|
|
|
| 3 |
import type { Transformer } from "./types.js";
|
| 4 |
|
| 5 |
/**
|
|
@@ -127,11 +128,15 @@ export const highlightCodeTransformer: Transformer = {
|
|
| 127 |
if (!pre || pre.tagName.toLowerCase() !== "pre") continue;
|
| 128 |
if (pre.classList.contains("mermaid")) continue;
|
| 129 |
|
| 130 |
-
const rawLang = extractLang(codeEl as unknown as Element);
|
| 131 |
-
const lang = normalizeLang(rawLang);
|
| 132 |
const source = codeEl.textContent || "";
|
| 133 |
if (!source) continue;
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
let hast: HastRoot;
|
| 136 |
try {
|
| 137 |
hast = highlighter.codeToHast(source, {
|
|
|
|
| 1 |
import type { Element as HastElement, ElementContent, Root as HastRoot } from "hast";
|
| 2 |
import { getSharedHighlighter, isSupportedLang, normalizeLang, SHIKI_THEMES } from "../../shared/shiki-config.js";
|
| 3 |
+
import { detectShikiLang } from "../../shared/detect-lang.js";
|
| 4 |
import type { Transformer } from "./types.js";
|
| 5 |
|
| 6 |
/**
|
|
|
|
| 128 |
if (!pre || pre.tagName.toLowerCase() !== "pre") continue;
|
| 129 |
if (pre.classList.contains("mermaid")) continue;
|
| 130 |
|
|
|
|
|
|
|
| 131 |
const source = codeEl.textContent || "";
|
| 132 |
if (!source) continue;
|
| 133 |
|
| 134 |
+
// Fall back to auto-detection when the block has no explicit language,
|
| 135 |
+
// so language-less blocks (the common case in authored docs) still get
|
| 136 |
+
// highlighted. Same logic runs in the editor for an identical result.
|
| 137 |
+
const rawLang = extractLang(codeEl as unknown as Element);
|
| 138 |
+
const lang = normalizeLang(rawLang) || detectShikiLang(source);
|
| 139 |
+
|
| 140 |
let hast: HastRoot;
|
| 141 |
try {
|
| 142 |
hast = highlighter.codeToHast(source, {
|
backend/src/shared/detect-lang.ts
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Shared code-language auto-detection.
|
| 3 |
+
*
|
| 4 |
+
* Code blocks authored without an explicit language render as plain text in
|
| 5 |
+
* both the editor (PM decorations) and the publisher (static SSR), because
|
| 6 |
+
* Shiki falls back to `"text"` when no language is set. This module fills that
|
| 7 |
+
* gap: given the block's source, it guesses the language with highlight.js
|
| 8 |
+
* (via lowlight, already a dependency) and maps the guess onto a language
|
| 9 |
+
* Shiki actually bundles.
|
| 10 |
+
*
|
| 11 |
+
* Used by BOTH `code-block-shiki.tsx` (editor) and `highlight-code.ts`
|
| 12 |
+
* (publisher) so a language-less block is highlighted identically everywhere,
|
| 13 |
+
* including in articles that were already written.
|
| 14 |
+
*
|
| 15 |
+
* Detection results are cached by source text: the editor rebuilds its
|
| 16 |
+
* decoration set on every doc change, and we don't want to re-run the
|
| 17 |
+
* (relatively expensive) auto-detector for blocks whose content is unchanged.
|
| 18 |
+
*/
|
| 19 |
+
import { createLowlight, common } from "lowlight";
|
| 20 |
+
import { isSupportedLang, normalizeLang } from "./shiki-config.js";
|
| 21 |
+
|
| 22 |
+
const lowlight = createLowlight(common);
|
| 23 |
+
|
| 24 |
+
/** Don't auto-detect on trivially short snippets - too little signal. */
|
| 25 |
+
const MIN_LENGTH = 3;
|
| 26 |
+
/**
|
| 27 |
+
* highlight.js relevance is roughly proportional to how many language-specific
|
| 28 |
+
* constructs matched. A small floor avoids tagging arbitrary prose as code
|
| 29 |
+
* while still catching short-but-clear snippets (e.g. a couple of imports).
|
| 30 |
+
*/
|
| 31 |
+
const MIN_RELEVANCE = 2;
|
| 32 |
+
|
| 33 |
+
const cache = new Map<string, string>();
|
| 34 |
+
const CACHE_MAX = 200;
|
| 35 |
+
|
| 36 |
+
/**
 * Best-effort detection of the Shiki language for a code block that has no
 * explicit language.
 *
 * The source is run through lowlight's `highlightAuto`; the guess is accepted
 * only when its relevance is at least `MIN_RELEVANCE` and, after
 * `normalizeLang`, `isSupportedLang` reports it as supported.
 *
 * Results (including the inconclusive `""`) are memoised by exact source text
 * in the module-level `cache`, which is bounded to `CACHE_MAX` entries with
 * least-recently-used eviction.
 *
 * @param code - Raw source of the code block; `null`/`undefined` are treated
 *   as the empty string.
 * @returns A supported Shiki language name, or `""` when the input is shorter
 *   than `MIN_LENGTH` after trimming, detection throws, the relevance is too
 *   low, or the detected language is not supported.
 */
export function detectShikiLang(code: string | null | undefined): string {
  const text = code ?? "";
  // Too little signal to guess from; skip the detector and the cache entirely.
  if (text.trim().length < MIN_LENGTH) return "";

  const cached = cache.get(text);
  if (cached !== undefined) {
    // Refresh recency: a Map iterates in insertion order, so re-inserting the
    // key moves it to the "newest" end and keeps still-live blocks from being
    // evicted by the stream of intermediate texts produced while typing.
    cache.delete(text);
    cache.set(text, cached);
    return cached;
  }

  let detected = "";
  try {
    const result = lowlight.highlightAuto(text);
    const lang = result.data?.language ?? "";
    const relevance = result.data?.relevance ?? 0;
    if (lang && relevance >= MIN_RELEVANCE) {
      const normalized = normalizeLang(lang);
      if (isSupportedLang(normalized)) detected = normalized;
    }
  } catch {
    // Detection is a best-effort fallback: on any detector failure the block
    // is simply treated as having no detectable language.
    detected = "";
  }

  // Evict the least-recently-used entries (the first keys in insertion order)
  // instead of clearing wholesale. Every edit inside a code block yields a new
  // source text and therefore a new key, so the cap is reached routinely while
  // typing; dropping only the oldest entries keeps results for unchanged
  // blocks cached.
  while (cache.size >= CACHE_MAX) {
    const oldest = cache.keys().next();
    if (oldest.done) break;
    cache.delete(oldest.value);
  }
  cache.set(text, detected);
  return detected;
}
|
backend/tests/highlight-autodetect.test.ts
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Auto-detection of syntax highlighting for language-less code blocks.
|
| 3 |
+
*
|
| 4 |
+
* Code blocks authored without an explicit language must still be highlighted
|
| 5 |
+
* (in both the editor and the published output). These tests cover the
|
| 6 |
+
* publisher path end-to-end plus the shared detector in isolation.
|
| 7 |
+
*/
|
| 8 |
+
import { describe, it, expect } from "vitest";
|
| 9 |
+
import { renderArticleHTML, type PublishMeta } from "../src/publisher/html-renderer.js";
|
| 10 |
+
import type { PublishCSS } from "../src/publisher/index.js";
|
| 11 |
+
import { detectShikiLang } from "../src/shared/detect-lang.js";
|
| 12 |
+
|
| 13 |
+
const EMPTY_CSS: PublishCSS = {
|
| 14 |
+
variables: "",
|
| 15 |
+
reset: "",
|
| 16 |
+
base: "",
|
| 17 |
+
layout: "",
|
| 18 |
+
print: "",
|
| 19 |
+
editorTokens: "",
|
| 20 |
+
article: "",
|
| 21 |
+
components: "",
|
| 22 |
+
publisher: "",
|
| 23 |
+
};
|
| 24 |
+
|
| 25 |
+
const META: PublishMeta = {
|
| 26 |
+
title: "Code Article",
|
| 27 |
+
description: "code highlighting",
|
| 28 |
+
authors: [{ name: "Alice", affiliationIndices: [1], affiliationNames: ["MIT"] }],
|
| 29 |
+
affiliations: [{ name: "MIT" }],
|
| 30 |
+
date: "2025-01-01",
|
| 31 |
+
};
|
| 32 |
+
|
| 33 |
+
const PY = `import torch
|
| 34 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 35 |
+
|
| 36 |
+
def load(name):
|
| 37 |
+
return AutoModelForCausalLM.from_pretrained(name)`;
|
| 38 |
+
|
| 39 |
+
describe("detectShikiLang", () => {
|
| 40 |
+
it("detects python from a typical snippet", () => {
|
| 41 |
+
expect(detectShikiLang(PY)).toBe("python");
|
| 42 |
+
});
|
| 43 |
+
|
| 44 |
+
it("returns empty for trivially short input", () => {
|
| 45 |
+
expect(detectShikiLang("x")).toBe("");
|
| 46 |
+
});
|
| 47 |
+
});
|
| 48 |
+
|
| 49 |
+
describe("publisher highlights language-less code blocks", () => {
|
| 50 |
+
it("colors a code block that has no language attribute", async () => {
|
| 51 |
+
const json = {
|
| 52 |
+
type: "doc",
|
| 53 |
+
content: [
|
| 54 |
+
// Note: no `attrs.language` - mirrors how these blocks are authored.
|
| 55 |
+
{ type: "codeBlock", content: [{ type: "text", text: PY }] },
|
| 56 |
+
],
|
| 57 |
+
};
|
| 58 |
+
const html = await renderArticleHTML(json as any, META, EMPTY_CSS);
|
| 59 |
+
|
| 60 |
+
// Detected language label and per-token color vars must be present.
|
| 61 |
+
expect(html).toContain('data-lang="python"');
|
| 62 |
+
expect(html).toContain("--shiki-light:");
|
| 63 |
+
});
|
| 64 |
+
});
|
frontend/package-lock.json
CHANGED
|
@@ -30,6 +30,7 @@
|
|
| 30 |
"@types/katex": "^0.16.8",
|
| 31 |
"ai": "^6.0.158",
|
| 32 |
"katex": "^0.16.45",
|
|
|
|
| 33 |
"lucide-react": "^1.8.0",
|
| 34 |
"mermaid": "^11.14.0",
|
| 35 |
"react": "^18.3.0",
|
|
@@ -3834,6 +3835,15 @@
|
|
| 3834 |
"url": "https://opencollective.com/unified"
|
| 3835 |
}
|
| 3836 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3837 |
"node_modules/html-url-attributes": {
|
| 3838 |
"version": "3.0.1",
|
| 3839 |
"resolved": "https://registry.npmjs.org/html-url-attributes/-/html-url-attributes-3.0.1.tgz",
|
|
@@ -4095,6 +4105,21 @@
|
|
| 4095 |
"loose-envify": "cli.js"
|
| 4096 |
}
|
| 4097 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4098 |
"node_modules/lru-cache": {
|
| 4099 |
"version": "5.1.1",
|
| 4100 |
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
|
|
|
|
| 30 |
"@types/katex": "^0.16.8",
|
| 31 |
"ai": "^6.0.158",
|
| 32 |
"katex": "^0.16.45",
|
| 33 |
+
"lowlight": "^3.3.0",
|
| 34 |
"lucide-react": "^1.8.0",
|
| 35 |
"mermaid": "^11.14.0",
|
| 36 |
"react": "^18.3.0",
|
|
|
|
| 3835 |
"url": "https://opencollective.com/unified"
|
| 3836 |
}
|
| 3837 |
},
|
| 3838 |
+
"node_modules/highlight.js": {
|
| 3839 |
+
"version": "11.11.1",
|
| 3840 |
+
"resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.11.1.tgz",
|
| 3841 |
+
"integrity": "sha512-Xwwo44whKBVCYoliBQwaPvtd/2tYFkRQtXDWj1nackaV2JPXx3L0+Jvd8/qCJ2p+ML0/XVkJ2q+Mr+UVdpJK5w==",
|
| 3842 |
+
"license": "BSD-3-Clause",
|
| 3843 |
+
"engines": {
|
| 3844 |
+
"node": ">=12.0.0"
|
| 3845 |
+
}
|
| 3846 |
+
},
|
| 3847 |
"node_modules/html-url-attributes": {
|
| 3848 |
"version": "3.0.1",
|
| 3849 |
"resolved": "https://registry.npmjs.org/html-url-attributes/-/html-url-attributes-3.0.1.tgz",
|
|
|
|
| 4105 |
"loose-envify": "cli.js"
|
| 4106 |
}
|
| 4107 |
},
|
| 4108 |
+
"node_modules/lowlight": {
|
| 4109 |
+
"version": "3.3.0",
|
| 4110 |
+
"resolved": "https://registry.npmjs.org/lowlight/-/lowlight-3.3.0.tgz",
|
| 4111 |
+
"integrity": "sha512-0JNhgFoPvP6U6lE/UdVsSq99tn6DhjjpAj5MxG49ewd2mOBVtwWYIT8ClyABhq198aXXODMU6Ox8DrGy/CpTZQ==",
|
| 4112 |
+
"license": "MIT",
|
| 4113 |
+
"dependencies": {
|
| 4114 |
+
"@types/hast": "^3.0.0",
|
| 4115 |
+
"devlop": "^1.0.0",
|
| 4116 |
+
"highlight.js": "~11.11.0"
|
| 4117 |
+
},
|
| 4118 |
+
"funding": {
|
| 4119 |
+
"type": "github",
|
| 4120 |
+
"url": "https://github.com/sponsors/wooorm"
|
| 4121 |
+
}
|
| 4122 |
+
},
|
| 4123 |
"node_modules/lru-cache": {
|
| 4124 |
"version": "5.1.1",
|
| 4125 |
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
|
frontend/package.json
CHANGED
|
@@ -34,6 +34,7 @@
|
|
| 34 |
"@types/katex": "^0.16.8",
|
| 35 |
"ai": "^6.0.158",
|
| 36 |
"katex": "^0.16.45",
|
|
|
|
| 37 |
"lucide-react": "^1.8.0",
|
| 38 |
"mermaid": "^11.14.0",
|
| 39 |
"react": "^18.3.0",
|
|
|
|
| 34 |
"@types/katex": "^0.16.8",
|
| 35 |
"ai": "^6.0.158",
|
| 36 |
"katex": "^0.16.45",
|
| 37 |
+
"lowlight": "^3.3.0",
|
| 38 |
"lucide-react": "^1.8.0",
|
| 39 |
"mermaid": "^11.14.0",
|
| 40 |
"react": "^18.3.0",
|
frontend/src/editor/extensions/code-block-shiki.tsx
CHANGED
|
@@ -12,6 +12,7 @@ import {
|
|
| 12 |
SHIKI_THEMES,
|
| 13 |
type ShikiHighlighter,
|
| 14 |
} from "#shared/shiki-config";
|
|
|
|
| 15 |
|
| 16 |
/**
|
| 17 |
* TipTap code block with Shiki-powered syntax highlighting.
|
|
@@ -162,8 +163,10 @@ function buildDecorations({ doc, name, highlighter, defaultLanguage }: Decoratio
|
|
| 162 |
|
| 163 |
if (!highlighter || !text) continue;
|
| 164 |
|
|
|
|
|
|
|
| 165 |
const raw = block.node.attrs.language || defaultLanguage || "";
|
| 166 |
-
const lang = normalizeLang(raw);
|
| 167 |
|
| 168 |
let hast: HastRoot;
|
| 169 |
try {
|
|
@@ -274,7 +277,9 @@ export const CodeBlockShiki = CodeBlock.extend<CodeBlockOptions>({
|
|
| 274 |
|
| 275 |
const applyLang = (n: ProsemirrorNode) => {
|
| 276 |
const raw = (n.attrs.language as string | null | undefined) || "";
|
| 277 |
-
|
|
|
|
|
|
|
| 278 |
if (normalized) {
|
| 279 |
pre.dataset.lang = normalized;
|
| 280 |
code.className = `language-${normalized}`;
|
|
|
|
| 12 |
SHIKI_THEMES,
|
| 13 |
type ShikiHighlighter,
|
| 14 |
} from "#shared/shiki-config";
|
| 15 |
+
import { detectShikiLang } from "#shared/detect-lang";
|
| 16 |
|
| 17 |
/**
|
| 18 |
* TipTap code block with Shiki-powered syntax highlighting.
|
|
|
|
| 163 |
|
| 164 |
if (!highlighter || !text) continue;
|
| 165 |
|
| 166 |
+
// Auto-detect when no explicit language is set, so language-less blocks
|
| 167 |
+
// get highlighted in the editor just like in the published output.
|
| 168 |
const raw = block.node.attrs.language || defaultLanguage || "";
|
| 169 |
+
const lang = normalizeLang(raw) || detectShikiLang(text);
|
| 170 |
|
| 171 |
let hast: HastRoot;
|
| 172 |
try {
|
|
|
|
| 277 |
|
| 278 |
const applyLang = (n: ProsemirrorNode) => {
|
| 279 |
const raw = (n.attrs.language as string | null | undefined) || "";
|
| 280 |
+
// Mirror the decoration logic: show the auto-detected language label
|
| 281 |
+
// (and `language-*` class) when the author didn't pick one.
|
| 282 |
+
const normalized = raw ? normalizeLang(raw) : detectShikiLang(n.textContent || "");
|
| 283 |
if (normalized) {
|
| 284 |
pre.dataset.lang = normalized;
|
| 285 |
code.className = `language-${normalized}`;
|