tfrere HF Staff Cursor committed on
Commit
bf2abd0
·
1 Parent(s): d062662

fix(highlight): auto-detect language for code blocks without explicit lang

Browse files

Code blocks authored without a language rendered as plain text in both the
editor and the published output (Shiki falls back to "text"). Add a shared
lowlight-based detector and use it as a fallback in the editor decorations
and the publish transformer, so language-less blocks get highlighted
identically everywhere - including in already-written articles.

Co-authored-by: Cursor <cursoragent@cursor.com>

backend/src/publisher/transformers/highlight-code.ts CHANGED
@@ -1,5 +1,6 @@
1
  import type { Element as HastElement, ElementContent, Root as HastRoot } from "hast";
2
  import { getSharedHighlighter, isSupportedLang, normalizeLang, SHIKI_THEMES } from "../../shared/shiki-config.js";
 
3
  import type { Transformer } from "./types.js";
4
 
5
  /**
@@ -127,11 +128,15 @@ export const highlightCodeTransformer: Transformer = {
127
  if (!pre || pre.tagName.toLowerCase() !== "pre") continue;
128
  if (pre.classList.contains("mermaid")) continue;
129
 
130
- const rawLang = extractLang(codeEl as unknown as Element);
131
- const lang = normalizeLang(rawLang);
132
  const source = codeEl.textContent || "";
133
  if (!source) continue;
134
 
 
 
 
 
 
 
135
  let hast: HastRoot;
136
  try {
137
  hast = highlighter.codeToHast(source, {
 
1
  import type { Element as HastElement, ElementContent, Root as HastRoot } from "hast";
2
  import { getSharedHighlighter, isSupportedLang, normalizeLang, SHIKI_THEMES } from "../../shared/shiki-config.js";
3
+ import { detectShikiLang } from "../../shared/detect-lang.js";
4
  import type { Transformer } from "./types.js";
5
 
6
  /**
 
128
  if (!pre || pre.tagName.toLowerCase() !== "pre") continue;
129
  if (pre.classList.contains("mermaid")) continue;
130
 
 
 
131
  const source = codeEl.textContent || "";
132
  if (!source) continue;
133
 
134
+ // Fall back to auto-detection when the block has no explicit language,
135
+ // so language-less blocks (the common case in authored docs) still get
136
+ // highlighted. Same logic runs in the editor for an identical result.
137
+ const rawLang = extractLang(codeEl as unknown as Element);
138
+ const lang = normalizeLang(rawLang) || detectShikiLang(source);
139
+
140
  let hast: HastRoot;
141
  try {
142
  hast = highlighter.codeToHast(source, {
backend/src/shared/detect-lang.ts ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Shared code-language auto-detection.
3
+ *
4
+ * Code blocks authored without an explicit language render as plain text in
5
+ * both the editor (PM decorations) and the publisher (static SSR), because
6
+ * Shiki falls back to `"text"` when no language is set. This module fills that
7
+ * gap: given the block's source, it guesses the language with highlight.js
8
+ * (via lowlight, already a dependency) and maps the guess onto a language
9
+ * Shiki actually bundles.
10
+ *
11
+ * Used by BOTH `code-block-shiki.tsx` (editor) and `highlight-code.ts`
12
+ * (publisher) so a language-less block is highlighted identically everywhere,
13
+ * including in articles that were already written.
14
+ *
15
+ * Detection results are cached by source text: the editor rebuilds its
16
+ * decoration set on every doc change, and we don't want to re-run the
17
+ * (relatively expensive) auto-detector for blocks whose content is unchanged.
18
+ */
19
+ import { createLowlight, common } from "lowlight";
20
+ import { isSupportedLang, normalizeLang } from "./shiki-config.js";
21
+
22
+ const lowlight = createLowlight(common);
23
+
24
+ /** Don't auto-detect on trivially short snippets - too little signal. */
25
+ const MIN_LENGTH = 3;
26
+ /**
27
+ * highlight.js relevance is roughly proportional to how many language-specific
28
+ * constructs matched. A small floor avoids tagging arbitrary prose as code
29
+ * while still catching short-but-clear snippets (e.g. a couple of imports).
30
+ */
31
+ const MIN_RELEVANCE = 2;
32
+
33
+ const cache = new Map<string, string>();
34
+ const CACHE_MAX = 200;
35
+
36
+ /**
37
+ * Best-effort detection of the Shiki language for a code block that has no
38
+ * explicit language. Returns a supported Shiki language name, or "" when
39
+ * detection is inconclusive or lands on a language Shiki doesn't bundle.
40
+ */
41
+ export function detectShikiLang(code: string | null | undefined): string {
42
+ const text = code ?? "";
43
+ if (text.trim().length < MIN_LENGTH) return "";
44
+
45
+ const cached = cache.get(text);
46
+ if (cached !== undefined) return cached;
47
+
48
+ let detected = "";
49
+ try {
50
+ const result = lowlight.highlightAuto(text);
51
+ const lang = result.data?.language ?? "";
52
+ const relevance = result.data?.relevance ?? 0;
53
+ if (lang && relevance >= MIN_RELEVANCE) {
54
+ const normalized = normalizeLang(lang);
55
+ if (isSupportedLang(normalized)) detected = normalized;
56
+ }
57
+ } catch {
58
+ detected = "";
59
+ }
60
+
61
+ // Cheap eviction: clear wholesale once the cache grows past the cap. Code
62
+ // blocks are few and small, so this rarely triggers.
63
+ if (cache.size >= CACHE_MAX) cache.clear();
64
+ cache.set(text, detected);
65
+ return detected;
66
+ }
backend/tests/highlight-autodetect.test.ts ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Auto-detection of syntax highlighting for language-less code blocks.
3
+ *
4
+ * Code blocks authored without an explicit language must still be highlighted
5
+ * (in both the editor and the published output). These tests cover the
6
+ * publisher path end-to-end plus the shared detector in isolation.
7
+ */
8
+ import { describe, it, expect } from "vitest";
9
+ import { renderArticleHTML, type PublishMeta } from "../src/publisher/html-renderer.js";
10
+ import type { PublishCSS } from "../src/publisher/index.js";
11
+ import { detectShikiLang } from "../src/shared/detect-lang.js";
12
+
13
+ const EMPTY_CSS: PublishCSS = {
14
+ variables: "",
15
+ reset: "",
16
+ base: "",
17
+ layout: "",
18
+ print: "",
19
+ editorTokens: "",
20
+ article: "",
21
+ components: "",
22
+ publisher: "",
23
+ };
24
+
25
+ const META: PublishMeta = {
26
+ title: "Code Article",
27
+ description: "code highlighting",
28
+ authors: [{ name: "Alice", affiliationIndices: [1], affiliationNames: ["MIT"] }],
29
+ affiliations: [{ name: "MIT" }],
30
+ date: "2025-01-01",
31
+ };
32
+
33
+ const PY = `import torch
34
+ from transformers import AutoModelForCausalLM, AutoTokenizer
35
+
36
+ def load(name):
37
+ return AutoModelForCausalLM.from_pretrained(name)`;
38
+
39
+ describe("detectShikiLang", () => {
40
+ it("detects python from a typical snippet", () => {
41
+ expect(detectShikiLang(PY)).toBe("python");
42
+ });
43
+
44
+ it("returns empty for trivially short input", () => {
45
+ expect(detectShikiLang("x")).toBe("");
46
+ });
47
+ });
48
+
49
+ describe("publisher highlights language-less code blocks", () => {
50
+ it("colors a code block that has no language attribute", async () => {
51
+ const json = {
52
+ type: "doc",
53
+ content: [
54
+ // Note: no `attrs.language` - mirrors how these blocks are authored.
55
+ { type: "codeBlock", content: [{ type: "text", text: PY }] },
56
+ ],
57
+ };
58
+ const html = await renderArticleHTML(json as any, META, EMPTY_CSS);
59
+
60
+ // Detected language label and per-token color vars must be present.
61
+ expect(html).toContain('data-lang="python"');
62
+ expect(html).toContain("--shiki-light:");
63
+ });
64
+ });
frontend/package-lock.json CHANGED
@@ -30,6 +30,7 @@
30
  "@types/katex": "^0.16.8",
31
  "ai": "^6.0.158",
32
  "katex": "^0.16.45",
 
33
  "lucide-react": "^1.8.0",
34
  "mermaid": "^11.14.0",
35
  "react": "^18.3.0",
@@ -3834,6 +3835,15 @@
3834
  "url": "https://opencollective.com/unified"
3835
  }
3836
  },
 
 
 
 
 
 
 
 
 
3837
  "node_modules/html-url-attributes": {
3838
  "version": "3.0.1",
3839
  "resolved": "https://registry.npmjs.org/html-url-attributes/-/html-url-attributes-3.0.1.tgz",
@@ -4095,6 +4105,21 @@
4095
  "loose-envify": "cli.js"
4096
  }
4097
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4098
  "node_modules/lru-cache": {
4099
  "version": "5.1.1",
4100
  "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
 
30
  "@types/katex": "^0.16.8",
31
  "ai": "^6.0.158",
32
  "katex": "^0.16.45",
33
+ "lowlight": "^3.3.0",
34
  "lucide-react": "^1.8.0",
35
  "mermaid": "^11.14.0",
36
  "react": "^18.3.0",
 
3835
  "url": "https://opencollective.com/unified"
3836
  }
3837
  },
3838
+ "node_modules/highlight.js": {
3839
+ "version": "11.11.1",
3840
+ "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.11.1.tgz",
3841
+ "integrity": "sha512-Xwwo44whKBVCYoliBQwaPvtd/2tYFkRQtXDWj1nackaV2JPXx3L0+Jvd8/qCJ2p+ML0/XVkJ2q+Mr+UVdpJK5w==",
3842
+ "license": "BSD-3-Clause",
3843
+ "engines": {
3844
+ "node": ">=12.0.0"
3845
+ }
3846
+ },
3847
  "node_modules/html-url-attributes": {
3848
  "version": "3.0.1",
3849
  "resolved": "https://registry.npmjs.org/html-url-attributes/-/html-url-attributes-3.0.1.tgz",
 
4105
  "loose-envify": "cli.js"
4106
  }
4107
  },
4108
+ "node_modules/lowlight": {
4109
+ "version": "3.3.0",
4110
+ "resolved": "https://registry.npmjs.org/lowlight/-/lowlight-3.3.0.tgz",
4111
+ "integrity": "sha512-0JNhgFoPvP6U6lE/UdVsSq99tn6DhjjpAj5MxG49ewd2mOBVtwWYIT8ClyABhq198aXXODMU6Ox8DrGy/CpTZQ==",
4112
+ "license": "MIT",
4113
+ "dependencies": {
4114
+ "@types/hast": "^3.0.0",
4115
+ "devlop": "^1.0.0",
4116
+ "highlight.js": "~11.11.0"
4117
+ },
4118
+ "funding": {
4119
+ "type": "github",
4120
+ "url": "https://github.com/sponsors/wooorm"
4121
+ }
4122
+ },
4123
  "node_modules/lru-cache": {
4124
  "version": "5.1.1",
4125
  "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
frontend/package.json CHANGED
@@ -34,6 +34,7 @@
34
  "@types/katex": "^0.16.8",
35
  "ai": "^6.0.158",
36
  "katex": "^0.16.45",
 
37
  "lucide-react": "^1.8.0",
38
  "mermaid": "^11.14.0",
39
  "react": "^18.3.0",
 
34
  "@types/katex": "^0.16.8",
35
  "ai": "^6.0.158",
36
  "katex": "^0.16.45",
37
+ "lowlight": "^3.3.0",
38
  "lucide-react": "^1.8.0",
39
  "mermaid": "^11.14.0",
40
  "react": "^18.3.0",
frontend/src/editor/extensions/code-block-shiki.tsx CHANGED
@@ -12,6 +12,7 @@ import {
12
  SHIKI_THEMES,
13
  type ShikiHighlighter,
14
  } from "#shared/shiki-config";
 
15
 
16
  /**
17
  * TipTap code block with Shiki-powered syntax highlighting.
@@ -162,8 +163,10 @@ function buildDecorations({ doc, name, highlighter, defaultLanguage }: Decoratio
162
 
163
  if (!highlighter || !text) continue;
164
 
 
 
165
  const raw = block.node.attrs.language || defaultLanguage || "";
166
- const lang = normalizeLang(raw);
167
 
168
  let hast: HastRoot;
169
  try {
@@ -274,7 +277,9 @@ export const CodeBlockShiki = CodeBlock.extend<CodeBlockOptions>({
274
 
275
  const applyLang = (n: ProsemirrorNode) => {
276
  const raw = (n.attrs.language as string | null | undefined) || "";
277
- const normalized = raw ? normalizeLang(raw) : "";
 
 
278
  if (normalized) {
279
  pre.dataset.lang = normalized;
280
  code.className = `language-${normalized}`;
 
12
  SHIKI_THEMES,
13
  type ShikiHighlighter,
14
  } from "#shared/shiki-config";
15
+ import { detectShikiLang } from "#shared/detect-lang";
16
 
17
  /**
18
  * TipTap code block with Shiki-powered syntax highlighting.
 
163
 
164
  if (!highlighter || !text) continue;
165
 
166
+ // Auto-detect when no explicit language is set, so language-less blocks
167
+ // get highlighted in the editor just like in the published output.
168
  const raw = block.node.attrs.language || defaultLanguage || "";
169
+ const lang = normalizeLang(raw) || detectShikiLang(text);
170
 
171
  let hast: HastRoot;
172
  try {
 
277
 
278
  const applyLang = (n: ProsemirrorNode) => {
279
  const raw = (n.attrs.language as string | null | undefined) || "";
280
+ // Mirror the decoration logic: show the auto-detected language label
281
+ // (and `language-*` class) when the author didn't pick one.
282
+ const normalized = raw ? normalizeLang(raw) : detectShikiLang(n.textContent || "");
283
  if (normalized) {
284
  pre.dataset.lang = normalized;
285
  code.className = `language-${normalized}`;