// Source: carbon-tokenization/backend/src/shared/detect-lang.ts
// Author: tfrere (HF Staff)
// Commit bf2abd0: fix(highlight): auto-detect language for code blocks without explicit lang
/**
* Shared code-language auto-detection.
*
* Code blocks authored without an explicit language render as plain text in
* both the editor (PM decorations) and the publisher (static SSR), because
* Shiki falls back to `"text"` when no language is set. This module fills that
* gap: given the block's source, it guesses the language with highlight.js
* (via lowlight, already a dependency) and maps the guess onto a language
* Shiki actually bundles.
*
* Used by BOTH `code-block-shiki.tsx` (editor) and `highlight-code.ts`
* (publisher) so a language-less block is highlighted identically everywhere,
* including in articles that were already written.
*
* Detection results are cached by source text: the editor rebuilds its
* decoration set on every doc change, and we don't want to re-run the
* (relatively expensive) auto-detector for blocks whose content is unchanged.
*/
import { createLowlight, common } from "lowlight";
import { isSupportedLang, normalizeLang } from "./shiki-config.js";
/** Auto-detector instance, built from lowlight's `common` grammar set. */
const lowlight = createLowlight(common);

/**
 * Minimum trimmed length a snippet needs before detection is attempted;
 * anything shorter carries too little signal to guess from.
 */
const MIN_LENGTH = 3;

/**
 * Lowest highlight.js relevance score accepted as a real match. Relevance
 * grows roughly with the number of language-specific constructs recognised,
 * so a small floor keeps arbitrary prose from being tagged as code while
 * short-but-clear snippets (e.g. a couple of imports) still get through.
 */
const MIN_RELEVANCE = 2;

/** Entry count at which the detection cache is emptied before the next insert. */
const CACHE_MAX = 200;

/**
 * Detection results keyed by the exact source text. An empty-string value
 * records an inconclusive detection.
 */
const cache: Map<string, string> = new Map();
/**
 * Guess the Shiki language for a code block that carries no explicit
 * language.
 *
 * @param code - Raw source of the block; `null` and `undefined` are treated
 *   as an empty string.
 * @returns The normalized language name when the guess reaches
 *   `MIN_RELEVANCE` and passes `isSupportedLang`. Returns "" when the input
 *   is too short, the guess is weak or unsupported, or detection throws.
 */
export function detectShikiLang(code: string | null | undefined): string {
  const source = code ?? "";
  if (source.trim().length < MIN_LENGTH) {
    return "";
  }

  // Results are memoized by exact source text; "" is a valid cached answer,
  // so only `undefined` counts as a miss.
  const memoized = cache.get(source);
  if (memoized !== undefined) {
    return memoized;
  }

  let resolved = "";
  try {
    const { data } = lowlight.highlightAuto(source);
    const guess = data?.language ?? "";
    const score = data?.relevance ?? 0;
    if (guess && score >= MIN_RELEVANCE) {
      const candidate = normalizeLang(guess);
      resolved = isSupportedLang(candidate) ? candidate : "";
    }
  } catch {
    // Best-effort: any failure during detection means "no language".
  }

  // Eviction is deliberately crude: once the cap is reached, drop every
  // entry and start over rather than tracking recency.
  if (cache.size >= CACHE_MAX) {
    cache.clear();
  }
  cache.set(source, resolved);

  return resolved;
}