// carbon-tokenization/backend/tests/markdown-renderer.test.ts
// Commit 3de227d by tfrere (HF Staff):
//   feat(publisher): generate llms.txt Markdown twin for LLM agents
import { describe, it, expect } from "vitest";
import {
renderArticleMarkdown,
stripHtmlToText,
} from "../src/publisher/markdown-renderer.js";
import type { PublishMeta, CitationData } from "../src/publisher/html-renderer.js";
/**
 * Publication metadata fixture shared by the tests in this file.
 *
 * Two authors, two affiliations, a publication date and a DOI. Individual
 * tests override single fields by spreading this object.
 */
const META: PublishMeta = {
  title: "Test Article",
  subtitle: "A subtitle",
  description: "A short description for SEO",
  authors: [
    {
      name: "Alice",
      affiliationIndices: [1],
      affiliationNames: ["MIT"],
    },
    {
      name: "Bob",
      affiliationIndices: [2],
      affiliationNames: ["HF"],
    },
  ],
  affiliations: [
    { name: "MIT" },
    { name: "HF" },
  ],
  date: "2026-04-30",
  doi: "10.1234/abcd.efgh",
};
/**
 * Wraps a list of nodes in a root object of type "doc".
 *
 * @param content - Child nodes; the array is stored as-is (not copied).
 * @returns An object `{ type: "doc", content }`.
 */
function doc(content: any[]) {
  return { type: "doc", content };
}
// Header rendering: every test renders a document holding one empty
// paragraph and inspects the Markdown produced from the metadata.
describe("renderArticleMarkdown - header", () => {
  // Fresh single-empty-paragraph document for each call.
  const emptyDoc = () => doc([{ type: "paragraph" }]);

  it("emits an llms.txt-style header with title, description, authors, date and DOI", () => {
    const rendered = renderArticleMarkdown(emptyDoc(), META);

    const expectedFragments = [
      "# Test Article",
      "> A short description for SEO",
      "- **Authors**: Alice, Bob",
      "- **Published**: 2026-04-30",
      "- **DOI**: https://doi.org/10.1234/abcd.efgh",
      "---",
    ];
    for (const fragment of expectedFragments) {
      expect(rendered).toContain(fragment);
    }
  });

  it("falls back to subtitle when description is empty", () => {
    const metaWithoutDescription = { ...META, description: "" };

    const rendered = renderArticleMarkdown(emptyDoc(), metaWithoutDescription);

    expect(rendered).toContain("> A subtitle");
  });

  it("collapses multi-line titles", () => {
    // NOTE(review): the title below holds a backslash followed by "n" (two
    // characters), not a newline character, and the negative assertion looks
    // for that same two-character sequence. Confirm this is the form titles
    // are stored in.
    const metaWithSplitTitle = { ...META, title: "Line one\\nLine two" };

    const rendered = renderArticleMarkdown(emptyDoc(), metaWithSplitTitle);

    expect(rendered).toContain("# Line one Line two");
    expect(rendered).not.toContain("\\n");
  });
});
// Block-level rendering: headings, inline marks, lists, fenced code, math and
// tables. Each test renders a small document with META and checks the output
// for the expected Markdown fragments.
describe("renderArticleMarkdown - block nodes", () => {
  // Fixture builders. They produce exactly the node literals the tests need
  // so each test body shows only what is specific to it.
  const plain = (value: string) => ({ type: "text", text: value });
  const marked = (value: string, mark: Record<string, unknown>) => ({
    ...plain(value),
    marks: [mark],
  });
  const para = (...children: unknown[]) => ({ type: "paragraph", content: children });
  const heading = (level: number, label: string) => ({
    type: "heading",
    attrs: { level },
    content: [plain(label)],
  });
  const list = (type: "bulletList" | "orderedList", labels: string[]) => ({
    type,
    content: labels.map((label) => ({
      type: "listItem",
      content: [para(plain(label))],
    })),
  });
  const tableRow = (cellType: "tableHeader" | "tableCell", labels: string[]) => ({
    type: "tableRow",
    content: labels.map((label) => ({
      type: cellType,
      content: [para(plain(label))],
    })),
  });

  it("renders headings with the correct markdown level", () => {
    const md = renderArticleMarkdown(
      doc([heading(2, "Hello"), heading(3, "Sub")]),
      META,
    );

    expect(md).toContain("## Hello");
    expect(md).toContain("### Sub");
  });

  it("applies bold/italic/code/link marks", () => {
    const md = renderArticleMarkdown(
      doc([
        para(
          marked("bold", { type: "bold" }),
          plain(" "),
          marked("italic", { type: "italic" }),
          plain(" "),
          marked("code", { type: "code" }),
          plain(" "),
          marked("link", { type: "link", attrs: { href: "https://example.com" } }),
        ),
      ]),
      META,
    );

    expect(md).toContain("**bold**");
    expect(md).toContain("*italic*");
    expect(md).toContain("`code`");
    expect(md).toContain("[link](https://example.com)");
  });

  it("renders bullet and ordered lists", () => {
    const md = renderArticleMarkdown(
      doc([
        list("bulletList", ["one", "two"]),
        list("orderedList", ["first", "second"]),
      ]),
      META,
    );

    expect(md).toContain("- one");
    expect(md).toContain("- two");
    expect(md).toContain("1. first");
    expect(md).toContain("2. second");
  });

  it("renders code blocks with language fence", () => {
    const md = renderArticleMarkdown(
      doc([
        {
          type: "codeBlock",
          attrs: { language: "ts" },
          content: [plain("const x = 1;")],
        },
      ]),
      META,
    );

    expect(md).toContain("```ts");
    expect(md).toContain("const x = 1;");
    // A bare toContain("```") is already implied by the "```ts" check above
    // and could never fail. Require the order instead: opening fence, then
    // the code, then a later fence that closes the block.
    expect(md).toMatch(/```ts[\s\S]*?const x = 1;[\s\S]*?```/);
  });

  it("renders inline and block math", () => {
    const md = renderArticleMarkdown(
      doc([
        para(
          plain("Energy: "),
          { type: "inlineMath", attrs: { latex: "E = mc^2" } },
        ),
        { type: "blockMath", attrs: { latex: "\\int_0^1 x dx" } },
      ]),
      META,
    );

    expect(md).toContain("$E = mc^2$");
    expect(md).toContain("$$\n\\int_0^1 x dx\n$$");
  });

  it("renders tables with a header row separator", () => {
    const md = renderArticleMarkdown(
      doc([
        {
          type: "table",
          content: [
            tableRow("tableHeader", ["Col A", "Col B"]),
            tableRow("tableCell", ["1", "2"]),
          ],
        },
      ]),
      META,
    );

    expect(md).toContain("| Col A | Col B |");
    expect(md).toContain("| --- | --- |");
    expect(md).toContain("| 1 | 2 |");
  });
});
// Custom component rendering. Every test renders a document containing a
// single component node with META and inspects the Markdown output.
describe("renderArticleMarkdown - custom components", () => {
  // Paragraph node holding one unmarked text node.
  const paragraphOf = (value: string) => ({
    type: "paragraph",
    content: [{ type: "text", text: value }],
  });

  // Renders a one-node document with the shared metadata fixture.
  const renderSingle = (node: Record<string, unknown>) =>
    renderArticleMarkdown(doc([node]), META);

  it("collapses HtmlEmbed to a single inline placeholder with title and src", () => {
    const embed = {
      type: "htmlEmbed",
      attrs: { src: "d3-chart.html", title: "Citations over time", desc: "" },
    };

    const output = renderSingle(embed);

    // NOTE(review): the test title mentions "src", but nothing below checks
    // that "d3-chart.html" appears in the output.
    expect(output).toContain("*[Interactive visualization: Citations over time]*");
    expect(output).not.toContain("<iframe");
  });

  it("renders Note as a blockquote", () => {
    const note = {
      type: "note",
      content: [paragraphOf("Heads up.")],
    };

    expect(renderSingle(note)).toContain("> Heads up.");
  });

  it("renders Accordion with bold title and inner content", () => {
    const accordion = {
      type: "accordion",
      attrs: { title: "More details" },
      content: [paragraphOf("Inside.")],
    };

    const output = renderSingle(accordion);

    expect(output).toContain("**More details**");
    expect(output).toContain("Inside.");
  });

  it("renders QuoteBlock with attribution", () => {
    const quote = {
      type: "quoteBlock",
      attrs: { author: "Ada Lovelace", source: "Notes" },
      content: [paragraphOf("The future is open.")],
    };

    const output = renderSingle(quote);

    expect(output).toContain("> The future is open.");
    expect(output).toContain("> -- Ada Lovelace, Notes");
  });

  it("renders HfUser as a markdown link to huggingface.co/<u>", () => {
    const user = {
      type: "hfUser",
      attrs: { username: "tfrere", name: "Thibaud Frere" },
    };

    expect(renderSingle(user)).toContain(
      "[Thibaud Frere](https://huggingface.co/tfrere)",
    );
  });

  it("renders Mermaid as a fenced ```mermaid block", () => {
    const diagram = {
      type: "mermaid",
      attrs: { code: "graph TD\n A --> B" },
    };

    const output = renderSingle(diagram);

    expect(output).toContain("```mermaid");
    expect(output).toContain("graph TD");
    expect(output).toContain("A --> B");
  });

  it("unwraps Wide / FullWidth / Stack containers", () => {
    // NOTE(review): only the "wide" container is exercised here; the other
    // two containers named in the title have no fixture in this test.
    const wideContainer = {
      type: "wide",
      content: [paragraphOf("Wide content.")],
    };

    const output = renderSingle(wideContainer);

    expect(output).toContain("Wide content.");
    expect(output).not.toContain("[wide]");
  });
});
// Citations, footnotes and the bibliography section.
describe("renderArticleMarkdown - citations and footnotes", () => {
  // Unmarked text node.
  const plain = (value: string) => ({ type: "text", text: value });

  it("renders citations as keys for APA and as numeric tags for IEEE", () => {
    const article = doc([
      {
        type: "paragraph",
        content: [
          plain("See "),
          { type: "citation", attrs: { key: "smith2024", label: "Smith (2024)" } },
          plain("."),
        ],
      },
    ]);

    // Same single-entry citation data for both runs; only the style differs.
    const citationsIn = (style: CitationData["style"]): CitationData => ({
      entries: [{ id: "smith2024" }],
      orderedKeys: ["smith2024"],
      style,
    });
    const apa = citationsIn("apa");
    const ieee = citationsIn("ieee");

    // NOTE(review): the title says "keys" for APA, but the assertion checks
    // the citation node's label ("Smith (2024)"), not its key ("smith2024").
    expect(renderArticleMarkdown(article, META, apa)).toContain("Smith (2024)");
    expect(renderArticleMarkdown(article, META, ieee)).toContain("[1]");
  });

  it("collects footnotes and emits a footnotes section", () => {
    const article = doc([
      {
        type: "paragraph",
        content: [
          plain("Body"),
          { type: "footnote", attrs: { content: "First note" } },
          plain(" more "),
          { type: "footnote", attrs: { content: "Second note" } },
        ],
      },
    ]);

    const output = renderArticleMarkdown(article, META);

    const expectedFragments = [
      "[^1]",
      "[^2]",
      "## Footnotes",
      "[^1]: First note",
      "[^2]: Second note",
    ];
    for (const fragment of expectedFragments) {
      expect(output).toContain(fragment);
    }
  });

  it("appends a References section from the formatted bibliography", () => {
    const bibliographyHtml =
      '<div class="csl-entry">Smith, J. (2024). <i>Test Paper</i>. Journal.</div>';
    const article = doc([{ type: "paragraph", content: [plain("Body")] }]);

    // No citation data is supplied; only the pre-formatted bibliography.
    const output = renderArticleMarkdown(article, META, undefined, bibliographyHtml);

    expect(output).toContain("## References");
    expect(output).toContain("Smith, J. (2024).");
    expect(output).toContain("Test Paper");
    expect(output).not.toContain("<div");
  });
});
// stripHtmlToText: anchor conversion, entity decoding and block-tag handling.
describe("stripHtmlToText", () => {
  it("converts <a href> to a markdown link", () => {
    const anchorHtml = '<a href="https://example.com">click</a>';

    const converted = stripHtmlToText(anchorHtml);

    expect(converted).toBe("[click](https://example.com)");
  });

  it("decodes common HTML entities", () => {
    const encoded = "Tom &amp; Jerry &lt;3";

    expect(stripHtmlToText(encoded)).toBe("Tom & Jerry <3");
  });

  it("collapses block tags into newlines and removes the rest", () => {
    // The result is trimmed before comparison, so leading or trailing
    // whitespace in the raw output does not affect this check.
    const stripped = stripHtmlToText("<p>One.</p><p>Two.</p>").trim();

    expect(stripped).toBe("One.\nTwo.");
  });
});