// MiniSearch / server / rerankerService.test.ts
// github-actions[bot]
// Sync from https://github.com/felladrin/MiniSearch
// f31a721
import { describe, expect, it } from "vitest";
import { sanitizeUnicodeSurrogates } from "./rerankerService";
describe("sanitizeUnicodeSurrogates", () => {
describe("valid input passthrough", () => {
  // Non-ASCII fixtures are written as escape sequences so they survive any
  // re-encoding of this file. Emoji are spelled as explicit UTF-16 surrogate
  // pairs so these tests really contain the pairs they claim to preserve.
  const partyPopper = "\uD83C\uDF89"; // U+1F389
  const confettiBall = "\uD83C\uDF8A"; // U+1F38A
  const grinningFace = "\uD83D\uDE00"; // U+1F600
  const rocket = "\uD83D\uDE80"; // U+1F680

  it("should return empty string unchanged", () => {
    expect(sanitizeUnicodeSurrogates("")).toBe("");
  });

  it("should return ASCII text unchanged", () => {
    const input = "Hello, World! 123";
    expect(sanitizeUnicodeSurrogates(input)).toBe(input);
  });

  it("should return valid Unicode text unchanged", () => {
    // Accented Latin (e-acute, o-umlaut), three CJK ideographs, and one emoji.
    const input = `H\u00e9llo W\u00f6rld \u65e5\u672c\u8a9e ${partyPopper}`;
    expect(sanitizeUnicodeSurrogates(input)).toBe(input);
  });

  it("should preserve valid surrogate pairs (emoji)", () => {
    // Three back-to-back pairs: each high surrogate is followed by its low.
    const input = `Text with emoji ${grinningFace}${confettiBall}${rocket}`;
    expect(sanitizeUnicodeSurrogates(input)).toBe(input);
  });

  it("should preserve valid surrogate pairs in complex text", () => {
    const input = `Start ${partyPopper} middle ${rocket} end`;
    expect(sanitizeUnicodeSurrogates(input)).toBe(input);
  });
});
describe("unpaired high surrogate handling", () => {
  // Each row is one test: its title, the malformed input, and the output
  // expected once every unpaired high surrogate is replaced with U+FFFD.
  const firstHigh = "\ud800";
  const secondHigh = "\ud801";

  const scenarios: ReadonlyArray<{
    title: string;
    input: string;
    expected: string;
  }> = [
    {
      title: "should replace lone high surrogate at end of string",
      input: "text" + firstHigh,
      expected: "text\ufffd",
    },
    {
      title: "should replace high surrogate followed by non-surrogate",
      input: firstHigh + "A",
      expected: "\ufffdA",
    },
    {
      title: "should replace high surrogate followed by another high surrogate",
      input: firstHigh + secondHigh,
      expected: "\ufffd\ufffd",
    },
    {
      title: "should replace multiple consecutive unpaired high surrogates",
      input: firstHigh.repeat(3),
      expected: "\ufffd\ufffd\ufffd",
    },
  ];

  for (const { title, input, expected } of scenarios) {
    it(title, () => {
      expect(sanitizeUnicodeSurrogates(input)).toBe(expected);
    });
  }
});
describe("unpaired low surrogate handling", () => {
  // A low surrogate code unit; no case below places a high surrogate before
  // it, so every occurrence is unpaired.
  const loneLow = "\udc00";

  it("should replace lone low surrogate at start of string", () => {
    const sanitized = sanitizeUnicodeSurrogates(loneLow + "text");
    expect(sanitized).toBe("\ufffdtext");
  });

  it("should replace lone low surrogate in middle of string", () => {
    const sanitized = sanitizeUnicodeSurrogates("before" + loneLow + "after");
    expect(sanitized).toBe("before\ufffdafter");
  });

  it("should replace multiple consecutive unpaired low surrogates", () => {
    const sanitized = sanitizeUnicodeSurrogates(loneLow + loneLow);
    expect(sanitized).toBe("\ufffd\ufffd");
  });
});
describe("mixed surrogate scenarios", () => {
  // Emoji fixtures are explicit surrogate-pair escapes so the "valid pair"
  // cases contain a real high+low pair whatever encoding this file is
  // stored or transferred in.
  const grinningFace = "\uD83D\uDE00"; // U+1F600
  const partyPopper = "\uD83C\uDF89"; // U+1F389

  it("should handle low surrogate followed by high surrogate (reversed pair)", () => {
    const low = String.fromCharCode(0xdc00);
    const high = String.fromCharCode(0xd800);
    const input = `${low}${high}`;
    // Low-then-high is not a pair, so both code units are expected to be replaced.
    expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffd\ufffd");
  });

  it("should handle valid pair followed by unpaired high", () => {
    // The trailing lone unit is the same high surrogate the emoji starts with.
    const unpairedHigh = String.fromCharCode(0xd83d);
    const input = `${grinningFace}${unpairedHigh}`;
    expect(sanitizeUnicodeSurrogates(input)).toBe(`${grinningFace}\ufffd`);
  });

  it("should handle unpaired low followed by valid pair", () => {
    const unpairedLow = String.fromCharCode(0xdc00);
    const input = `${unpairedLow}${partyPopper}`;
    expect(sanitizeUnicodeSurrogates(input)).toBe(`\ufffd${partyPopper}`);
  });

  it("should handle interleaved valid and invalid surrogates", () => {
    const high = String.fromCharCode(0xd800);
    const low = String.fromCharCode(0xdc00);
    // The ASCII "B" separates high from low, so neither has a partner.
    const input = `A${high}B${low}C`;
    expect(sanitizeUnicodeSurrogates(input)).toBe("A\ufffdB\ufffdC");
  });
});
describe("edge cases from real-world scenarios", () => {
  // Non-ASCII fixtures are written as escapes so they cannot be corrupted by
  // a re-encoding of this file. The emoji is an explicit surrogate pair.
  const cjkText = "\u65e5\u672c\u8a9e"; // three CJK ideographs
  const partyPopper = "\uD83C\uDF89"; // U+1F389

  it("should handle text that might come from corrupted web content", () => {
    const corruptedChar = String.fromCharCode(0xd834);
    const input = `Search result: ${corruptedChar} more text`;
    expect(sanitizeUnicodeSurrogates(input)).toBe(
      "Search result: \ufffd more text",
    );
  });

  it("should preserve valid content around invalid surrogates", () => {
    const badHigh = String.fromCharCode(0xd83d);
    // badHigh is followed by a space, so it is unpaired; the later emoji is a
    // complete pair and must come through untouched.
    const input = `Valid text ${cjkText} ${badHigh} more valid ${partyPopper} end`;
    expect(sanitizeUnicodeSurrogates(input)).toBe(
      `Valid text ${cjkText} \ufffd more valid ${partyPopper} end`,
    );
  });

  it("should handle boundary surrogate values", () => {
    // First and last code units of the high (D800-DBFF) and low (DC00-DFFF) ranges.
    const minHigh = String.fromCharCode(0xd800);
    const maxHigh = String.fromCharCode(0xdbff);
    const minLow = String.fromCharCode(0xdc00);
    const maxLow = String.fromCharCode(0xdfff);

    // Alone, each boundary unit is unpaired and is replaced.
    expect(sanitizeUnicodeSurrogates(minHigh)).toBe("\ufffd");
    expect(sanitizeUnicodeSurrogates(maxHigh)).toBe("\ufffd");
    expect(sanitizeUnicodeSurrogates(minLow)).toBe("\ufffd");
    expect(sanitizeUnicodeSurrogates(maxLow)).toBe("\ufffd");

    // Combined high+low, the boundary units form valid pairs and are kept.
    expect(sanitizeUnicodeSurrogates(`${minHigh}${minLow}`)).toBe(
      `${minHigh}${minLow}`,
    );
    expect(sanitizeUnicodeSurrogates(`${maxHigh}${maxLow}`)).toBe(
      `${maxHigh}${maxLow}`,
    );
  });

  it("should handle long strings with scattered invalid surrogates", () => {
    const unpairedHigh = String.fromCharCode(0xd800);
    const unpairedLow = String.fromCharCode(0xdc00);
    // Every surrogate is separated from the next by ASCII text, so none pair up.
    const chunks = [
      "Start of document.",
      unpairedHigh,
      " Some middle content.",
      unpairedLow,
      " More content here.",
      unpairedHigh,
      " End of document.",
    ];
    const input = chunks.join("");
    const expected =
      "Start of document.\ufffd Some middle content.\ufffd More content here.\ufffd End of document.";
    expect(sanitizeUnicodeSurrogates(input)).toBe(expected);
  });

  it("should preserve adjacent high+low as valid pair even in mixed context", () => {
    const high = String.fromCharCode(0xd800);
    const low = String.fromCharCode(0xdc00);
    const validPair = `${high}${low}`;
    // The same high unit appears twice: once orphaned (followed by a space)
    // and once as the first half of a pair.
    const input = `Text ${high} orphan, then valid pair: ${validPair} end`;
    expect(sanitizeUnicodeSurrogates(input)).toBe(
      `Text \ufffd orphan, then valid pair: ${validPair} end`,
    );
  });
});
describe("literal syntax and complex sequences", () => {
  // These cases build inputs from \uXXXX escapes, not String.fromCharCode.
  // The emoji is also written as an escaped surrogate pair, so the file holds
  // no raw non-ASCII characters that a re-encoding could corrupt.

  it("should handle mixed valid and invalid surrogates using literals", () => {
    // lone high, valid pair (U+1F600), lone low, each separated by ASCII.
    const input = "A\uD800B\uD83D\uDE00C\uDC00D";
    expect(sanitizeUnicodeSurrogates(input)).toBe(
      "A\uFFFDB\uD83D\uDE00C\uFFFDD",
    );
  });

  it("should handle surrogate pair followed by lone high surrogate", () => {
    // U+1F600 as an explicit pair, then an unpaired high surrogate.
    const input = "\uD83D\uDE00\uD800";
    expect(sanitizeUnicodeSurrogates(input)).toBe("\uD83D\uDE00\uFFFD");
  });

  it("should handle lone high surrogate followed by valid surrogate pair", () => {
    // \uD801 is followed by another high surrogate, so it is unpaired; the
    // following \uD800\uDC00 is a complete pair.
    const input = "\uD801\uD800\uDC00";
    expect(sanitizeUnicodeSurrogates(input)).toBe("\uFFFD\uD800\uDC00");
  });

  // NOTE(review): the title says "multiple lone surrogates in a row", but the
  // input is a valid pair followed by ONE lone high surrogate. Title kept
  // as-is to avoid renaming the reported test; consider aligning the two.
  it("should handle multiple lone surrogates in a row", () => {
    const input = "\uD800\uDC00\uD801";
    expect(sanitizeUnicodeSurrogates(input)).toBe("\uD800\uDC00\uFFFD");
  });
});
});