tfrere's picture
tfrere HF Staff
feat(editor): embed studio with data files and agent-aware editing
8fc8501
import type { EmbedDataFileMeta } from "../editor/embeds/embed-data-store";
/**
* Light-weight client-side parsing of uploaded data files. Extracts
* column names and row counts so the sidebar and agent system prompt
* can preview dataset shape without shipping the full content every
* time.
*
* Parsing is deliberately forgiving: it never throws on malformed
* input, and returns `undefined` metadata fields when shape cannot be
* inferred.
*/
/**
 * File extensions this module recognises, written lower-case and without
 * the leading dot (the same form `extFromName` produces). Declared
 * `as const` so the element type can be derived as a literal union.
 * Note that "txt" is listed here but `inferDataShape` has no parser for
 * it, so such files yield an empty shape.
 */
export const ACCEPTED_DATA_EXTS = ["csv", "tsv", "json", "txt", "ndjson"] as const;
/** Union of the literal extension strings in `ACCEPTED_DATA_EXTS`. */
export type AcceptedDataExt = (typeof ACCEPTED_DATA_EXTS)[number];
/**
 * Size limit for a data file: 3 * 1024 * 1024 (3 MiB, presumably in
 * bytes). The limit is not enforced in this part of the file; callers
 * are expected to compare against it themselves.
 */
export const MAX_DATA_FILE_SIZE = 3 * 1024 * 1024;
/**
 * Returns the lower-cased extension of a file name, without the dot.
 *
 * Only a trailing run of ASCII letters/digits after the last dot counts
 * as an extension; anything else (no dot, trailing dot, punctuation in
 * the suffix) yields the empty string.
 *
 * @param name - File name or path to inspect.
 * @returns The extension, or `""` when none is present.
 */
export function extFromName(name: string): string {
  const lowered = name.toLowerCase();
  const found = /\.([a-z0-9]+)$/.exec(lowered);
  if (found === null) {
    return "";
  }
  return found[1];
}
/**
 * Type guard telling whether `ext` is one of the extensions listed in
 * `ACCEPTED_DATA_EXTS`. The comparison is exact, so callers should pass
 * a lower-case extension without a leading dot.
 *
 * @param ext - Candidate extension.
 * @returns `true` (narrowing `ext` to `AcceptedDataExt`) when it is listed.
 */
export function isAcceptedExt(ext: string): ext is AcceptedDataExt {
  const accepted: readonly string[] = ACCEPTED_DATA_EXTS;
  return accepted.some((candidate) => candidate === ext);
}
/**
 * Splits a single delimited line into its fields.
 *
 * A double quote toggles quoted mode wherever it appears; inside quoted
 * mode the delimiter is taken literally and a doubled quote (`""`)
 * produces one literal quote character. The quotes themselves are not
 * kept in the output. An unterminated quote simply runs to the end of
 * the line.
 *
 * @param line - One line of text, without its line terminator.
 * @param delim - Single-character field delimiter.
 * @returns The field values; always at least one element.
 */
function splitCsvLine(line: string, delim: string): string[] {
  const fields: string[] = [];
  let field = "";
  let quoted = false;
  let pos = 0;

  while (pos < line.length) {
    const ch = line.charAt(pos);
    pos += 1;

    if (quoted) {
      if (ch !== '"') {
        field += ch;
        continue;
      }
      // A quote inside quoted mode: either an escaped quote or the end.
      if (line.charAt(pos) === '"') {
        field += '"';
        pos += 1;
      } else {
        quoted = false;
      }
      continue;
    }

    if (ch === '"') {
      quoted = true;
    } else if (ch === delim) {
      fields.push(field);
      field = "";
    } else {
      field += ch;
    }
  }

  // The last field has no trailing delimiter, so flush it explicitly.
  fields.push(field);
  return fields;
}
/**
 * Best-effort description of a dataset's shape. Both fields are
 * optional: a parser leaves a field out (or `undefined`) when it cannot
 * infer it from the content.
 */
interface ParsedShape {
/** Number of records detected, when it could be determined. */
rowCount?: number;
/** Column / key names detected, when they could be determined. */
columns?: string[];
}
/**
 * Splits delimited text into records, keeping line breaks that occur
 * inside a quoted field as part of that record.
 *
 * A quote only opens a quoted field when it is the first character of
 * the field, so stray quote characters in the middle of a value do not
 * swallow the following lines. Empty lines are skipped. If a quoted
 * field is still open at the end of the input (malformed file), the
 * function falls back to a plain line split so one bad quote cannot
 * collapse the whole file into a single record.
 */
function splitDelimitedRecords(content: string, delim: string): string[] {
  const records: string[] = [];
  let start = 0;
  let inQuotes = false;
  let atFieldStart = true;

  for (let i = 0; i < content.length; i++) {
    const c = content[i];

    if (inQuotes) {
      if (c === '"') {
        if (content[i + 1] === '"') {
          i++; // escaped quote, stay inside the quoted field
        } else {
          inQuotes = false;
        }
      }
      continue;
    }

    if (c === "\n" || c === "\r") {
      if (i > start) records.push(content.slice(start, i));
      // Treat CRLF as a single terminator.
      if (c === "\r" && content[i + 1] === "\n") i++;
      start = i + 1;
      atFieldStart = true;
    } else if (c === delim) {
      atFieldStart = true;
    } else {
      if (c === '"' && atFieldStart) inQuotes = true;
      atFieldStart = false;
    }
  }

  if (inQuotes) {
    // Unterminated quote: degrade to naive line splitting.
    return content.split(/\r\n|\n|\r/).filter((l) => l.length > 0);
  }
  if (start < content.length) records.push(content.slice(start));
  return records;
}

/**
 * Infers column names and row count from delimiter-separated text.
 *
 * The first non-empty record is treated as the header; every following
 * non-empty record counts as one row. Records are split quote-aware, so
 * a quoted cell containing line breaks counts as part of one record
 * instead of inflating the row count or truncating the header.
 *
 * @param content - Full text of the file.
 * @param delim - Single-character field delimiter ("," or "\t").
 * @returns Header names (trimmed) and data-row count, or `{}` when the
 *   content has no non-empty line.
 */
function parseDelimited(content: string, delim: string): ParsedShape {
  const records = splitDelimitedRecords(content, delim);
  if (records.length === 0) return {};
  const header = splitCsvLine(records[0], delim).map((c) => c.trim());
  return {
    columns: header,
    rowCount: records.length - 1,
  };
}
/**
 * Infers the shape of a JSON document.
 *
 * - Top-level array: `rowCount` is the array length and `columns` are
 *   the keys of the first element that is a plain object. Nested arrays
 *   are not treated as records, so an array of arrays reports no
 *   columns rather than index names like "0", "1".
 * - Top-level object: `columns` are its keys; no row count.
 * - Anything else, or invalid JSON: an empty shape.
 *
 * Never throws.
 *
 * @param content - Full text of the file.
 * @returns The inferred shape.
 */
function parseJson(content: string): ParsedShape {
  let parsed: unknown;
  try {
    parsed = JSON.parse(content);
  } catch {
    // Malformed JSON is expected input here; report "no shape".
    return {};
  }

  if (Array.isArray(parsed)) {
    const first = parsed.find(
      (row) => typeof row === "object" && row !== null && !Array.isArray(row),
    );
    return {
      rowCount: parsed.length,
      columns: first ? Object.keys(first as Record<string, unknown>) : undefined,
    };
  }

  if (typeof parsed === "object" && parsed !== null) {
    return { columns: Object.keys(parsed) };
  }
  return {};
}
/**
 * Infers the shape of newline-delimited JSON.
 *
 * Every line that is not blank counts as one row, whether or not it is
 * valid JSON. Column names come from the first non-blank line only, and
 * only when that line parses to a non-array object; otherwise `columns`
 * is `undefined`.
 *
 * @param content - Full text of the file.
 * @returns Row count and columns, or `{}` when there is no non-blank line.
 */
function parseNdjson(content: string): ParsedShape {
  const nonBlank: string[] = [];
  for (const raw of content.split(/\r\n|\n|\r/)) {
    if (raw.trim().length > 0) {
      nonBlank.push(raw);
    }
  }
  if (nonBlank.length === 0) {
    return {};
  }

  let columns: string[] | undefined = undefined;
  try {
    const head: unknown = JSON.parse(nonBlank[0]);
    const isRecord =
      typeof head === "object" && head !== null && !Array.isArray(head);
    if (isRecord) {
      columns = Object.keys(head as Record<string, unknown>);
    }
  } catch {
    // First line is not valid JSON: leave columns unknown.
  }

  return { rowCount: nonBlank.length, columns };
}
/**
 * Picks the parser matching a file extension and returns the shape it
 * infers from `content`.
 *
 * "csv" and "tsv" go through the delimited parser (comma and tab
 * respectively), "json" and "ndjson" through their own parsers. Any
 * other extension yields an empty shape.
 *
 * @param ext - Lower-case extension without the leading dot.
 * @param content - Full text of the file.
 * @returns The inferred shape, possibly empty.
 */
export function inferDataShape(ext: string, content: string): ParsedShape {
  if (ext === "json") {
    return parseJson(content);
  }
  if (ext === "ndjson") {
    return parseNdjson(content);
  }

  const delimiter = ext === "csv" ? "," : ext === "tsv" ? "\t" : undefined;
  return delimiter === undefined ? {} : parseDelimited(content, delimiter);
}
/**
 * Renders one file's metadata as a single bullet line for the agent
 * system prompt. One line per file keeps the prompt small even when
 * many datasets are attached.
 *
 * The line carries the file name, upper-cased extension and
 * human-readable size, plus the row count when known. When column names
 * are known, at most the first 12 are listed, followed by ", ..." if
 * more exist.
 *
 * @param meta - Metadata of the data file.
 * @returns The formatted manifest line.
 */
export function formatManifestLine(meta: EmbedDataFileMeta): string {
  const maxListed = 12;
  const sizeLabel = formatBytes(meta.size);

  let rowsPart = "";
  if (meta.rowCount !== undefined) {
    rowsPart = ` - ${meta.rowCount} rows`;
  }

  let columnsPart = "";
  const names = meta.columns;
  if (names && names.length > 0) {
    const overflow = names.length > maxListed ? ", ..." : "";
    columnsPart = ` - columns: ${names.slice(0, maxListed).join(", ")}${overflow}`;
  }

  return `- ${meta.name} (${meta.ext.toUpperCase()}, ${sizeLabel}${rowsPart})${columnsPart}`;
}
/**
 * Formats a byte count for display using binary units.
 *
 * Values below 1024 are shown as whole bytes, below 1 MiB as KB with one
 * decimal, and everything else as MB with two decimals. The unit is
 * chosen after rounding, so a value just under 1 MiB is shown as
 * "1.00 MB" rather than "1024.0 KB".
 *
 * @param bytes - Size in bytes.
 * @returns A short label such as "512 B", "1.5 KB" or "3.00 MB".
 */
export function formatBytes(bytes: number): string {
  if (bytes < 1024) return `${bytes} B`;
  const kb = (bytes / 1024).toFixed(1);
  // Compare the rounded value: 1023.95+ KB rounds up to "1024.0".
  if (Number(kb) < 1024) return `${kb} KB`;
  return `${(bytes / (1024 * 1024)).toFixed(2)} MB`;
}