import type { EmbedDataFileMeta } from "../editor/embeds/embed-data-store";

/**
 * Light-weight client-side parsing of uploaded data files. Extracts
 * column names and row counts so the sidebar and agent system prompt
 * can preview dataset shape without shipping the full content every
 * time.
 *
 * Parsing is deliberately forgiving: it never throws on malformed
 * input, and returns `undefined` metadata fields when shape cannot be
 * inferred.
 */

/** File extensions (lowercase, without the leading dot) treated as data files. */
export const ACCEPTED_DATA_EXTS = ["csv", "tsv", "json", "txt", "ndjson"] as const;

/** Union of the literal extensions listed in {@link ACCEPTED_DATA_EXTS}. */
export type AcceptedDataExt = (typeof ACCEPTED_DATA_EXTS)[number];

/** Size limit for a data file: 3 MiB, expressed in bytes. */
export const MAX_DATA_FILE_SIZE = 3 * 1024 * 1024;

/** Matches any single line terminator: CRLF, LF or a lone CR. */
const LINE_BREAK = /\r\n|\n|\r/;

/** Matches runs of line-break characters embedded inside a single value. */
const EMBEDDED_LINE_BREAKS = /[\r\n]+/g;

/** Maximum number of column names listed in one manifest line. */
const MAX_MANIFEST_COLUMNS = 12;

/**
 * Return the lowercase extension of a file name, without the dot.
 *
 * Only a trailing run of ASCII letters/digits after the last dot counts,
 * so `"a.tar.gz"` yields `"gz"` and `"README"` or `"file."` yield `""`.
 *
 * @param name File name (may include a path).
 * @returns The lowercase extension, or an empty string when there is none.
 */
export function extFromName(name: string): string {
  const match = /\.([a-z0-9]+)$/.exec(name.toLowerCase());
  return match?.[1] ?? "";
}

/**
 * Type guard: is `ext` one of {@link ACCEPTED_DATA_EXTS}?
 *
 * The comparison is case-sensitive; pass a lowercase extension such as
 * the one produced by {@link extFromName}.
 */
export function isAcceptedExt(ext: string): ext is AcceptedDataExt {
  return (ACCEPTED_DATA_EXTS as readonly string[]).includes(ext);
}

/**
 * Split one delimited record into its fields.
 *
 * A double quote toggles quoted mode wherever it appears; inside quoted
 * mode the delimiter is literal and `""` stands for one literal quote.
 * An unterminated quote simply runs to the end of the record.
 *
 * @param line  A single record (no record-separating line break).
 * @param delim Single-character field delimiter.
 * @returns The field values with surrounding quotes removed. Always
 *          contains at least one element (`[""]` for an empty record).
 */
function splitCsvLine(line: string, delim: string): string[] {
  const out: string[] = [];
  let cur = "";
  let inQuotes = false;
  for (let i = 0; i < line.length; i++) {
    const c = line[i];
    if (inQuotes) {
      if (c === '"') {
        if (line[i + 1] === '"') {
          // Escaped quote inside a quoted field: emit one and skip its twin.
          cur += '"';
          i++;
        } else {
          inQuotes = false;
        }
      } else {
        cur += c;
      }
    } else if (c === '"') {
      inQuotes = true;
    } else if (c === delim) {
      out.push(cur);
      cur = "";
    } else {
      cur += c;
    }
  }
  out.push(cur);
  return out;
}

/** Shape summary of a dataset; a field is omitted/undefined when it cannot be inferred. */
interface ParsedShape {
  rowCount?: number;
  columns?: string[];
}

/** Remove a single leading byte-order mark (U+FEFF), which `JSON.parse` rejects. */
function stripBom(text: string): string {
  return text.charCodeAt(0) === 0xfeff ? text.slice(1) : text;
}

/** True for non-null, non-array objects, i.e. values whose keys are column names. */
function isPlainRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Split delimited text into records, keeping line breaks that occur
 * inside double-quoted fields as part of the record.
 *
 * Quote state is tracked the same way as in `splitCsvLine` (every quote
 * toggles it; an escaped `""` toggles twice and so leaves it unchanged).
 * If the quotes are unbalanced at end of input the content is treated
 * as not reliably quoted and is split on every line break instead, so
 * one stray quote cannot swallow the remainder of the file.
 */
function splitRecords(content: string): string[] {
  // Fast path: without quotes every line break is a record boundary.
  if (!content.includes('"')) return content.split(LINE_BREAK);

  const records: string[] = [];
  let start = 0;
  let inQuotes = false;
  for (let i = 0; i < content.length; i++) {
    const c = content[i];
    if (c === '"') {
      inQuotes = !inQuotes;
    } else if (!inQuotes && (c === "\n" || c === "\r")) {
      records.push(content.slice(start, i));
      // Treat CRLF as a single terminator.
      if (c === "\r" && content[i + 1] === "\n") i++;
      start = i + 1;
    }
  }
  if (inQuotes) return content.split(LINE_BREAK);
  records.push(content.slice(start));
  return records;
}

/**
 * Infer the shape of CSV/TSV-style text.
 *
 * The first non-empty record is taken as the header; every other
 * non-empty record counts as one row. Only completely empty records are
 * dropped (a whitespace-only record still counts). Header names are
 * trimmed and any line break embedded in a quoted name is collapsed to a
 * single space.
 *
 * @returns `{}` when there is no non-empty record at all.
 */
function parseDelimited(content: string, delim: string): ParsedShape {
  const records = splitRecords(content).filter((r) => r.length > 0);
  const headerRecord = records[0];
  if (headerRecord === undefined) return {};
  const columns = splitCsvLine(headerRecord, delim).map((c) =>
    c.replace(EMBEDDED_LINE_BREAKS, " ").trim(),
  );
  return {
    columns,
    rowCount: records.length - 1,
  };
}

/**
 * Infer the shape of a JSON document.
 *
 * - Top-level array: `rowCount` is the array length and `columns` are the
 *   keys of the first element that is a plain (non-array) object, or
 *   `undefined` when there is no such element.
 * - Top-level object: `columns` are its keys; no `rowCount`.
 * - Anything else, or unparseable input: `{}`.
 *
 * A leading byte-order mark is ignored.
 */
function parseJson(content: string): ParsedShape {
  let parsed: unknown;
  try {
    parsed = JSON.parse(stripBom(content));
  } catch {
    // Malformed JSON: report an empty shape rather than throwing.
    return {};
  }
  if (Array.isArray(parsed)) {
    const first = parsed.find(isPlainRecord);
    return {
      rowCount: parsed.length,
      columns: first ? Object.keys(first) : undefined,
    };
  }
  if (isPlainRecord(parsed)) {
    return { columns: Object.keys(parsed) };
  }
  return {};
}

/**
 * Infer the shape of newline-delimited JSON.
 *
 * Every line that is not blank/whitespace-only counts as a row. Columns
 * come from the first such line when it parses to a plain object;
 * otherwise `columns` is `undefined` but the row count is still
 * reported. A leading byte-order mark is ignored.
 *
 * @returns `{}` when there is no non-blank line.
 */
function parseNdjson(content: string): ParsedShape {
  const lines = stripBom(content)
    .split(LINE_BREAK)
    .filter((l) => l.trim().length > 0);
  const firstLine = lines[0];
  if (firstLine === undefined) return {};
  let columns: string[] | undefined;
  try {
    const first: unknown = JSON.parse(firstLine);
    if (isPlainRecord(first)) {
      columns = Object.keys(first);
    }
  } catch {
    // First line is not valid JSON: keep the row count, omit the columns.
  }
  return { rowCount: lines.length, columns };
}

/**
 * Infer row count and column names for a data file's text content.
 *
 * @param ext     Lowercase extension without the dot (matched
 *                case-sensitively): `csv`, `tsv`, `json` or `ndjson`.
 * @param content Full text of the file.
 * @returns The inferred shape; `{}` for any other extension (including
 *          `txt`) or when nothing can be inferred. Never throws.
 */
export function inferDataShape(ext: string, content: string): ParsedShape {
  switch (ext) {
    case "csv":
      return parseDelimited(content, ",");
    case "tsv":
      return parseDelimited(content, "\t");
    case "json":
      return parseJson(content);
    case "ndjson":
      return parseNdjson(content);
    default:
      return {};
  }
}

/**
 * Build a lightweight manifest line per file suitable for inclusion in
 * the agent system prompt. Keeps it to one line per file so many
 * datasets can coexist without exploding prompt size.
 *
 * The line contains the name, upper-cased extension, human-readable
 * size, the row count when known, and at most the first 12 column
 * names (followed by `...` when more exist).
 */
export function formatManifestLine(meta: EmbedDataFileMeta): string {
  const size = formatBytes(meta.size);
  const shape = meta.rowCount !== undefined ? ` - ${meta.rowCount} rows` : "";
  let cols = "";
  if (meta.columns && meta.columns.length > 0) {
    const listed = meta.columns.slice(0, MAX_MANIFEST_COLUMNS).join(", ");
    const more = meta.columns.length > MAX_MANIFEST_COLUMNS ? ", ..." : "";
    cols = ` - columns: ${listed}${more}`;
  }
  return `- ${meta.name} (${meta.ext.toUpperCase()}, ${size}${shape})${cols}`;
}

/**
 * Format a byte count using binary units: `B` (exact), `KB` (one
 * decimal) or `MB` (two decimals).
 *
 * A value that would round up to `1024.0 KB` is shown in MB instead, so
 * the output never displays a kilobyte figure of 1024 or more.
 */
export function formatBytes(bytes: number): string {
  if (bytes < 1024) return `${bytes} B`;
  const kb = (bytes / 1024).toFixed(1);
  // Compare the rounded text, not the raw quotient, to catch the 1023.95+ edge.
  if (Number(kb) < 1024) return `${kb} KB`;
  return `${(bytes / (1024 * 1024)).toFixed(2)} MB`;
}