// Last commit: "feat: multi-corpus support" (a2c885c, rafmacalaba)
import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN, getCorpora, getLinksRepoPath, getDocRepoPath } from '../../../utils/config.js';
/**
 * GET /api/progress
 *
 * Returns progress stats across ALL corpora.
 *
 * Each corpus returned by `getCorpora()` is scanned in turn. A corpus whose
 * links file cannot be fetched or is not a JSON array is skipped instead of
 * failing the whole request, so the summary covers whatever could be read.
 *
 * @returns {Promise<Response>} 200 with a JSON body containing `totalDocs`,
 *   `completedDocs`, `totalPages`, `completedPages`, `totalMentions`,
 *   `verifiedMentions`, `humanAnnotations` and the per-document `docs` list;
 *   500 with `{ error }` if an unexpected error escapes the scan.
 */
export async function GET() {
  try {
    const allDocs = [];
    // Corpora are scanned one at a time, so at most MAX_DOCS_TO_SCAN document
    // requests are in flight at once.
    for (const corpus of getCorpora()) {
      allDocs.push(...(await scanCorpus(corpus)));
    }

    const sumOf = (key) => allDocs.reduce((total, doc) => total + doc[key], 0);
    const summary = {
      totalDocs: allDocs.length,
      completedDocs: allDocs.filter((doc) => doc.complete).length,
      totalPages: sumOf('totalPages'),
      completedPages: sumOf('completedPages'),
      totalMentions: sumOf('totalMentions'),
      verifiedMentions: sumOf('verifiedMentions'),
      humanAnnotations: sumOf('humanAnnotations'),
      docs: allDocs,
    };

    return new Response(JSON.stringify(summary), {
      status: 200,
      headers: {
        'Content-Type': 'application/json',
        'Cache-Control': 'public, s-maxage=300, stale-while-revalidate=59'
      }
    });
  } catch (error) {
    console.error('Progress API error:', error);
    return new Response(
      JSON.stringify({ error: 'Failed to compute progress' }),
      { status: 500, headers: { 'Content-Type': 'application/json' } }
    );
  }
}

/**
 * Builds the request headers for dataset fetches.
 *
 * The Authorization header is only included when HF_TOKEN is set; otherwise
 * the request would carry the literal string "Bearer undefined".
 *
 * @returns {Record<string, string>} Headers to pass to `fetch`.
 */
function buildAuthHeaders() {
  const token = process.env.HF_TOKEN;
  return token ? { 'Authorization': `Bearer ${token}` } : {};
}

/**
 * Tallies annotation progress over the pages of one document.
 *
 * Datasets tagged 'non-dataset' whose `dataset_name.judge_agrees` is true are
 * excluded from every count. Pages left with no datasets are not counted.
 * A page is complete when all of its remaining datasets are human-validated.
 *
 * @param {Array<object>} pagesData - Parsed page entries of a document.
 * @returns {{totalPages: number, completedPages: number, totalMentions: number,
 *   verifiedMentions: number, humanAnnotations: number, complete: boolean}}
 */
function summarizePages(pagesData) {
  let totalPages = 0;
  let completedPages = 0;
  let totalMentions = 0;
  let verifiedMentions = 0;
  let humanAnnotations = 0;

  for (const page of pagesData) {
    const datasets = (page.datasets || []).filter(
      (ds) => !(ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true)
    );
    if (datasets.length === 0) continue;

    const pageVerified = datasets.filter((ds) => ds.human_validated === true).length;
    totalPages++;
    totalMentions += datasets.length;
    verifiedMentions += pageVerified;
    humanAnnotations += datasets.filter((ds) => ds.source === 'human').length;
    if (pageVerified === datasets.length) {
      completedPages++;
    }
  }

  return {
    totalPages,
    completedPages,
    totalMentions,
    verifiedMentions,
    humanAnnotations,
    // A document with no countable pages is never reported as complete.
    complete: totalPages > 0 && completedPages === totalPages,
  };
}

/**
 * Fetches one document and computes its progress entry.
 *
 * @param {object} corpus - Corpus descriptor; `corpus.id` is copied to the result.
 * @param {object} link - Links-file entry; `link.index` selects the document.
 * @returns {Promise<object|null>} The per-document stats, or null when the
 *   document responds with a non-OK status or its body is not a JSON array.
 */
async function scanDocument(corpus, link) {
  const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${getDocRepoPath(corpus, link.index)}`;
  const docRes = await fetch(docUrl, { headers: buildAuthHeaders() });
  if (!docRes.ok) return null;

  const pagesData = await docRes.json();
  if (!Array.isArray(pagesData)) return null;

  return { corpus: corpus.id, index: link.index, ...summarizePages(pagesData) };
}

/**
 * Scans one corpus: loads its links file, then scans up to MAX_DOCS_TO_SCAN
 * documents whose link has status 'success' and has_revalidation === true.
 *
 * Failures are contained here: an unreachable or malformed links file yields
 * an empty list, and a document whose scan rejects is logged and left out.
 *
 * @param {object} corpus - Corpus descriptor as returned by `getCorpora()`.
 * @returns {Promise<Array<object>>} Progress entries for the readable documents.
 */
async function scanCorpus(corpus) {
  // Built outside the try so a path/config error still surfaces to the caller.
  const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${getLinksRepoPath(corpus)}`;

  let links;
  try {
    const linksRes = await fetch(linksUrl, {
      headers: buildAuthHeaders(),
      next: { revalidate: 300 }
    });
    if (!linksRes.ok) return [];
    links = await linksRes.json();
  } catch (error) {
    console.error(`Progress API: could not load links for corpus "${corpus.id}":`, error);
    return [];
  }

  if (!Array.isArray(links)) {
    console.error(`Progress API: links for corpus "${corpus.id}" are not an array; skipping`);
    return [];
  }

  const activeLinks = links
    .filter((link) => link.status === 'success' && link.has_revalidation === true)
    .slice(0, MAX_DOCS_TO_SCAN);

  const results = await Promise.allSettled(
    activeLinks.map((link) => scanDocument(corpus, link))
  );

  const docs = [];
  results.forEach((result, i) => {
    if (result.status === 'rejected') {
      console.error(
        `Progress API: failed to scan doc ${activeLinks[i].index} of corpus "${corpus.id}":`,
        result.reason
      );
    } else if (result.value !== null) {
      docs.push(result.value);
    }
  });
  return docs;
}