File size: 4,602 Bytes
a2c885c
03cc8ff
 
 
a2c885c
03cc8ff
 
 
a2c885c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03cc8ff
a2c885c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03cc8ff
 
a2c885c
 
 
03cc8ff
 
a2c885c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03cc8ff
 
a2c885c
 
 
 
 
 
 
 
03cc8ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN, getCorpora, getLinksRepoPath, getDocRepoPath } from '../../../utils/config.js';

/**
 * Tallies review progress for the pages of a single document.
 *
 * A page only counts when it still has at least one dataset mention after
 * dropping mentions tagged 'non-dataset' whose `dataset_name.judge_agrees`
 * is true. A page is "completed" when every remaining mention has
 * `human_validated === true`.
 *
 * @param {Array<object>} pagesData - Parsed page entries of one document.
 * @returns {{totalPages: number, completedPages: number, totalMentions: number,
 *            verifiedMentions: number, humanAnnotations: number}}
 */
function summarizeDocPages(pagesData) {
    const stats = {
        totalPages: 0,
        completedPages: 0,
        totalMentions: 0,
        verifiedMentions: 0,
        humanAnnotations: 0,
    };

    for (const page of pagesData) {
        const datasets = (page.datasets || []).filter(
            (ds) => !(ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true)
        );

        // Pages without any remaining mention do not take part in the stats.
        if (datasets.length === 0) continue;

        const verifiedOnPage = datasets.filter((ds) => ds.human_validated === true).length;

        stats.totalPages++;
        stats.totalMentions += datasets.length;
        stats.verifiedMentions += verifiedOnPage;
        stats.humanAnnotations += datasets.filter((ds) => ds.source === 'human').length;

        if (verifiedOnPage === datasets.length) {
            stats.completedPages++;
        }
    }

    return stats;
}

/**
 * Loads the links file of one corpus and computes progress for each of its
 * active documents (status 'success' with `has_revalidation === true`),
 * capped at MAX_DOCS_TO_SCAN.
 *
 * A corpus whose links file cannot be fetched successfully, is not valid
 * JSON, or is not a JSON array yields an empty list instead of failing the
 * whole request. Individual documents that fail are skipped and logged.
 *
 * @param {object} corpus - Corpus descriptor as returned by getCorpora().
 * @param {object} headers - Request headers to send with every fetch.
 * @returns {Promise<Array<object>>} Per-document progress records.
 */
async function fetchCorpusDocs(corpus, headers) {
    const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${getLinksRepoPath(corpus)}`;
    const linksRes = await fetch(linksUrl, {
        headers,
        next: { revalidate: 300 }
    });

    if (!linksRes.ok) return [];

    let links;
    try {
        links = await linksRes.json();
    } catch (error) {
        console.warn('Progress API: links file is not valid JSON, skipping corpus', corpus.id, error);
        return [];
    }

    // Without this guard a non-array payload would throw on `.filter` and turn
    // the entire response into a 500, even though other corpora are fine.
    if (!Array.isArray(links)) {
        console.warn('Progress API: links file is not an array, skipping corpus', corpus.id);
        return [];
    }

    const activeLinks = links
        .filter((l) => l?.status === 'success' && l?.has_revalidation === true)
        .slice(0, MAX_DOCS_TO_SCAN);

    const results = await Promise.allSettled(
        activeLinks.map(async (link) => {
            const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${getDocRepoPath(corpus, link.index)}`;
            const docRes = await fetch(docUrl, { headers });
            if (!docRes.ok) return null;

            const pagesData = await docRes.json();
            // A document that is not a list of pages cannot be tallied; skip it.
            if (!Array.isArray(pagesData)) return null;

            const stats = summarizeDocPages(pagesData);

            return {
                corpus: corpus.id,
                index: link.index,
                ...stats,
                complete: stats.totalPages > 0 && stats.completedPages === stats.totalPages,
            };
        })
    );

    const docs = [];
    results.forEach((result, i) => {
        if (result.status === 'rejected') {
            // Keep the best-effort behaviour, but leave a trace of what was dropped.
            console.warn(
                'Progress API: failed to process document',
                corpus.id,
                activeLinks[i].index,
                result.reason
            );
        } else if (result.value !== null) {
            docs.push(result.value);
        }
    });

    return docs;
}

/**
 * GET /api/progress
 * Returns progress stats across ALL corpora.
 *
 * Responds with 200 and a JSON body holding the aggregated totals plus the
 * per-document records under `docs`, or with 500 and `{ error }` when an
 * unexpected error escapes the per-corpus handling.
 *
 * @returns {Promise<Response>} JSON response.
 */
export async function GET() {
    try {
        // Only attach the Authorization header when a token is configured;
        // otherwise the template literal would send "Bearer undefined".
        const token = process.env.HF_TOKEN;
        const headers = token ? { 'Authorization': `Bearer ${token}` } : {};

        const allDocs = [];
        // Corpora are processed one after another; documents within a corpus
        // are fetched concurrently inside fetchCorpusDocs.
        for (const corpus of getCorpora()) {
            const docs = await fetchCorpusDocs(corpus, headers);
            allDocs.push(...docs);
        }

        const sumOf = (key) => allDocs.reduce((total, doc) => total + doc[key], 0);

        const summary = {
            totalDocs: allDocs.length,
            completedDocs: allDocs.filter((d) => d.complete).length,
            totalPages: sumOf('totalPages'),
            completedPages: sumOf('completedPages'),
            totalMentions: sumOf('totalMentions'),
            verifiedMentions: sumOf('verifiedMentions'),
            humanAnnotations: sumOf('humanAnnotations'),
            docs: allDocs,
        };

        return new Response(JSON.stringify(summary), {
            status: 200,
            headers: {
                'Content-Type': 'application/json',
                'Cache-Control': 'public, s-maxage=300, stale-while-revalidate=59'
            }
        });
    } catch (error) {
        console.error('Progress API error:', error);
        return new Response(
            JSON.stringify({ error: 'Failed to compute progress' }),
            { status: 500, headers: { 'Content-Type': 'application/json' } }
        );
    }
}