vectorplasticity committed on
Commit
4cecf14
·
verified ·
1 Parent(s): 76a4048

Add dataset map configuration

Browse files
Files changed (1) hide show
  1. app/config/dataset_map.json +416 -0
app/config/dataset_map.json ADDED
@@ -0,0 +1,416 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "datasets": {
3
+ "causal-lm": [
4
+ {
5
+ "id": "wikitext",
6
+ "name": "WikiText",
7
+ "description": "Wikipedia text dataset for language modeling",
8
+ "configs": ["wikitext-2-raw-v1", "wikitext-103-raw-v1"],
9
+ "splits": ["train", "validation", "test"],
10
+ "text_column": "text",
11
+ "default_config": "wikitext-2-raw-v1",
12
+ "size_categories": ["100K-1M", "1M-10M"],
13
+ "recommended": true
14
+ },
15
+ {
16
+ "id": "openwebtext",
17
+ "name": "OpenWebText",
18
+ "description": "Open source recreation of WebText dataset",
19
+ "configs": [],
20
+ "splits": ["train"],
21
+ "text_column": "text",
22
+ "size_categories": [">10M"],
23
+ "recommended": true
24
+ },
25
+ {
26
+ "id": "the_pile",
27
+ "name": "The Pile",
28
+ "description": "Large-scale text corpus for language modeling",
29
+ "configs": ["all", "enron_emails", "europarl", "hacker_news", "pubmed", "ubuntu_irc"],
30
+ "splits": ["train", "validation", "test"],
31
+ "text_column": "text",
32
+ "size_categories": [">10M"],
33
+ "recommended": false
34
+ },
35
+ {
36
+ "id": "c4",
37
+ "name": "C4 (Colossal Clean Crawled Corpus)",
38
+ "description": "Huge cleaned web text dataset",
39
+ "configs": ["en", "realnewslike", "en.noblocklist", "en.noclean"],
40
+ "splits": ["train", "validation"],
41
+ "text_column": "text",
42
+ "size_categories": [">10M"],
43
+ "recommended": false
44
+ },
45
+ {
46
+ "id": "tiny_shakespeare",
47
+ "name": "Tiny Shakespeare",
48
+ "description": "Small Shakespeare text for quick testing",
49
+ "configs": [],
50
+ "splits": ["train", "validation", "test"],
51
+ "text_column": "text",
52
+ "size_categories": ["<10K"],
53
+ "recommended": true
54
+ }
55
+ ],
56
+ "seq2seq": [
57
+ {
58
+ "id": "cnn_dailymail",
59
+ "name": "CNN/DailyMail",
60
+ "description": "News article summarization dataset",
61
+ "configs": ["1.0.0", "2.0.0", "3.0.0"],
62
+ "splits": ["train", "validation", "test"],
63
+ "text_column": "article",
64
+ "label_column": "highlights",
65
+ "default_config": "3.0.0",
66
+ "size_categories": ["100K-1M"],
67
+ "recommended": true
68
+ },
69
+ {
70
+ "id": "xsum",
71
+ "name": "XSum",
72
+ "description": "BBC article summarization",
73
+ "configs": [],
74
+ "splits": ["train", "validation", "test"],
75
+ "text_column": "document",
76
+ "label_column": "summary",
77
+ "size_categories": ["10K-100K"],
78
+ "recommended": true
79
+ },
80
+ {
81
+ "id": "samsum",
82
+ "name": "SAMSum",
83
+ "description": "Dialogue summarization dataset",
84
+ "configs": [],
85
+ "splits": ["train", "validation", "test"],
86
+ "text_column": "dialogue",
87
+ "label_column": "summary",
88
+ "size_categories": ["10K-100K"],
89
+ "recommended": true
90
+ },
91
+ {
92
+ "id": "wmt16",
93
+ "name": "WMT16 Translation",
94
+ "description": "Machine translation dataset",
95
+ "configs": ["de-en", "en-de", "ro-en", "en-ro", "cs-en", "en-cs"],
96
+ "splits": ["train", "validation", "test"],
97
+ "size_categories": ["1M-10M"],
98
+ "recommended": false
99
+ },
100
+ {
101
+ "id": "billsum",
102
+ "name": "BillSum",
103
+ "description": "US Congressional bill summarization",
104
+ "configs": [],
105
+ "splits": ["train", "test"],
106
+ "text_column": "text",
107
+ "label_column": "summary",
108
+ "size_categories": ["10K-100K"],
109
+ "recommended": true
110
+ }
111
+ ],
112
+ "token-classification": [
113
+ {
114
+ "id": "conll2003",
115
+ "name": "CoNLL-2003",
116
+ "description": "Named entity recognition dataset",
117
+ "configs": [],
118
+ "splits": ["train", "validation", "test"],
119
+ "text_column": "tokens",
120
+ "label_column": "ner_tags",
121
+ "labels": ["O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC"],
122
+ "size_categories": ["10K-100K"],
123
+ "recommended": true
124
+ },
125
+ {
126
+ "id": "wnut_17",
127
+ "name": "WNUT 17",
128
+ "description": "Emerging entity recognition from social media",
129
+ "configs": [],
130
+ "splits": ["train", "validation", "test"],
131
+ "text_column": "tokens",
132
+ "label_column": "ner_tags",
133
+ "labels": ["O", "B-corporation", "B-creative-work", "B-group", "B-location", "B-person", "B-product", "I-corporation", "I-creative-work", "I-group", "I-location", "I-person", "I-product"],
134
+ "size_categories": ["<10K"],
135
+ "recommended": true
136
+ },
137
+ {
138
+ "id": "ontonotes5",
139
+ "name": "OntoNotes 5.0",
140
+ "description": "Multi-genre NER and coreference",
141
+ "configs": ["english_v4", "english_v12", "chinese_v4", "arabic_v4"],
142
+ "splits": ["train", "validation", "test"],
143
+ "text_column": "document",
144
+ "label_column": "named_entities",
145
+ "size_categories": ["100K-1M"],
146
+ "recommended": false
147
+ }
148
+ ],
149
+ "text-classification": [
150
+ {
151
+ "id": "imdb",
152
+ "name": "IMDB",
153
+ "description": "Movie review sentiment classification",
154
+ "configs": [],
155
+ "splits": ["train", "test", "unsupervised"],
156
+ "text_column": "text",
157
+ "label_column": "label",
158
+ "labels": ["negative", "positive"],
159
+ "size_categories": ["10K-100K"],
160
+ "recommended": true
161
+ },
162
+ {
163
+ "id": "yelp_polarity",
164
+ "name": "Yelp Polarity",
165
+ "description": "Yelp review sentiment classification",
166
+ "configs": [],
167
+ "splits": ["train", "test"],
168
+ "text_column": "text",
169
+ "label_column": "label",
170
+ "labels": ["negative", "positive"],
171
+ "size_categories": ["100K-1M"],
172
+ "recommended": true
173
+ },
174
+ {
175
+ "id": "ag_news",
176
+ "name": "AG News",
177
+ "description": "News article categorization",
178
+ "configs": [],
179
+ "splits": ["train", "test"],
180
+ "text_column": "text",
181
+ "label_column": "label",
182
+ "labels": ["World", "Sports", "Business", "Sci/Tech"],
183
+ "size_categories": ["100K-1M"],
184
+ "recommended": true
185
+ },
186
+ {
187
+ "id": "glue",
188
+ "name": "GLUE",
189
+ "description": "General Language Understanding Evaluation",
190
+ "configs": ["cola", "mnli", "mnli_matched", "mnli_mismatched", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"],
191
+ "splits": ["train", "validation", "test"],
192
+ "size_categories": ["varies"],
193
+ "recommended": true
194
+ },
195
+ {
196
+ "id": "emotion",
197
+ "name": "Emotion",
198
+ "description": "Twitter emotion classification",
199
+ "configs": [],
200
+ "splits": ["train", "validation", "test"],
201
+ "text_column": "text",
202
+ "label_column": "label",
203
+ "labels": ["sadness", "joy", "love", "anger", "fear", "surprise"],
204
+ "size_categories": ["10K-100K"],
205
+ "recommended": true
206
+ }
207
+ ],
208
+ "question-answering": [
209
+ {
210
+ "id": "squad",
211
+ "name": "SQuAD",
212
+ "description": "Stanford Question Answering Dataset",
213
+ "configs": ["plain_text"],
214
+ "splits": ["train", "validation"],
215
+ "text_column": "context",
216
+ "question_column": "question",
217
+ "answer_column": "answers",
218
+ "size_categories": ["10K-100K"],
219
+ "recommended": true
220
+ },
221
+ {
222
+ "id": "squad_v2",
223
+ "name": "SQuAD 2.0",
224
+ "description": "SQuAD with unanswerable questions",
225
+ "configs": ["squad_v2"],
226
+ "splits": ["train", "validation"],
227
+ "size_categories": ["100K-1M"],
228
+ "recommended": true
229
+ },
230
+ {
231
+ "id": "natural_questions",
232
+ "name": "Natural Questions",
233
+ "description": "Real user questions with Wikipedia answers",
234
+ "configs": ["default"],
235
+ "splits": ["train", "validation"],
236
+ "size_categories": [">10M"],
237
+ "recommended": false
238
+ },
239
+ {
240
+ "id": "coqa",
241
+ "name": "CoQA",
242
+ "description": "Conversational Question Answering",
243
+ "configs": [],
244
+ "splits": ["train", "validation"],
245
+ "size_categories": ["100K-1M"],
246
+ "recommended": true
247
+ }
248
+ ],
249
+ "translation": [
250
+ {
251
+ "id": "wmt14",
252
+ "name": "WMT14 Translation",
253
+ "description": "Large-scale machine translation",
254
+ "configs": ["de-en", "en-de", "fr-en", "en-fr"],
255
+ "splits": ["train", "validation", "test"],
256
+ "size_categories": [">10M"],
257
+ "recommended": false
258
+ },
259
+ {
260
+ "id": "opus100",
261
+ "name": "OPUS-100",
262
+ "description": "Multi-lingual parallel corpus",
263
+ "configs": ["en-de", "en-fr", "en-es", "en-ru", "en-zh"],
264
+ "splits": ["train", "validation", "test"],
265
+ "size_categories": ["1M-10M"],
266
+ "recommended": true
267
+ }
268
+ ],
269
+ "image-classification": [
270
+ {
271
+ "id": "cifar10",
272
+ "name": "CIFAR-10",
273
+ "description": "10-class image classification",
274
+ "configs": [],
275
+ "splits": ["train", "test"],
276
+ "image_column": "img",
277
+ "label_column": "label",
278
+ "labels": ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"],
279
+ "size_categories": ["10K-100K"],
280
+ "recommended": true
281
+ },
282
+ {
283
+ "id": "imagenet-1k",
284
+ "name": "ImageNet-1k",
285
+ "description": "Large-scale image classification",
286
+ "configs": [],
287
+ "splits": ["train", "validation"],
288
+ "image_column": "image",
289
+ "label_column": "label",
290
+ "size_categories": [">10M"],
291
+ "recommended": false
292
+ }
293
+ ]
294
+ },
295
+ "models": {
296
+ "causal-lm": {
297
+ "small": [
298
+ {"id": "gpt2", "params": "124M", "recommended": true},
299
+ {"id": "distilgpt2", "params": "82M", "recommended": true},
300
+ {"id": "EleutherAI/gpt-neo-125M", "params": "125M", "recommended": true},
301
+ {"id": "bigscience/bloom-560m", "params": "560M", "recommended": true}
302
+ ],
303
+ "medium": [
304
+ {"id": "gpt2-medium", "params": "355M", "recommended": true},
305
+ {"id": "gpt2-large", "params": "774M", "recommended": true},
306
+ {"id": "EleutherAI/gpt-neo-1.3B", "params": "1.3B", "recommended": true},
307
+ {"id": "EleutherAI/gpt-j-6b", "params": "6B", "recommended": false},
308
+ {"id": "bigscience/bloom-1b7", "params": "1.7B", "recommended": true},
309
+ {"id": "meta-llama/Llama-2-7b-hf", "params": "7B", "recommended": true},
310
+ {"id": "mistralai/Mistral-7B-v0.1", "params": "7B", "recommended": true}
311
+ ],
312
+ "large": [
313
+ {"id": "EleutherAI/gpt-neox-20b", "params": "20B", "recommended": false},
314
+ {"id": "bigscience/bloom", "params": "176B", "recommended": false},
315
+ {"id": "meta-llama/Llama-2-13b-hf", "params": "13B", "recommended": false},
316
+ {"id": "meta-llama/Llama-2-70b-hf", "params": "70B", "recommended": false}
317
+ ]
318
+ },
319
+ "seq2seq": {
320
+ "small": [
321
+ {"id": "google-t5/t5-small", "params": "60M", "recommended": true},
322
+ {"id": "facebook/bart-base", "params": "140M", "recommended": true},
323
+ {"id": "google/flan-t5-small", "params": "80M", "recommended": true}
324
+ ],
325
+ "medium": [
326
+ {"id": "google-t5/t5-base", "params": "220M", "recommended": true},
327
+ {"id": "facebook/bart-large", "params": "400M", "recommended": true},
328
+ {"id": "google/flan-t5-base", "params": "250M", "recommended": true},
329
+ {"id": "google/flan-t5-large", "params": "780M", "recommended": true},
330
+ {"id": "google-t5/t5-large", "params": "770M", "recommended": true}
331
+ ],
332
+ "large": [
333
+ {"id": "google-t5/t5-3b", "params": "3B", "recommended": false},
334
+ {"id": "google/flan-t5-xl", "params": "3B", "recommended": false},
335
+ {"id": "facebook/bart-large-cnn", "params": "400M", "recommended": true}
336
+ ]
337
+ },
338
+ "token-classification": {
339
+ "small": [
340
+ {"id": "dslim/bert-base-NER", "params": "110M", "recommended": true},
341
+ {"id": "dslim/distilbert-NER", "params": "66M", "recommended": true},
342
+ {"id": "dbmdz/bert-large-cased-finetuned-conll03-english", "params": "340M", "recommended": true}
343
+ ],
344
+ "medium": [
345
+ {"id": "dslim/bert-base-NER", "params": "110M", "recommended": true},
346
+ {"id": "elastic/distilbert-base-uncased-finetuned-conll03-english", "params": "66M", "recommended": true}
347
+ ]
348
+ },
349
+ "text-classification": {
350
+ "small": [
351
+ {"id": "distilbert/distilbert-base-uncased", "params": "66M", "recommended": true},
352
+ {"id": "google-bert/bert-base-uncased", "params": "110M", "recommended": true},
353
+ {"id": "roberta-base", "params": "125M", "recommended": true}
354
+ ],
355
+ "medium": [
356
+ {"id": "google-bert/bert-large-uncased", "params": "340M", "recommended": true},
357
+ {"id": "roberta-large", "params": "355M", "recommended": true},
358
+ {"id": "microsoft/deberta-v3-base", "params": "184M", "recommended": true}
359
+ ]
360
+ },
361
+ "question-answering": {
362
+ "small": [
363
+ {"id": "distilbert/distilbert-base-uncased-distilled-squad", "params": "66M", "recommended": true},
364
+ {"id": "deepset/minilm-uncased-squad2", "params": "33M", "recommended": true}
365
+ ],
366
+ "medium": [
367
+ {"id": "deepset/roberta-base-squad2", "params": "125M", "recommended": true},
368
+ {"id": "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", "params": "340M", "recommended": true}
369
+ ]
370
+ }
371
+ },
372
+ "task_metadata": {
373
+ "causal-lm": {
374
+ "display_name": "Causal Language Modeling",
375
+ "description": "Generate text, autocomplete, story writing",
376
+ "icon": "text_fields",
377
+ "metrics": ["perplexity", "accuracy", "f1"],
378
+ "requires_decoder_only": true
379
+ },
380
+ "seq2seq": {
381
+ "display_name": "Sequence-to-Sequence",
382
+ "description": "Summarization, translation, paraphrase",
383
+ "icon": "compare_arrows",
384
+ "metrics": ["rouge1", "rouge2", "rougeL", "bleu", "meteor"],
385
+ "requires_encoder_decoder": true
386
+ },
387
+ "token-classification": {
388
+ "display_name": "Token Classification",
389
+ "description": "Named entity recognition, POS tagging",
390
+ "icon": "label",
391
+ "metrics": ["precision", "recall", "f1", "accuracy"],
392
+ "requires_encoder": true
393
+ },
394
+ "text-classification": {
395
+ "display_name": "Text Classification",
396
+ "description": "Sentiment analysis, topic classification",
397
+ "icon": "category",
398
+ "metrics": ["accuracy", "f1", "precision", "recall"],
399
+ "requires_encoder": true
400
+ },
401
+ "question-answering": {
402
+ "display_name": "Question Answering",
403
+ "description": "Extractive and generative QA",
404
+ "icon": "help",
405
+ "metrics": ["exact_match", "f1"],
406
+ "requires_encoder": true
407
+ },
408
+ "translation": {
409
+ "display_name": "Translation",
410
+ "description": "Machine translation between languages",
411
+ "icon": "translate",
412
+ "metrics": ["bleu", "meteor", "chrf"],
413
+ "requires_encoder_decoder": true
414
+ }
415
+ }
416
+ }