MgGladys committed on
Commit
1ffaeb6
·
verified ·
1 Parent(s): 43feefe

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. adhoc/debug/iterable_dataset_drop_last_batch.py +55 -0
  2. adhoc/eval_mteb/e5mistral_prompt.py +143 -0
  3. adhoc/eval_mteb/merge_cqadupstack.py +80 -0
  4. adhoc/eval_mteb/mteb_utils.py +348 -0
  5. adhoc/eval_mteb/run_mteb.py +198 -0
  6. adhoc/gather_score_byckpt_aws.py +136 -0
  7. adhoc/hf_datasets.py +37 -0
  8. adhoc/merge_checkpoint.py +26 -0
  9. adhoc/plot.py +31 -0
  10. adhoc/plot2.py +47 -0
  11. adhoc/test_ddp.py +24 -0
  12. adhoc/testset_stats.py +66 -0
  13. adhoc/visual_doc/category_colpali_training.py +27 -0
  14. adhoc/visual_doc/category_visrag_training.py +38 -0
  15. adhoc/visual_doc/check_corpus.py +7 -0
  16. adhoc/visual_doc/mmdoclong-doc.py +124 -0
  17. adhoc/visual_doc/mmdoclong.py +124 -0
  18. adhoc/visual_doc/vidoseek.py +117 -0
  19. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/added_tokens.json +24 -0
  20. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/chat_template.jinja +7 -0
  21. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/merges.txt +0 -0
  22. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/preprocessor_config.json +29 -0
  23. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/special_tokens_map.json +31 -0
  24. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/tokenizer_config.json +208 -0
  25. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/trainer_state.json +734 -0
  26. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/vocab.json +0 -0
  27. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-400/special_tokens_map.json +31 -0
  28. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/added_tokens.json +24 -0
  29. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/chat_template.jinja +7 -0
  30. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/merges.txt +0 -0
  31. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/preprocessor_config.json +29 -0
  32. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/special_tokens_map.json +31 -0
  33. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/tokenizer_config.json +208 -0
  34. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/trainer_state.json +3534 -0
  35. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/vocab.json +0 -0
  36. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/added_tokens.json +24 -0
  37. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/chat_template.jinja +7 -0
  38. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/preprocessor_config.json +29 -0
  39. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/special_tokens_map.json +31 -0
  40. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/tokenizer_config.json +208 -0
  41. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/trainer_state.json +0 -0
  42. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/added_tokens.json +24 -0
  43. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/chat_template.jinja +7 -0
  44. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/merges.txt +0 -0
  45. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/preprocessor_config.json +29 -0
  46. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/special_tokens_map.json +31 -0
  47. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/tokenizer_config.json +208 -0
  48. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/trainer_state.json +0 -0
  49. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/vocab.json +0 -0
  50. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-800/added_tokens.json +24 -0
adhoc/debug/iterable_dataset_drop_last_batch.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import Dataset
2
+ from datasets import interleave_datasets
3
+ from torch.utils.data import DataLoader
4
+
5
def convert_to_str(batch, dataset_name):
    """Prefix every value in the batched column 'a' with the dataset name.

    Mutates and returns the same batch dict (datasets .map contract).
    """
    prefixed = [f"{dataset_name}-{item}" for item in batch['a']]
    batch['a'] = prefixed
    return batch
8
+
9
def gen1():
    """Yield 24 toy rows {"a": 1..24} for the first demo dataset."""
    for value in range(1, 25):
        yield {"a": value}
12
+
13
def gen2():
    """Yield 24 toy rows {"a": 1..24} for the second demo dataset."""
    for value in range(1, 25):
        yield {"a": value}
16
+
17
# https://github.com/huggingface/datasets/issues/6565
if __name__ == '__main__':
    # Two shardable copies of the same 24-row toy dataset, tagged "a" and "b".
    dataset1 = Dataset.from_generator(gen1).to_iterable_dataset(num_shards=2)
    dataset2 = Dataset.from_generator(gen2).to_iterable_dataset(num_shards=2)
    # drop_last_batch=True drops the trailing partial batch of each 24-row
    # stream (batch_size=10), so only 20 rows per dataset survive the map.
    dataset1 = dataset1.map(lambda x: convert_to_str(x, dataset_name="a"), batched=True, batch_size=10, drop_last_batch=True)
    dataset2 = dataset2.map(lambda x: convert_to_str(x, dataset_name="b"), batched=True, batch_size=10, drop_last_batch=True)

    interleaved = interleave_datasets([dataset1, dataset2], stopping_strategy="all_exhausted")

    # The original repeated the same probe four times with num_workers 0..3;
    # fold the copy-pasted loops into one, preserving the exact output order.
    for num_workers in range(4):
        if num_workers > 0:
            print('=-' * 20)
        print(f"num_workers={num_workers}")
        loader = DataLoader(interleaved, batch_size=5, num_workers=num_workers)
        for i, b in enumerate(loader):
            print(i, b['a'])
adhoc/eval_mteb/e5mistral_prompt.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from typing import Dict
3
+
4
def load_e5mistral_prompt(task_name, task_type, *args, **kwargs):
    """Build the E5-Mistral prompt dict for a task.

    Returns {"q_prompt": <instruction prefix>, "d_prompt": ""} — only the
    query side is instructed; documents get no prefix.
    """
    if task_type is None:
        task_type = "Retrieval"
    # Strip size-variant suffixes so e.g. "Foo_small" resolves like "Foo".
    if task_name.endswith(("_small", "_s", "_xs")):
        task_name = task_name[:task_name.rindex("_")]
    # All per-forum CQADupstack subsets share one instruction.
    if task_name.startswith("cqadupstack-"):
        task_name = "cqadupstack"
    instruction = get_task_def_by_task_name_and_type(task_name=task_name, task_type=task_type)
    return {"q_prompt": get_detailed_instruct(instruction), "d_prompt": ""}
15
+
16
+
17
def get_task_def_by_task_name_and_type(task_type: str, task_name: str) -> str:
    """Return the one-line E5-Mistral instruction for a (task_name, task_type) pair.

    NOTE(review): parameter order is (task_type, task_name), the reverse of the
    mteb_utils variant of this function — call with keyword arguments.

    Raises:
        KeyError: if the task_type is handled but task_name is not mapped.
        ValueError: if no instruction is configured for the task_type.
    """
    # @ruimeng added
    if task_name.lower() in ['nli', 'allnli']:
        return "Retrieve a sentence that is semantically entailed by the given sentence."

    if task_type in ['STS', 'sts']:
        return "Retrieve semantically similar text."

    if task_type in ['Summarization', 'summarization']:
        return "Given a news summary, retrieve other semantically similar summaries"

    if task_type in ['BitextMining', 'bitextmining']:
        return "Retrieve parallel sentences."

    if task_type in ['Classification', 'classification']:
        task_name_to_instruct: Dict[str, str] = {
            'AmazonCounterfactualClassification': 'Classify a given Amazon customer review text as either counterfactual or not-counterfactual',
            'AmazonPolarityClassification': 'Classify Amazon reviews into positive or negative sentiment',
            'AmazonReviewsClassification': 'Classify the given Amazon review into its appropriate rating category',
            'AmazonReviewsPairClassification': 'Given an Amazon review, locate reviews within the same rating category',
            'Banking77Classification': 'Given a online banking query, find the corresponding intents',
            'EmotionClassification': 'Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise',
            'EmotionPairClassification': 'Given an Twitter message, locate message within the same emotion category',
            'ImdbClassification': 'Classify the sentiment expressed in the given movie review text from the IMDB dataset',
            'MassiveIntentClassification': 'Given a user utterance as query, find the user intents',
            'MassiveScenarioClassification': 'Given a user utterance as query, find the user scenarios',
            'MTOPDomainClassification': 'Classify the intent domain of the given utterance in task-oriented conversation',
            'MTOPIntentClassification': 'Classify the intent of the given utterance in task-oriented conversation',
            'MTOPIntentPairClassification': 'Given an utterance in task-oriented conversation, locate utterance within the same intent category',
            'ToxicConversationsClassification': 'Classify the given comments as either toxic or not toxic',
            'ToxicConversationsPairClassification': 'Given an comment as toxic or non-toxic, locate comments within the same category',
            'TweetSentimentExtractionClassification': 'Classify the sentiment of a given tweet as either positive, negative, or neutral',
            'TweetSentimentPairClassification': 'Given an comment as either positive, negative, or neutral, locate comments within the same category',
        }
        return task_name_to_instruct[task_name]

    if task_type in ['Clustering', 'clustering']:
        task_name_to_instruct: Dict[str, str] = {
            'ArxivClusteringP2P': 'Identify the main and secondary category of Arxiv papers based on the titles and abstracts',
            'ArxivClusteringS2S': 'Identify the main and secondary category of Arxiv papers based on the titles',
            'BiorxivClusteringP2P': 'Identify the main category of Biorxiv papers based on the titles and abstracts',
            'BiorxivClusteringS2S': 'Identify the main category of Biorxiv papers based on the titles',
            'MedrxivClusteringP2P': 'Identify the main category of Medrxiv papers based on the titles and abstracts',
            'MedrxivClusteringS2S': 'Identify the main category of Medrxiv papers based on the titles',
            'RedditClustering': 'Identify the topic or theme of Reddit posts based on the titles',
            'RedditClusteringP2P': 'Identify the topic or theme of Reddit posts based on the titles and posts',
            'StackExchangeClustering': 'Identify the topic or theme of StackExchange posts based on the titles',
            'StackExchangeClusteringP2P': 'Identify the topic or theme of StackExchange posts based on the given paragraphs',
            'TwentyNewsgroupsClustering': 'Identify the topic or theme of the given news articles',
        }
        return task_name_to_instruct[task_name]

    if task_type in ['Reranking', 'PairClassification', 'reranking', 'pairclassification']:
        task_name_to_instruct: Dict[str, str] = {
            'AskUbuntuDupQuestions': 'Retrieve duplicate questions from AskUbuntu forum',
            'MindSmallReranking': 'Retrieve relevant news articles based on user browsing history',
            'SciDocsRR': 'Given a title of a scientific paper, retrieve the titles of other relevant papers',
            'StackOverflowDupQuestions': 'Retrieve duplicate questions from StackOverflow forum',
            'SprintDuplicateQuestions': 'Retrieve duplicate questions from Sprint forum',
            'TwitterSemEval2015': 'Retrieve tweets that are semantically similar to the given tweet',
            'TwitterURLCorpus': 'Retrieve tweets that are semantically similar to the given tweet',
        }
        return task_name_to_instruct[task_name]

    if task_type in ['Retrieval', 'retrieval']:
        # All CQADupstack subsets (any casing/suffix) share one instruction.
        if task_name.lower().startswith('cqadupstack'):
            return 'Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question'

        task_name_to_instruct: Dict[str, str] = {
            'ArguAna': 'Given a claim, find documents that refute the claim',
            'ClimateFEVER': 'Given a claim about climate change, retrieve documents that support or refute the claim',
            'DBPedia': 'Given a query, retrieve relevant entity descriptions from DBPedia',
            'FEVER': 'Given a claim, retrieve documents that support or refute the claim',
            'FiQA2018': 'Given a financial question, retrieve user replies that best answer the question',
            'HotpotQA': 'Given a multi-hop question, retrieve documents that can help answer the question',
            'MSMARCO': 'Given a web search query, retrieve relevant passages that answer the query',
            'NFCorpus': 'Given a question, retrieve relevant documents that best answer the question',
            'NQ': 'Given a question, retrieve Wikipedia passages that answer the question',
            'QuoraRetrieval': 'Given a question, retrieve questions that are semantically equivalent to the given question',
            'SCIDOCS': 'Given a scientific paper title, retrieve paper abstracts that are cited by the given paper',
            'SciFact': 'Given a scientific claim, retrieve documents that support or refute the claim',
            'Touche2020': 'Given a question, retrieve detailed and persuasive arguments that answer the question',
            'TRECCOVID': 'Given a query on COVID-19, retrieve documents that answer the query',
            'InstructConversation': "Given a question asked by user, the assistant answers",
            'MrTydi': "Given a question, retrieve Wikipedia passages that answer the question",
            "ChatgptShortLong": "Given a query, retrieve passages that answer the query",
            # E5 public training
            "msmarco_document": "Given a web search query, retrieve relevant documents that answer the query",
            "msmarco_passage": "Given a web search query, retrieve relevant passages that answer the query",
            "allnli": "Given a web search query, retrieve relevant documents that answer the query",
            "dureader": "Given a Chinese search query, retrieve web passages that answer the question",
            "eli5_question_answer": "Provided a user question, retrieve the highest voted answers on Reddit ELI5 forum",
            "fever": "Given a claim, retrieve documents that support or refute the claim",
            "hotpot_qa": "Given a multi-hop question, retrieve documents that can help answer the question",
            "miracl": "Given a question, retrieve Wikipedia passages that answer the question",
            "mrtydi": "Given a question, retrieve Wikipedia passages that answer the question",
            "nq": "Given a question, retrieve Wikipedia passages that answer the question",
            "quora_duplicates": "Given a question, retrieve questions that are semantically equivalent to the given question",
            "squad": "Retrieve Wikipedia passages that answer the question",
            "t2ranking": "Given a Chinese search query, retrieve web passages that answer the question",
            # NOTE(review): stray trailing apostrophe inside this instruction —
            # confirm whether intentional before changing (prompts may be baked
            # into trained checkpoints).
            "trivia_qa": "Retrieve Wikipedia passages that answer the question'",
        }

        # add lower case keys to match some beir names
        task_name_to_instruct.update({k.lower(): v for k, v in task_name_to_instruct.items()})
        # other cases where lower case match still doesn't work
        task_name_to_instruct['trec-covid'] = task_name_to_instruct['TRECCOVID']
        task_name_to_instruct['climate-fever'] = task_name_to_instruct['ClimateFEVER']
        task_name_to_instruct['dbpedia-entity'] = task_name_to_instruct['DBPedia']
        task_name_to_instruct['webis-touche2020'] = task_name_to_instruct['Touche2020']
        task_name_to_instruct['fiqa'] = task_name_to_instruct['FiQA2018']
        task_name_to_instruct['quora'] = task_name_to_instruct['QuoraRetrieval']
        task_name_to_instruct['instructed-conversation'] = task_name_to_instruct['InstructConversation']

        # for miracl evaluation
        task_name_to_instruct['miracl'] = 'Given a question, retrieve Wikipedia passages that answer the question'

        return task_name_to_instruct[task_name]

    raise ValueError(f"No instruction config for task {task_name} with type {task_type}")
137
+
138
+
139
def get_detailed_instruct(task_description: str) -> str:
    """Wrap a task description in the E5-Mistral query-instruction template.

    An empty/falsy description yields an empty prompt (no instruction prefix).
    """
    if task_description:
        return 'Instruct: {}\nQuery: '.format(task_description)
    return ''
adhoc/eval_mteb/merge_cqadupstack.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Merges CQADupstack subset results
2
+ Usage: python merge_cqadupstack.py path_to_results_folder
3
+
4
+ Adapted from: https://github.com/embeddings-benchmark/mteb/blob/main/scripts/merge_cqadupstack.py
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import glob
10
+ import json
11
+ import logging
12
+ import os
13
+ import sys
14
+
15
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The 12 per-forum CQADupstack subsets whose scores are merged into one result.
TASK_LIST_CQA = [
    "CQADupstackAndroidRetrieval",
    "CQADupstackEnglishRetrieval",
    "CQADupstackGamingRetrieval",
    "CQADupstackGisRetrieval",
    "CQADupstackMathematicaRetrieval",
    "CQADupstackPhysicsRetrieval",
    "CQADupstackProgrammersRetrieval",
    "CQADupstackStatsRetrieval",
    "CQADupstackTexRetrieval",
    "CQADupstackUnixRetrieval",
    "CQADupstackWebmastersRetrieval",
    "CQADupstackWordpressRetrieval",
]

# Metadata keys copied through rather than averaged across subsets
# (evaluation_time is summed instead — see the merge loop below).
NOAVG_KEYS = [
    "hf_subset",
    "languages",
    "evaluation_time",
    "mteb_version",
    "mteb_dataset_name",
    "dataset_revision",
]
41
+
42
+
43
# The module docstring advertises "python merge_cqadupstack.py path_to_results_folder";
# honor argv[1] when given (sys was imported but unused) and keep the previously
# hard-coded folder as a backward-compatible fallback.
results_folder = sys.argv[1] if len(sys.argv) > 1 else \
    '/export/xgen-embedding/release/SFR-Embedding-Mistral-v2/RC3/eval_output/public_mteb/beir'
# Ensure at least 1 character btw CQADupstack & Retrieval
files = glob.glob(f'{results_folder.rstrip("/")}/CQADupstack*?*Retrieval.json')

logger.info(f"Found CQADupstack files {len(files)}/{len(TASK_LIST_CQA)}: \n{files}")

if len(files) == len(TASK_LIST_CQA):
    # all_results accumulates, per split, the equal-weight average of every
    # metric over the 12 subsets (evaluation_time is summed, NOAVG_KEYS kept).
    all_results = {}
    for file_name in files:
        with open(file_name, "r", encoding="utf-8") as f:
            results = json.load(f)
        for split, split_results in results.items():
            # Non-eval entries (metadata) are copied through as-is.
            if split not in ("train", "validation", "dev", "test"):
                all_results[split] = split_results
                continue
            all_results.setdefault(split, {})
            for metric, score in split_results.items():
                all_results[split].setdefault(metric, 0)
                if metric == "evaluation_time":
                    # Total wall time is the sum over subsets.
                    score = all_results[split][metric] + score
                elif metric not in NOAVG_KEYS:
                    # Each subset contributes 1/len(TASK_LIST_CQA) of its score.
                    score = all_results[split][metric] + score / len(TASK_LIST_CQA)
                all_results[split][metric] = score

    # Copy the last subset's payload instead of aliasing it before overwriting
    # keys (the original mutated `results` in place).
    final_results = dict(results)
    final_results['scores'] = all_results
    final_results["task_name"] = "CQADupstackRetrieval"
    final_results["evaluation_time"] = None

    logger.info(all_results)
    logger.info(f"Saving results to {os.path.join(results_folder, 'CQADupstackRetrieval.json')}")
    with open(os.path.join(results_folder, "CQADupstackRetrieval.json"), "w", encoding="utf-8") as f:
        json.dump(final_results, f, indent=4)
else:
    found_names = {x.split('/')[-1].split('.')[0] for x in files}
    logger.warning(
        f"Got {len(files)}, but expected {len(TASK_LIST_CQA)} files. Missing: {set(TASK_LIST_CQA) - found_names}; Too much: {found_names - set(TASK_LIST_CQA)}"
    )
adhoc/eval_mteb/mteb_utils.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import torch
4
+ import logging
5
+
6
+ from torch import Tensor
7
+ from transformers import PreTrainedTokenizerFast, BatchEncoding
8
+ from typing import Mapping, Dict, List
9
+
10
+ import torch.distributed as dist
11
+
12
+
13
+ def _setup_logger():
14
+ log_format = logging.Formatter("[%(asctime)s %(levelname)s] %(message)s")
15
+ logger = logging.getLogger()
16
+ logger.setLevel(logging.INFO)
17
+
18
+ console_handler = logging.StreamHandler()
19
+ console_handler.setFormatter(log_format)
20
+ logger.handlers = [console_handler]
21
+
22
+ return logger
23
+
24
+
25
+ logger = _setup_logger()
26
+
27
+
28
def str2bool(v):
    """Parse a command-line flag into a bool (argparse `type=` helper).

    Accepts bools unchanged; raises ArgumentTypeError on unrecognized strings.
    """
    if isinstance(v, bool):
        return v
    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
37
+
38
+
39
def move_to_cuda(sample):
    """Recursively move every tensor in a nested structure onto the GPU.

    Dicts/lists/tuples/Mappings are rebuilt with their tensors transferred
    (non-blocking); non-tensor leaves pass through unchanged. An empty input
    returns {} regardless of its original container type.
    """
    if len(sample) == 0:
        return {}

    def _transfer(obj):
        if torch.is_tensor(obj):
            return obj.cuda(non_blocking=True)
        if isinstance(obj, dict):
            return {key: _transfer(val) for key, val in obj.items()}
        if isinstance(obj, list):
            return [_transfer(item) for item in obj]
        if isinstance(obj, tuple):
            return tuple(_transfer(item) for item in obj)
        # Non-dict Mappings are rebuilt as their own type (checked after dict,
        # mirroring the original branch order).
        if isinstance(obj, Mapping):
            return type(obj)({key: _transfer(val) for key, val in obj.items()})
        return obj

    return _transfer(sample)
58
+
59
+
60
def pool(last_hidden_states: Tensor,
         attention_mask: Tensor,
         pool_type: str) -> Tensor:
    """Pool per-token hidden states into one embedding per sequence.

    Args:
        last_hidden_states: (batch, seq, dim) token embeddings.
        attention_mask: (batch, seq) 1 for real tokens, 0 for padding.
        pool_type: "avg", "weightedavg", "cls", "last"/"eos", or "none".

    Returns:
        (batch, dim) pooled embeddings, or the masked (batch, seq, dim)
        hidden states when pool_type is "none".

    Raises:
        ValueError: on an unsupported pool_type.
    """
    # Zero out padding positions so sums below ignore them.
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)

    if pool_type == "avg":
        emb = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    elif pool_type == "weightedavg":  # position-weighted mean pooling from SGPT (https://arxiv.org/abs/2202.08904)
        # Fix: compute the position weights out-of-place; the original used
        # `attention_mask *= ...`, silently mutating the caller's mask tensor.
        weights = attention_mask * attention_mask.cumsum(dim=1)  # [0,1,1,1,0,0] -> [0,1,2,3,0,0]
        s = torch.sum(last_hidden * weights.unsqueeze(-1).float(), dim=1)
        d = weights.sum(dim=1, keepdim=True).float()
        emb = s / d
    elif pool_type == "cls":
        emb = last_hidden[:, 0]
    elif pool_type == "last" or pool_type == "eos":
        # If every sequence is attended at the final position, the batch is
        # left-padded and the last column holds each sequence's final token.
        left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
        if left_padding:
            emb = last_hidden[:, -1]
        else:
            sequence_lengths = attention_mask.sum(dim=1) - 1
            batch_size = last_hidden.shape[0]
            emb = last_hidden[torch.arange(batch_size, device=last_hidden.device), sequence_lengths]
    elif pool_type.lower() == "none":
        emb = last_hidden
    else:
        raise ValueError(f"pool_type {pool_type} not supported")

    return emb
88
+
89
+
90
def create_batch_dict(tokenizer: PreTrainedTokenizerFast, input_texts: List[str], always_add_eos: bool, max_length: int = 512) -> BatchEncoding:
    """Tokenize input_texts into a padded pt batch, optionally forcing an EOS.

    When always_add_eos is set, texts are truncated to max_length - 1 so the
    appended eos_token_id always fits, then padded in a second pass.
    """
    if always_add_eos:
        # First pass: no padding/attention mask, leave room for the EOS token.
        encoded = tokenizer(
            input_texts,
            max_length=max_length - 1,
            return_token_type_ids=False,
            return_attention_mask=False,
            padding=False,
            truncation=True
        )

        # append eos_token_id to every input_ids
        encoded['input_ids'] = [ids + [tokenizer.eos_token_id] for ids in encoded['input_ids']]

        # Second pass: pad the eos-terminated sequences into tensors.
        return tokenizer.pad(
            encoded,
            padding=True,
            pad_to_multiple_of=8,
            return_attention_mask=True,
            return_tensors="pt",
        )

    return tokenizer(
        input_texts,
        max_length=max_length,
        padding=True,
        pad_to_multiple_of=8,
        return_token_type_ids=False,
        truncation=True,
        return_tensors='pt'
    )
121
+
122
+
123
+ def get_task_def_by_task_name_and_type(task_name: str, task_type: str) -> str:
124
+ if task_type in ['STS']:
125
+ return "Retrieve semantically similar text."
126
+
127
+ if task_type in ['Summarization']:
128
+ return "Given a news summary, retrieve other semantically similar summaries"
129
+
130
+ if task_type in ['BitextMining']:
131
+ return "Retrieve parallel sentences."
132
+
133
+ if task_type in ['Classification']:
134
+ task_name_to_instruct: Dict[str, str] = {
135
+ 'AmazonCounterfactualClassification': 'Classify a given Amazon customer review text as either counterfactual or not-counterfactual',
136
+ 'AmazonPolarityClassification': 'Classify Amazon reviews into positive or negative sentiment',
137
+ 'AmazonReviewsClassification': 'Classify the given Amazon review into its appropriate rating category',
138
+ 'Banking77Classification': 'Given a online banking query, find the corresponding intents',
139
+ 'EmotionClassification': 'Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise',
140
+ 'ImdbClassification': 'Classify the sentiment expressed in the given movie review text from the IMDB dataset',
141
+ 'MassiveIntentClassification': 'Given a user utterance as query, find the user intents',
142
+ 'MassiveScenarioClassification': 'Given a user utterance as query, find the user scenarios',
143
+ 'MTOPDomainClassification': 'Classify the intent domain of the given utterance in task-oriented conversation',
144
+ 'MTOPIntentClassification': 'Classify the intent of the given utterance in task-oriented conversation',
145
+ 'ToxicConversationsClassification': 'Classify the given comments as either toxic or not toxic',
146
+ 'TweetSentimentExtractionClassification': 'Classify the sentiment of a given tweet as either positive, negative, or neutral',
147
+ # C-MTEB eval instructions
148
+ 'TNews': 'Classify the fine-grained category of the given news title',
149
+ 'IFlyTek': 'Given an App description text, find the appropriate fine-grained category',
150
+ 'MultilingualSentiment': 'Classify sentiment of the customer review into positive, neutral, or negative',
151
+ 'JDReview': 'Classify the customer review for iPhone on e-commerce platform into positive or negative',
152
+ 'OnlineShopping': 'Classify the customer review for online shopping into positive or negative',
153
+ 'Waimai': 'Classify the customer review from a food takeaway platform into positive or negative',
154
+ }
155
+ return task_name_to_instruct[task_name]
156
+
157
+ if task_type in ['Clustering']:
158
+ task_name_to_instruct: Dict[str, str] = {
159
+ 'ArxivClusteringP2P': 'Identify the main and secondary category of Arxiv papers based on the titles and abstracts',
160
+ 'ArxivClusteringS2S': 'Identify the main and secondary category of Arxiv papers based on the titles',
161
+ 'BiorxivClusteringP2P': 'Identify the main category of Biorxiv papers based on the titles and abstracts',
162
+ 'BiorxivClusteringS2S': 'Identify the main category of Biorxiv papers based on the titles',
163
+ 'MedrxivClusteringP2P': 'Identify the main category of Medrxiv papers based on the titles and abstracts',
164
+ 'MedrxivClusteringS2S': 'Identify the main category of Medrxiv papers based on the titles',
165
+ 'RedditClustering': 'Identify the topic or theme of Reddit posts based on the titles',
166
+ 'RedditClusteringP2P': 'Identify the topic or theme of Reddit posts based on the titles and posts',
167
+ 'StackExchangeClustering': 'Identify the topic or theme of StackExchange posts based on the titles',
168
+ 'StackExchangeClusteringP2P': 'Identify the topic or theme of StackExchange posts based on the given paragraphs',
169
+ 'TwentyNewsgroupsClustering': 'Identify the topic or theme of the given news articles',
170
+ # C-MTEB eval instructions
171
+ 'CLSClusteringS2S': 'Identify the main category of scholar papers based on the titles',
172
+ 'CLSClusteringP2P': 'Identify the main category of scholar papers based on the titles and abstracts',
173
+ 'ThuNewsClusteringS2S': 'Identify the topic or theme of the given news articles based on the titles',
174
+ 'ThuNewsClusteringP2P': 'Identify the topic or theme of the given news articles based on the titles and contents',
175
+ }
176
+ return task_name_to_instruct[task_name]
177
+
178
+ if task_type in ['Reranking', 'PairClassification']:
179
+ task_name_to_instruct: Dict[str, str] = {
180
+ 'AskUbuntuDupQuestions': 'Retrieve duplicate questions from AskUbuntu forum',
181
+ 'MindSmallReranking': 'Retrieve relevant news articles based on user browsing history',
182
+ 'SciDocsRR': 'Given a title of a scientific paper, retrieve the titles of other relevant papers',
183
+ 'StackOverflowDupQuestions': 'Retrieve duplicate questions from StackOverflow forum',
184
+ 'SprintDuplicateQuestions': 'Retrieve duplicate questions from Sprint forum',
185
+ 'TwitterSemEval2015': 'Retrieve tweets that are semantically similar to the given tweet',
186
+ 'TwitterURLCorpus': 'Retrieve tweets that are semantically similar to the given tweet',
187
+ # C-MTEB eval instructions
188
+ 'T2Reranking': 'Given a Chinese search query, retrieve web passages that answer the question',
189
+ 'MMarcoReranking': 'Given a Chinese search query, retrieve web passages that answer the question',
190
+ 'CMedQAv1': 'Given a Chinese community medical question, retrieve replies that best answer the question',
191
+ 'CMedQAv2': 'Given a Chinese community medical question, retrieve replies that best answer the question',
192
+ 'Ocnli': 'Retrieve semantically similar text.',
193
+ 'Cmnli': 'Retrieve semantically similar text.',
194
+ }
195
+ return task_name_to_instruct[task_name]
196
+
197
+ if task_type in ['Retrieval']:
198
+ if task_name.lower().startswith('cqadupstack'):
199
+ return 'Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question'
200
+
201
+ task_name_to_instruct: Dict[str, str] = {
202
+ 'ArguAna': 'Given a claim, find documents that refute the claim',
203
+ 'ClimateFEVER': 'Given a claim about climate change, retrieve documents that support or refute the claim',
204
+ 'DBPedia': 'Given a query, retrieve relevant entity descriptions from DBPedia',
205
+ 'FEVER': 'Given a claim, retrieve documents that support or refute the claim',
206
+ 'FiQA2018': 'Given a financial question, retrieve user replies that best answer the question',
207
+ 'HotpotQA': 'Given a multi-hop question, retrieve documents that can help answer the question',
208
+ 'MSMARCO': 'Given a web search query, retrieve relevant passages that answer the query',
209
+ 'NFCorpus': 'Given a question, retrieve relevant documents that best answer the question',
210
+ 'NQ': 'Given a question, retrieve Wikipedia passages that answer the question',
211
+ 'QuoraRetrieval': 'Given a question, retrieve questions that are semantically equivalent to the given question',
212
+ 'SCIDOCS': 'Given a scientific paper title, retrieve paper abstracts that are cited by the given paper',
213
+ 'SciFact': 'Given a scientific claim, retrieve documents that support or refute the claim',
214
+ 'Touche2020': 'Given a question, retrieve detailed and persuasive arguments that answer the question',
215
+ 'TRECCOVID': 'Given a query on COVID-19, retrieve documents that answer the query',
216
+ # C-MTEB eval instructions
217
+ 'T2Retrieval': 'Given a Chinese search query, retrieve web passages that answer the question',
218
+ 'MMarcoRetrieval': 'Given a web search query, retrieve relevant passages that answer the query',
219
+ 'DuRetrieval': 'Given a Chinese search query, retrieve web passages that answer the question',
220
+ 'CovidRetrieval': 'Given a question on COVID-19, retrieve news articles that answer the question',
221
+ 'CmedqaRetrieval': 'Given a Chinese community medical question, retrieve replies that best answer the question',
222
+ 'EcomRetrieval': 'Given a user query from an e-commerce website, retrieve description sentences of relevant products',
223
+ 'MedicalRetrieval': 'Given a medical question, retrieve user replies that best answer the question',
224
+ 'VideoRetrieval': 'Given a video search query, retrieve the titles of relevant videos',
225
+ }
226
+
227
+ # add lower case keys to match some beir names
228
+ task_name_to_instruct.update({k.lower(): v for k, v in task_name_to_instruct.items()})
229
+ # other cases where lower case match still doesn't work
230
+ task_name_to_instruct['trec-covid'] = task_name_to_instruct['TRECCOVID']
231
+ task_name_to_instruct['climate-fever'] = task_name_to_instruct['ClimateFEVER']
232
+ task_name_to_instruct['dbpedia-entity'] = task_name_to_instruct['DBPedia']
233
+ task_name_to_instruct['webis-touche2020'] = task_name_to_instruct['Touche2020']
234
+ task_name_to_instruct['fiqa'] = task_name_to_instruct['FiQA2018']
235
+ task_name_to_instruct['quora'] = task_name_to_instruct['QuoraRetrieval']
236
+
237
+ # for miracl evaluation
238
+ task_name_to_instruct['miracl'] = 'Given a question, retrieve Wikipedia passages that answer the question'
239
+
240
+ return task_name_to_instruct[task_name]
241
+
242
+ raise ValueError(f"No instruction config for task {task_name} with type {task_type}")
243
+
244
+
245
def get_detailed_instruct(task_description: str) -> str:
    """Wrap a task description in the E5-Mistral query-side instruction template.

    A falsy (empty) description produces an empty string, i.e. no prefix at all.
    """
    if task_description:
        return f'Instruct: {task_description}\nQuery: '
    return ''
250
+
251
+
252
def input_transform_func(tokenizer: PreTrainedTokenizerFast,
                         examples: Dict[str, List],
                         always_add_eos: bool,
                         max_length: int,
                         ) -> BatchEncoding:
    """Tokenize a batch of raw texts for embedding.

    With ``always_add_eos=False`` the texts are tokenized with padding and
    truncation in one call. Otherwise tokenization reserves one position of
    headroom (``max_length - 1``), skips padding (left to the collator), and an
    EOS token id is appended to every sequence that does not already end in one.
    """
    texts = examples['input_texts']
    if not always_add_eos:
        return tokenizer(
            texts,
            max_length=max_length if max_length else None,
            padding=True,
            return_token_type_ids=False,
            truncation=True
        )

    batch_dict = tokenizer(
        texts,
        max_length=max_length - 1 if max_length else None,
        return_token_type_ids=False,
        return_attention_mask=False,
        padding=False,
        truncation=True
    )
    # Force a trailing EOS on every sequence; some texts in FiQA are empty
    # and tokenize to an empty id list, which becomes a lone EOS.
    eos = tokenizer.eos_token_id
    fixed_ids = []
    for ids in batch_dict['input_ids']:
        if not ids:
            fixed_ids.append([eos])
        elif ids[-1] != eos:
            fixed_ids.append(ids + [eos])
        else:
            fixed_ids.append(ids)
    batch_dict['input_ids'] = fixed_ids

    return batch_dict
286
+
287
+
288
def get_rank():
    """Return this process's distributed rank, or 0 outside a process group."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return 0
294
+
295
+
296
def is_main():
    """True on the rank-0 (main) process; also True when not running distributed."""
    return not get_rank()
298
+
299
+
300
@torch.no_grad()
def varsize_gather_nograd(x: torch.Tensor):
    """Gather tensors of different first-dimension sizes from all ranks.

    Each rank's ``x`` is right-padded to the largest first-dimension size in
    the group, all-gathered, trimmed back to each rank's true size, and
    concatenated in rank order. Returns ``x`` unchanged when not distributed.
    """
    if not dist.is_initialized():
        return x

    # determine the max first-dim size across all ranks
    size = torch.tensor([x.shape[0]], device=x.device, dtype=torch.int)
    allsizes = [torch.zeros_like(size) for _ in range(dist.get_world_size())]
    dist.all_gather(allsizes, size)
    max_size = max([size.cpu().max() for size in allsizes])

    # pad the local tensor so all_gather sees identically-shaped tensors
    padded = torch.empty(max_size, *x.shape[1:], dtype=x.dtype, device=x.device)
    padded[: x.shape[0]] = x
    output = [torch.zeros_like(padded) for _ in range(dist.get_world_size())]
    dist.all_gather(output, padded)

    # drop each rank's padding rows, then concatenate in rank order
    output = [tensor[: allsizes[k]] for k, tensor in enumerate(output)]
    output = torch.cat(output, dim=0)

    return output
321
+
322
# Special-token string literals keyed by model family. All families define an
# EOS token; most also define BOS, and 'llama' additionally provides pad/mask
# tokens. NOTE(review): keys presumably match backbone identifiers used by the
# callers of this table — confirm against the model-loading code.
SPECIAL_TOKENS = {
    't5': {
        'eos': '</s>',
    },
    'xlm-r': {
        'bos': '<s>',
        'eos': '</s>',
    },
    'mistral': {
        'bos': '<s>',
        'eos': '</s>',
    },
    'llama': {
        'bos': '<|begin_of_text|>',
        'eos': '<|end_of_text|>',
        'pad': '<|finetune_right_pad_id|>',
        'mask': "<|reserved_special_token_0|>",
    },
    'nvidia/NV-Embed-v2': {
        'bos': '<s>',
        'eos': '</s>',
    },
    'qwen2': {
        'bos': '<|im_start|>',
        'eos': '<|im_end|>',
    }
}
adhoc/eval_mteb/run_mteb.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import torch
3
+ import torch.distributed as dist
4
+
5
+ import tqdm
6
+ import numpy as np
7
+ import os
8
+
9
+ from functools import partial
10
+ from torch.utils.data import DataLoader
11
+ from datasets import Dataset
12
+ from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding
13
+ from mteb import MTEB
14
+
15
+ from adhoc.eval_mteb.e5mistral_prompt import load_e5mistral_prompt
16
+ from src.arguments import ModelArguments, DataArguments, TrainingArguments, MTEBArguments
17
+ from transformers import HfArgumentParser, AutoTokenizer
18
+
19
+ from src.model.model_token_pooling import MMEBModel
20
+ from adhoc.eval_mteb.mteb_utils import logger, pool, move_to_cuda, input_transform_func, varsize_gather_nograd, is_main, str2bool
21
+ from src.model.processor import load_processor
22
+
23
# Cap BLAS/OpenMP thread pools for clustering tasks (otherwise OpenBLAS warns:
# "precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata").
# NOTE: setting these after library import may be ineffective here — prefer
# exporting them as real environment variables.
default_n_threads = 1
for _threads_var in ('OPENBLAS_NUM_THREADS', 'MKL_NUM_THREADS', 'OMP_NUM_THREADS'):
    os.environ[_threads_var] = str(default_n_threads)
28
+
29
+
30
# English MTEB task names supported by this script; used to filter the tasks
# returned by MTEB before evaluation. Grouped by task type below.
MTEB_TASKS_EN = [
    # Classification
    "AmazonCounterfactualClassification", "AmazonPolarityClassification", "AmazonReviewsClassification", "Banking77Classification", "EmotionClassification", "ImdbClassification", "MassiveIntentClassification", "MassiveScenarioClassification", "MTOPDomainClassification", "MTOPIntentClassification", "ToxicConversationsClassification", "TweetSentimentExtractionClassification",
    # Clustering
    "ArxivClusteringP2P", "ArxivClusteringS2S", "BiorxivClusteringP2P", "BiorxivClusteringS2S", "MedrxivClusteringP2P", "MedrxivClusteringS2S", "RedditClustering", "RedditClusteringP2P", "StackExchangeClustering", "StackExchangeClusteringP2P", "TwentyNewsgroupsClustering",
    # Pair classification
    "SprintDuplicateQuestions", "TwitterSemEval2015", "TwitterURLCorpus",
    # Reranking
    "AskUbuntuDupQuestions", "MindSmallReranking", "SciDocsRR", "StackOverflowDupQuestions",
    # Retrieval
    "ArguAna", "ClimateFEVER", "CQADupstackAndroidRetrieval", "DBPedia", "FEVER", "FiQA2018", "HotpotQA", "MSMARCO", "NFCorpus", "NQ", "QuoraRetrieval", "SCIDOCS", "SciFact", "TRECCOVID", "Touche2020",
    # STS
    "BIOSSES", "SICK-R", "STS12", "STS13", "STS14", "STS15", "STS16", "STS17", "STS22", "STSBenchmark",
    # Summarization
    "SummEval"
]
39
+
40
+
41
class DenseEncoder(torch.nn.Module):
    """MTEB-compatible wrapper around an MMEB embedding model.

    Exposes ``encode_queries`` / ``encode_corpus`` (the interface MTEB retrieval
    tasks expect) on top of a single ``encode`` routine that shards inputs
    across DDP ranks, tokenizes with a forced EOS suffix, and gathers the
    per-rank embeddings back into one array.
    """

    def __init__(self, model_args, mteb_args, max_length=512, **kwargs):
        super().__init__()
        self.max_length = max_length
        self.pool_type = model_args.pooling

        processor = load_processor(model_args)
        model = MMEBModel.load(model_args)

        # right padding so the appended EOS stays the last non-pad token
        processor.tokenizer.padding_side = "right"
        model.eval()
        model = model.to(mteb_args.device, dtype=torch.bfloat16)
        self.encoder = model
        self.tokenizer = processor.tokenizer
        self.processor = processor

        self.batch_size_per_device = mteb_args.batch_size_per_device
        self.gpu_count = torch.cuda.device_count()
        self.encoder.eval()
        self.encoder.cuda()
        # per-task instruction prefixes, assigned via set_prompt()
        self.query_prompt = ""
        self.doc_prompt = ""
        # separator inserted between a document's title and body text
        self.sep = ". "

        # Outside DDP, fall back to DataParallel across all visible GPUs.
        if not torch.distributed.is_initialized() and self.gpu_count > 1:
            self.encoder = torch.nn.DataParallel(self.encoder)

    def encode_queries(self, sentences, **kwargs) -> np.ndarray:
        """Encode query-side texts using the query instruction prefix."""
        return self.encode(sentences, self.query_prompt, is_query=True, **kwargs)

    def encode_corpus(self, sentences, **kwargs) -> np.ndarray:
        """Encode corpus entries (dicts with 'text' and optional 'title')."""
        return self.encode(sentences, self.doc_prompt, is_query=False, **kwargs)

    @torch.no_grad()
    def encode(self, inputs, prompt=None, is_query=True, **kwargs) -> np.ndarray:
        """ Returns embeddings for the given sentences.
        Args:
            inputs (`List[str]` or `List[dict]`): sentences, or corpus dicts
                with 'text' and an optional 'title'
            prompt (`str`): instruction prefix prepended to every input
            is_query (`bool`): whether inputs are queries (enables the fallback
                to ``self.query_prompt`` when no prompt is passed)

        Returns:
            `np.ndarray`: one embedding row per input, in input order
        """
        # Corpus entries are dicts: join title and body when a title exists.
        if isinstance(inputs[0], dict):
            input_texts = [(doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip() for doc in inputs]
        else:
            input_texts = copy.copy(inputs)
        # Shard inputs across DDP ranks, but only when there is enough work.
        if torch.distributed.is_initialized() and len(input_texts) >= dist.get_world_size():
            idx = np.array_split(range(len(input_texts)), dist.get_world_size())[dist.get_rank()]
        else:
            # in case of non-DDP or not enough sentences, all devices are running the same job, but no gathering in the end
            idx = range(len(input_texts))
        device_sentences = [input_texts[i] for i in idx]
        # for tasks other than Retrieval, the caller may not pass a prompt explicitly
        if is_query and not prompt and self.query_prompt:
            prompt = self.query_prompt
        if prompt:
            device_sentences_with_prompt = [prompt + (s['text'] if isinstance(s, dict) else s) for s in device_sentences]
        else:
            device_sentences_with_prompt = device_sentences

        # Tokenize lazily (EOS always appended); the collator pads each batch.
        dataset: Dataset = Dataset.from_dict({'input_texts': device_sentences_with_prompt})
        dataset.set_transform(partial(input_transform_func, self.tokenizer, max_length=self.max_length, always_add_eos=True))
        data_collator = DataCollatorWithPadding(self.tokenizer, pad_to_multiple_of=1)
        data_loader = DataLoader(
            dataset,
            batch_size=self.batch_size_per_device if torch.distributed.is_initialized() else self.batch_size_per_device * self.gpu_count,
            shuffle=False,
            drop_last=False,
            num_workers=0,
            collate_fn=data_collator,
            pin_memory=True)

        encoded_embeds = []
        # for batch in data_loader:
        for batch in tqdm.tqdm(data_loader, desc="encoding", miniters=10, disable=not is_main()):
            # batch.data['is_causal'] = self.is_causal # only needed for Qwen
            # print(f"batch.data['is_causal']={batch.data['is_causal']}")
            # print(self.tokenizer.decode(batch['input_ids'][0]))
            # print(batch['input_ids'].numpy())
            # print(batch)
            batch = move_to_cuda(batch)
            with torch.cuda.amp.autocast():
                outputs = self.encoder.encode_input(batch)
            encoded_embeds.append(outputs)
        encoded_embeds = torch.cat(encoded_embeds, dim=0)
        # Re-assemble the full embedding matrix from all ranks (rank order
        # matches the np.array_split sharding above, preserving input order).
        if torch.distributed.is_initialized() and len(inputs) >= dist.get_world_size():
            encoded_embeds = varsize_gather_nograd(encoded_embeds)
        encoded_embeds = encoded_embeds.cpu().numpy()

        return encoded_embeds

    def set_prompt(self, query_prompt: str, doc_prompt: str):
        """Set the per-task instruction prefixes consumed by encode()."""
        self.query_prompt = query_prompt
        self.doc_prompt = doc_prompt
136
+
137
+
138
def main():
    """Evaluate an MMEB encoder on a fixed set of English MTEB retrieval tasks.

    Parses HF-style dataclass arguments, builds a DenseEncoder, then runs each
    selected task one at a time so per-task prompts / normalization can be set.
    """
    parser = HfArgumentParser((ModelArguments, DataArguments, MTEBArguments, TrainingArguments))
    model_args, data_args, mteb_args, training_args, remaining_args = parser.parse_args_into_dataclasses(return_remaining_strings=True)
    model_args: ModelArguments
    data_args: DataArguments
    mteb_args: MTEBArguments

    assert mteb_args.eval_output_dir, 'eval_output_dir should be specified'
    os.makedirs(mteb_args.eval_output_dir, exist_ok=True)

    task_types = None
    # Retrieval subset evaluated by default; swap in the commented list for clustering runs.
    tasks = ['NFCorpus', 'FiQA2018', 'ArguAna', 'SciFact', 'SCIDOCS', 'Touche2020', 'TRECCOVID']
    # tasks = ["BiorxivClusteringS2S", "MedrxivClusteringS2S", "RedditClustering", "StackExchangeClustering", "StackExchangeClusteringP2P", "TwentyNewsgroupsClustering"]
    evaluation = MTEB(task_types=task_types, tasks=tasks, task_langs=["eng-Latn", "en"])
    model = DenseEncoder(model_args, mteb_args, max_length=mteb_args.max_length)

    for task_cls in evaluation.tasks:
        task_name: str = task_cls.metadata.name
        task_type: str = task_cls.metadata.type
        print(f"Evaluating MTEB: {task_type} - {task_name}")
        # filter out not supported datasets
        if task_name not in MTEB_TASKS_EN:
            continue

        # Prefer the 'test' split; otherwise fall back to the first available one.
        eval_splits = task_cls.metadata.eval_splits
        if "test" not in eval_splits:
            logger.warning("Test split not found for task: {}, type: {}, eval_splits: {}".format(task_name, task_type, eval_splits))
        eval_splits = ["test" if "test" in eval_splits else eval_splits[0]]

        if mteb_args.prompt_family:
            prompt_data = load_e5mistral_prompt(prompt_family=mteb_args.prompt_family, task_name=task_name, task_type=task_type)
            query_prompt = prompt_data['q_prompt']
            doc_prompt = prompt_data['d_prompt']
            model.set_prompt(query_prompt=query_prompt, doc_prompt=doc_prompt)
            logger.info('Set prompt: query={}, doc={}'.format(query_prompt, doc_prompt))
        else:
            logger.info('No prompt is set')

        # disable l2 normalize for classification tasks, as it achieves slightly better results
        # NOTE(review): l2_normalize is set on the wrapper here; presumably the
        # underlying encoder reads it — confirm against MMEBModel.encode_input.
        if task_type == 'Classification':
            logger.info('Set l2_normalize to False for classification task')
            model.l2_normalize = False
        else:
            model.l2_normalize = True
        logger.info('Set l2_normalize to {}'.format(model.l2_normalize))

        sub_eval = MTEB(tasks=[task_name], task_langs=["eng-Latn", "en"], n_experiments=1)
        logger.info('Running evaluation for task: {}, type: {}'.format(task_name, task_type))
        # Only rank 0 writes result files; other ranks still encode but discard output.
        if (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0) or not torch.distributed.is_initialized():
            mteb_result_folder = mteb_args.eval_output_dir
        else:
            mteb_result_folder = None
        sub_eval.run(
            model, eval_splits=eval_splits,
            output_folder=mteb_result_folder
        )


if __name__ == '__main__':
    main()
adhoc/gather_score_byckpt_aws.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import re
4
+
5
# MMEB evaluation datasets whose per-dataset score files are gathered below,
# grouped roughly as: classification / VQA / retrieval / grounding.
datasets = [
    "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211",
    "OK-VQA", "A-OKVQA", "DocVQA", "InfographicsVQA", "ChartQA", "Visual7W", "ScienceQA", "VizWiz", "GQA", "TextVQA",
    "VisDial", "CIRR", "VisualNews_t2i", "VisualNews_i2t", "MSCOCO_t2i", "MSCOCO_i2t", "NIGHTS", "WebQA", "FashionIQ", "Wiki-SS-NQ", "OVEN", "EDIS",
    "MSCOCO", "RefCOCO", "RefCOCO-Matching", "Visual7W-Pointing"

]
13
+
14
+
15
# Checkpoint eval directories to gather scores from. Each entry must end in
# .../<experiment>/checkpoint-N/eval (the experiment name is taken from the
# third-from-last path component). Commented entries are earlier runs, kept
# for reference.
checkpoint_paths = [
    # v2 baselines
    # "/fsx/home/ruimeng/runs/v3vec-baseline/gme2b/mmeb/",
    # "/fsx/home/ruimeng/runs/v3vec-baseline/gme7b/mmeb/",
    # "/fsx/home/ruimeng/runs/v3vec-baseline/lamra/mmeb/",
    # "/fsx/home/ruimeng/runs/v3vec-baseline/colpali/mmeb/",

    # unified data, qwenresize
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.video.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+vidore+visrag.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora16.IB128.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora16.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.visdoc.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora16.noIB.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora16.IB32.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+video.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora16.IB32.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video_v2.qwenresize.lora16.noIB.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-4000/eval"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb+video_v2+split_visdoc.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100//checkpoint-1000/eval"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_202_2B.mmeb20+visdoc+video.qwenresize.lora32.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora16.IB0.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video_v2.qwenresize.lora16.noIB.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-5000/eval"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb+video_v2+split_visdoc.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-5000/eval"
    # "/fsx/home/ruimeng/runs/mmeb/qwen2vl_2B-002-6.mmeb20_vidore1_videohound2_mteb15-v2-cap100k-rerun.qwenresize.lora16.bs1024pergpu128-ib64-droplast.GCq8p8.NormTemp002.lr5e5.step5kwarm200.maxlen2k.8H100/checkpoint-4000/eval"
    "/fsx/home/yeliu/runs/mmeb/qwen2vl_7B.mmeb+video_v2+split_visdoc.qwenresize.lora16.bs512pergpu64.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-3000/eval"

]
45
+
46
+
47
+ # Function to extract step number from checkpoint directory name
48
+ def extract_step(checkpoint_name):
49
+ match = re.search(r'checkpoint-(\d+)', checkpoint_name)
50
+ return int(match.group(1)) if match else float('inf')
51
+
52
+
53
+ # Dictionary to hold all gathered scores, organized by experiment
54
+ gathered_scores_by_exp = {}
55
+
56
+ # Loop through checkpoint directories
57
+ for checkpoint_path in checkpoint_paths:
58
+ print(checkpoint_path)
59
+ step = extract_step(checkpoint_path)
60
+ experiment_dir = checkpoint_path.split("/")[-3]
61
+
62
+ # Check if it is a checkpoint directory, and a valid checkpoint dir
63
+ if str.isdigit(str(step)):
64
+ # Initialize a dictionary to store scores for this checkpoint
65
+ checkpoint_scores = {"experiment": experiment_dir, "checkpoint": str(step)}
66
+ else:
67
+ checkpoint_scores = {"experiment": experiment_dir, "checkpoint": "default"}
68
+
69
+ # Go through each dataset and check if the corresponding score file exists
70
+ for dataset in datasets:
71
+ score_file = os.path.join(checkpoint_path, f"{dataset}_score.json") # Score file named like DatasetName_score.json
72
+
73
+ # Check if the score file exists
74
+ if os.path.isfile(score_file):
75
+ with open(score_file, "r") as f:
76
+ score_data = json.load(f) # Load the score JSON
77
+ checkpoint_scores[dataset] = score_data.get("acc", "N/A") # Assuming 'acc' is the key for accuracy
78
+ else:
79
+ checkpoint_scores[dataset] = "N/A" # If no score file, set to 'N/A'
80
+ print(checkpoint_scores)
81
+
82
+ # Append the scores for this checkpoint to the respective experiment group
83
+ gathered_scores_by_exp[experiment_dir] = checkpoint_scores
84
+
85
+
86
+
87
print('\n' * 5)
# Print gathered scores in a comma-separated format: one row per experiment/checkpoint
header = ["experiment", "checkpoint"] + datasets
print(",".join(header)) # Print header

for experiment, scores in gathered_scores_by_exp.items():
    row = [scores["experiment"], scores["checkpoint"]] + [str(scores[dataset]) for dataset in datasets]
    print(",".join(row)) # Print each row of scores



header = ["dataset"] + list(gathered_scores_by_exp.keys())
print(",".join(header)) # Print header
# Additional block: print results transposed (dataset per row, experiment per column)
# Dataset names go in the first column, one score column per experiment
for dataset in datasets:
    row = []
    for experiment, scores in gathered_scores_by_exp.items():
        row.append(str(scores[dataset]))
    print(",".join([dataset] + row)) # Print one transposed row
107
+
108
+
109
import pandas as pd

# Collect rows for the transposed (dataset x experiment) table
rows = []
for dataset in datasets:
    row = [dataset]
    for experiment in gathered_scores_by_exp.keys():
        row.append(gathered_scores_by_exp[experiment][dataset])
    rows.append(row)

# Create DataFrame (column names reuse the transposed `header` built above)
df = pd.DataFrame(rows, columns=header)

# Save to CSV
df.to_csv("output_scores.csv", index=False)
print("CSV saved to output_scores.csv")
125
+
126
+
127
+
128
+ # header = ["dataset"] + list(gathered_scores_by_exp.keys())
129
+ # print(",".join(header)) # Print header
130
+ # # Additional Block: Print results per experiment, transposed (dataset per row, step per column)
131
+ # # Print dataset names in the first column, and the scores for each checkpoint in subsequent columns
132
+ # for dataset in datasets:
133
+ # print(",".join([dataset, str(scores[dataset])]))
134
+ # for experiment, scores in gathered_scores_by_exp.items():
135
+ # print(f"\nResults for {experiment}:")
136
+ #
adhoc/hf_datasets.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+
3
# official example from https://huggingface.co/docs/datasets/en/stream
def official_example():
    """Sanity-check iterable-dataset shuffle + batched map on a public dataset.

    Downloads ethz/food101 (network access required) and iterates the mapped
    stream, printing each example.
    """
    dataset = load_dataset("ethz/food101", split="validation")
    dataset = dataset.to_iterable_dataset()
    dataset = dataset.shuffle(buffer_size=1024, seed=42)
    # dataset = dataset.map(add_prefix, remove_columns=["image", "label"]) # non-batched map: this works
    dataset = dataset.map(add_prefix, remove_columns=["image", "label"], drop_last_batch=True, batched=True, batch_size=1024) # batched map: this also works
    # dataset = load_dataset("ethz/food101", streaming=True)
    for batch in dataset:
        print(batch)
    pass
14
+
15
def add_prefix(example):
    """Batched map function: add a 'text' column ("label: <x>") built from 'label'."""
    labels = example['label']
    example['text'] = ['label: {}'.format(lab) for lab in labels]
    return example
18
+
19
def data_prepare(batch_dict, *args, **kwargs):
    """Identity map-function placeholder; extra positional/keyword args are ignored."""
    _ = (args, kwargs)  # accepted for map() compatibility, intentionally unused
    return batch_dict
21
+
22
def load_mmeb():
    """Probe iterable-dataset behavior on MMEB-train: select -> shuffle -> batched map.

    Downloads TIGER-Lab/MMEB-train (network access required) and iterates the
    stream, printing each example.
    """
    dataset = load_dataset("TIGER-Lab/MMEB-train", "OK-VQA", split="original")
    dataset = dataset.select(range(1000)) # step 1 select (works)
    dataset = dataset.to_iterable_dataset()
    dataset = dataset.shuffle(buffer_size=1024 * 16, seed=42) # step 2 shuffle (works)
    dataset = dataset.map(lambda x: data_prepare(x), batched=True, batch_size=1024 * 4) # cannot use drop_last_batch=True
    # dataset = dataset._resolve_features()
    for batch in dataset:
        print(batch)
    pass
32
+
33
+
34
+
35
if __name__ == "__main__":
    # Run the MMEB probe by default; uncomment to run the official streaming demo.
    # official_example()
    load_mmeb()
adhoc/merge_checkpoint.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.arguments import ModelArguments
2
+ from transformers import HfArgumentParser, AutoProcessor
3
+
4
+ from src.model.model_token_pooling import MMEBModel
5
+ from src.model.processor import get_backbone_name, load_processor
6
+
7
+
8
+ def main():
9
+ parser = HfArgumentParser(ModelArguments)
10
+ model_args, = parser.parse_args_into_dataclasses()
11
+ model_args: ModelArguments
12
+
13
+ model = MMEBModel.build(model_args)
14
+ model_backbone = get_backbone_name(hf_config=model.config)
15
+ setattr(model_args, "model_backbone", model_backbone)
16
+ # processor.tokenizer.padding_side = "right"
17
+ model = MMEBModel.load(model_args, is_trainable=False)
18
+ model.config.save_pretrained(f'{model_args.model_name}/full_model/', safe_serialization=False)
19
+ processor = load_processor(model_args)
20
+ processor.save_pretrained(f'{model_args.model_name}/full_model/', safe_serialization=False)
21
+ model.encoder._hf_peft_config_loaded = False
22
+ model.encoder.save_pretrained(f'{model_args.model_name}/full_model/', safe_serialization=False)
23
+
24
+
25
+ if __name__ == "__main__":
26
+ main()
adhoc/plot.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import matplotlib.pyplot as plt

# Ablation data: overall performance (%) vs. batch size, training steps, crop count.
batch_sizes = [128, 256, 512, 1024]
batch_perf = [49.5, 52.1, 54.3, 55.9]
step_sizes = [1000, 2000, 4000, 8000]
step_perf = [49.8, 52.0, 53.8, 55.3]
num_crops = [2, 4, 8, 16]
crop_perf = [47.1, 52.0, 54.2, 54.8]
# Plot: three side-by-side line charts, one per ablation axis
fig, axs = plt.subplots(1, 3, figsize=(10, 3))
# Batch size subplot
axs[0].plot(batch_sizes, batch_perf, marker='o', color='steelblue')
axs[0].set_title('Batch Size Influence on Performance', fontsize=9, fontweight='bold')
axs[0].set_xlabel('Batch Size')
axs[0].set_ylabel('Performance (%)')
# Step size subplot
axs[1].plot(step_sizes, step_perf, marker='s', linestyle='--', color='green')
axs[1].set_title('Step Size Influence on Performance', fontsize=9, fontweight='bold')
axs[1].set_xlabel('Step Size')
axs[1].set_ylabel('Performance (%)')
# Number of crops subplot
axs[2].plot(num_crops, crop_perf, marker='^', linestyle='-.', color='firebrick')
axs[2].set_title('Number of Crops Influence on Performance', fontsize=9, fontweight='bold')
axs[2].set_xlabel('Number of Crops')
axs[2].set_ylabel('Performance (%)')
# Tidy up
for ax in axs:
    ax.grid(True)
plt.tight_layout()
# BUGFIX: save BEFORE show(). plt.show() blocks, and once its window is closed
# the current figure is torn down, so calling savefig afterwards produced an
# empty PDF. (plot2.py already uses the correct save-then-show order.)
plt.savefig("performance_plots_high_res.pdf", format='pdf', dpi=300)
plt.show()
adhoc/plot2.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart: performance per modality at three LoRA ranks.
# Data
modalities = ["Image", "VisDoc", "Video"]
lora_8 = [62.7, 52.5, 32.4]
lora_16 = [63.2, 52.6, 33.5]
lora_32 = [60.0, 52.1, 32.7]

# Bar placement
x = np.array([0, 1, 2]) # modality positions
bar_width = 0.2
offset = 0.24 # control spacing between LoRA bars

# Font settings
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['font.size'] = 14

# Create plot: one bar per LoRA rank around each modality tick
plt.figure(figsize=(7, 6))
bars1 = plt.bar(x - offset, lora_8, bar_width, label='LoRA 8', color='#1f77b4')
bars2 = plt.bar(x, lora_16, bar_width, label='LoRA 16', color='#ff7f0e')
bars3 = plt.bar(x + offset, lora_32, bar_width, label='LoRA 32', color='#2ca02c')

# Axes and labels
plt.xticks(x, modalities, fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel("Modality", fontsize=18)
plt.ylabel("Performance", fontsize=18)
plt.title("Performance under Different LoRA Ranks", fontsize=18)
plt.ylim(30, 70)

# Annotate each bar with its value just above the top
for bars in [bars1, bars2, bars3]:
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2, height + 0.5,
                 f'{height:.1f}', ha='center', va='bottom', fontsize=14)

# Legend without frame
plt.legend(frameon=False, fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()

# Save as PDF (saved before show() so the figure is still alive)
plt.savefig("lora_rank_comparison_y30_wider.pdf", format='pdf', dpi=300)
plt.show()
adhoc/test_ddp.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.distributed as dist
4
+ import socket
5
+
6
+
7
def main():
    """Minimal NCCL DDP smoke test: init the process group, print identity, tear down.

    Expects torchrun-style environment variables: RANK, LOCAL_RANK, WORLD_SIZE,
    MASTER_ADDR, MASTER_PORT.
    """
    print(f"[Rank {os.environ.get('RANK')}] Hostname: {socket.gethostname()} | Master: {os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}")
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    print(f"[rank {rank}] hostname: {os.uname().nodename}, MASTER_ADDR: {os.environ['MASTER_ADDR']}")
    print(f"Starting rank {rank}, local rank {local_rank}, world size {world_size}")

    # NCCL backend: bind this process to its local GPU
    dist.init_process_group("nccl")
    torch.cuda.set_device(local_rank)

    print(f"Hello from rank {rank} out of {world_size}")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
adhoc/testset_stats.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sys
3
+
4
+ import numpy as np
5
+
6
+ from src.arguments import ModelArguments, DataArguments, TrainingArguments
7
+ from transformers import HfArgumentParser, AutoProcessor
8
+ from src.dataset import EvalDataset
9
+ import re
10
+
11
def main():
    """Print the average target-text token length for each selected eval dataset."""
    # torchrun passes --local-rank=N; rewrite it into the --local_rank form
    # that HfArgumentParser/TrainingArguments expects.
    for arg in sys.argv:
        if arg.startswith("--local-rank="):
            rank = arg.split("=")[1]
            sys.argv.remove(arg)
            sys.argv.append('--local_rank')
            sys.argv.append(rank)
    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    model_args: ModelArguments
    data_args: DataArguments
    training_args: TrainingArguments

    # Only GQA enabled by default; uncomment groups below for the full sweep.
    datasets = [
        "GQA",
        # "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R",
        # "ObjectNet", "Country211",
        # "OK-VQA", "A-OKVQA", "DocVQA", "InfographicsVQA", "ChartQA", "Visual7W", "ScienceQA", "VizWiz", "GQA",
        # "TextVQA",
        # "VisDial", "CIRR", "VisualNews_t2i", "VisualNews_i2t", "MSCOCO_t2i", "MSCOCO_i2t", "NIGHTS", "WebQA",
        # "FashionIQ", "Wiki-SS-NQ", "OVEN", "EDIS",
        # "MSCOCO", "RefCOCO", "RefCOCO-Matching", "Visual7W-Pointing"
    ]

    # ToDo: This part of code is a little bit hacky. Need to refactor later.
    for idx, subset in enumerate(datasets):
        eval_qry_dataset = EvalDataset(
            data_args=data_args,
            model_args=model_args,
            subset=subset,
            text_field="qry_text",
            img_path_field="qry_img_path",
        )
        eval_tgt_dataset = EvalDataset(
            data_args=data_args,
            model_args=model_args,
            subset=subset,
            text_field="tgt_text",
            img_path_field="tgt_img_path",
        )
        tgttokens = []
        tgtstr_lens = []
        for tgt in eval_tgt_dataset:
            # print(tgt)
            # NOTE(review): splitting on non-letters also yields empty strings,
            # which are counted here, so lengths overestimate true word counts.
            tokens = re.split('[^a-zA-Z]', tgt[0])
            tgttokens.append(tokens)
            tgtstr_lens.append(len(tokens))
            pass

        print(f'dataset: {subset}')
        print(f'tgt-avg-len: {np.mean(tgtstr_lens)}')
        pass


if __name__ == "__main__":
    main()
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Split the ColPali training set by its 'source' field and write each split
to a standalone Parquet file."""
from datasets import load_dataset, Dataset
from collections import defaultdict
import os
from tqdm import tqdm

# Destination for the per-source Parquet shards.
output_dir = "/fsx/sfr/data/MMEB/Visual_Doc/vidore"
os.makedirs(output_dir, exist_ok=True)

# Pull down the full training split.
dataset = load_dataset("vidore/colpali_train_set", split="train")

# Bucket every example under its originating source.
source_splits = defaultdict(list)
for example in tqdm(dataset):
    source_splits[example['source']].append(example)

# Materialize one Parquet file per source bucket.
for source, examples in source_splits.items():
    print(f"{source}: {len(examples)} examples")
    shard_path = os.path.join(output_dir, f"{source}.parquet")
    Dataset.from_list(examples).to_parquet(shard_path)

print(f"Saved {len(source_splits)} source-based splits as Parquet to {output_dir}/")
adhoc/visual_doc/category_visrag_training.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Split each configured VisRAG training dataset by its 'source' field and
write the splits as Parquet shards under a per-dataset subfolder."""
from datasets import load_dataset, Dataset
from collections import defaultdict
import os
from tqdm import tqdm

# Root directory that receives one subfolder per processed dataset.
base_output_dir = "/fsx/sfr/data/MMEB/Visual_Doc/visrag"

# HuggingFace dataset name -> output subfolder name.
datasets_to_process = {
    'openbmb/VisRAG-Ret-Train-In-domain-data': 'Train_in_domain_data',
}

for data_name, folder_name in datasets_to_process.items():
    print(f"\nProcessing: {data_name}")

    # Download the training split.
    train_split = load_dataset(data_name, split="train")

    # Bucket rows by their originating source.
    buckets = defaultdict(list)
    for row in tqdm(train_split):
        buckets[row['source']].append(row)

    # Ensure the per-dataset output folder exists.
    output_dir = os.path.join(base_output_dir, folder_name)
    os.makedirs(output_dir, exist_ok=True)

    # Write one Parquet shard per source bucket.
    for source, rows in buckets.items():
        print(f"{source}: {len(rows)} examples")
        shard_path = os.path.join(output_dir, f"{source}.parquet")
        Dataset.from_list(rows).to_parquet(shard_path)

    print(f"Saved {len(buckets)} source-based splits to: {output_dir}/")
adhoc/visual_doc/check_corpus.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
"""Spot-check one per-source Parquet shard by printing its first record."""
from datasets import load_dataset

# Path to the shard under inspection.
dataset_path = "/fsx/sfr/data/MMEB/Visual_Doc/vidore/Infographic-VQA.parquet"

shard = load_dataset("parquet", data_files={"train": dataset_path}, split="train")

print(shard[0])
adhoc/visual_doc/mmdoclong-doc.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build a document-level retrieval benchmark from MMLongBench-Doc.

Renders each referenced PDF to per-page images and writes BEIR-style
queries/corpus/qrels JSONL files. Unlike the sibling mmdoclong.py (which
labels only the annotated evidence pages), this variant marks EVERY page of
the query's source document as relevant (score 1).
"""
from datasets import load_dataset
import fitz  # PyMuPDF
from PIL import Image
import os
import json
import base64
from io import BytesIO
import ast

# Load dataset
dataset = load_dataset("yubo2333/MMLongBench-Doc")["train"]

# Directory containing PDFs
pdf_dir = "/fsx/sfr/data/MMEB/Visual_Doc/mmlongbench/documents"

def encode_image(image):
    # Serialize a PIL image to base64-encoded PNG bytes for JSONL storage.
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()

# Dictionary to store images
all_images = {}
processed_pdfs = {}  # pdf_file_name -> list of rendered per-page PIL images
pdf_corpus_mapping = {}  # Mapping from pdf_file_name to base corpus_id
existing_corpus_ids = set()  # Track already added corpus-ids

queries = []
corpus = []
qrels = []
corpus_counter = 0  # next free corpus-id; advanced by page count per new PDF

# Process each PDF
for qid, doc in enumerate(dataset):
    pdf_file_name = doc["doc_id"]
    pdf_path = os.path.join(pdf_dir, pdf_file_name)

    # NOTE(review): `evidence_pages` is parsed with ast.literal_eval below,
    # which implies it is a string; this `== []` comparison therefore likely
    # never matches -- confirm the column type.
    if doc['evidence_pages'] == []:
        continue

    # Ensure the file exists before processing
    if not os.path.exists(pdf_path):
        print(f"Warning: PDF file {pdf_file_name} not found. Skipping.")
        continue

    if pdf_file_name not in processed_pdfs:
        # Open the PDF
        pdf_document = fitz.open(pdf_path)
        images = []

        # Convert each page to an image
        for page_number in range(len(pdf_document)):
            page = pdf_document[page_number]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)

        processed_pdfs[pdf_file_name] = images
        pdf_corpus_mapping[pdf_file_name] = corpus_counter
        corpus_counter += len(images)  # Increment by number of images
    else:
        images = processed_pdfs[pdf_file_name]

    # Ensure pdf_file_name is in pdf_corpus_mapping before access
    if pdf_file_name not in pdf_corpus_mapping:
        print(f"Error: {pdf_file_name} not found in pdf_corpus_mapping. Skipping.")
        continue

    base_corpus_id = pdf_corpus_mapping[pdf_file_name]
    all_images[pdf_file_name] = images

    # Parse the evidence-page annotation; skip rows with malformed values.
    try:
        evidence_pages = ast.literal_eval(doc['evidence_pages'])
        if not isinstance(evidence_pages, list):
            raise ValueError("Invalid evidence pages format")
    except Exception as e:
        print(f"Error parsing evidence pages for {pdf_file_name}: {e}")
        continue

    if len(evidence_pages) == 0:
        continue
    queries.append({
        "query-id": qid,
        "query": doc["question"],
        "corpus_range": list(range(base_corpus_id, base_corpus_id + len(images)))
    })

    # Document-level relevance: every page of the source PDF scores 1
    # (the evidence_pages list only gates whether the query is kept).
    for img_id, _ in enumerate(images):
        qrels.append({
            'query-id': qid,
            'corpus-id': base_corpus_id + img_id,
            'score': 1
        })

    # Store encoded images in corpus if not already added
    for img_id, image in enumerate(images):
        corpus_id = base_corpus_id + img_id  # Fix corpus ID numbering
        if corpus_id not in existing_corpus_ids:
            corpus.append({
                "corpus-id": corpus_id,
                "image": encode_image(image)
            })
            existing_corpus_ids.add(corpus_id)

# Function to save data in JSONL format
def save_jsonl(filename, data):
    # Write one JSON object per line.
    with open(filename, "w", encoding="utf-8") as f:
        for entry in data:
            json.dump(entry, f)
            f.write("\n")

print('size of qrels:', len(qrels))
print('size of queries:', len(queries))
print('size of corpus:', len(corpus))

save_dir = "/fsx/sfr/data/MMEB/Visual_Doc/mmlongbench/test-doc/"
os.makedirs(save_dir, exist_ok=True)
queries_file = "queries.jsonl"
corpus_file = "corpus.jsonl"
qrels_file = "qrels.jsonl"

# Save to JSONL
save_jsonl(os.path.join(save_dir, queries_file), queries)
save_jsonl(os.path.join(save_dir, corpus_file), corpus)
save_jsonl(os.path.join(save_dir, qrels_file), qrels)
adhoc/visual_doc/mmdoclong.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build a page-level retrieval benchmark from MMLongBench-Doc.

Renders each referenced PDF to per-page images and writes BEIR-style
queries/corpus/qrels JSONL files. Only the annotated evidence pages of a
query's source document are marked relevant (score 1); compare with the
sibling mmdoclong-doc.py, which marks every page of the document.
"""
from datasets import load_dataset
import fitz  # PyMuPDF
from PIL import Image
import os
import json
import base64
from io import BytesIO
import ast

# Load dataset
dataset = load_dataset("yubo2333/MMLongBench-Doc")["train"]

# Directory containing PDFs
pdf_dir = "/fsx/sfr/data/MMEB/Visual_Doc/mmlongbench/documents"

def encode_image(image):
    # Serialize a PIL image to base64-encoded PNG bytes for JSONL storage.
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()

# Dictionary to store images
all_images = {}
processed_pdfs = {}  # pdf_file_name -> list of rendered per-page PIL images
pdf_corpus_mapping = {}  # Mapping from pdf_file_name to base corpus_id
existing_corpus_ids = set()  # Track already added corpus-ids

queries = []
corpus = []
qrels = []
corpus_counter = 0  # next free corpus-id; advanced by page count per new PDF

# Process each PDF
for qid, doc in enumerate(dataset):
    pdf_file_name = doc["doc_id"]
    pdf_path = os.path.join(pdf_dir, pdf_file_name)

    # NOTE(review): `evidence_pages` is parsed with ast.literal_eval below,
    # which implies it is a string; this `== []` comparison therefore likely
    # never matches -- confirm the column type.
    if doc['evidence_pages'] == []:
        continue

    # Ensure the file exists before processing
    if not os.path.exists(pdf_path):
        print(f"Warning: PDF file {pdf_file_name} not found. Skipping.")
        continue

    if pdf_file_name not in processed_pdfs:
        # Open the PDF
        pdf_document = fitz.open(pdf_path)
        images = []

        # Convert each page to an image
        for page_number in range(len(pdf_document)):
            page = pdf_document[page_number]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)

        processed_pdfs[pdf_file_name] = images
        pdf_corpus_mapping[pdf_file_name] = corpus_counter
        corpus_counter += len(images)  # Increment by number of images
    else:
        images = processed_pdfs[pdf_file_name]

    # Ensure pdf_file_name is in pdf_corpus_mapping before access
    if pdf_file_name not in pdf_corpus_mapping:
        print(f"Error: {pdf_file_name} not found in pdf_corpus_mapping. Skipping.")
        continue

    base_corpus_id = pdf_corpus_mapping[pdf_file_name]
    all_images[pdf_file_name] = images

    # Parse the evidence-page annotation; skip rows with malformed values.
    try:
        evidence_pages = ast.literal_eval(doc['evidence_pages'])
        if not isinstance(evidence_pages, list):
            raise ValueError("Invalid evidence pages format")
    except Exception as e:
        print(f"Error parsing evidence pages for {pdf_file_name}: {e}")
        continue

    if len(evidence_pages) == 0:
        continue
    queries.append({
        "query-id": qid,
        "query": doc["question"],
        "corpus_range": list(range(base_corpus_id, base_corpus_id + len(images)))
    })

    # Page-level relevance: only annotated evidence pages score 1.
    # NOTE(review): assumes the page numbers are 0-based offsets into the
    # rendered pages; if the dataset is 1-indexed this is off by one -- confirm.
    for page_number in evidence_pages:
        qrels.append({
            'query-id': qid,
            'corpus-id': base_corpus_id + int(page_number),
            'score': 1
        })

    # Store encoded images in corpus if not already added
    for img_id, image in enumerate(images):
        corpus_id = base_corpus_id + img_id  # Fix corpus ID numbering
        if corpus_id not in existing_corpus_ids:
            corpus.append({
                "corpus-id": corpus_id,
                "image": encode_image(image)
            })
            existing_corpus_ids.add(corpus_id)

# Function to save data in JSONL format
def save_jsonl(filename, data):
    # Write one JSON object per line.
    with open(filename, "w", encoding="utf-8") as f:
        for entry in data:
            json.dump(entry, f)
            f.write("\n")

print('size of qrels:', len(qrels))
print('size of queries:', len(queries))
print('size of corpus:', len(corpus))

save_dir = "/fsx/sfr/data/MMEB/Visual_Doc/mmlongbench/test/"
os.makedirs(save_dir, exist_ok=True)
queries_file = "queries.jsonl"
corpus_file = "corpus.jsonl"
qrels_file = "qrels.jsonl"

# Save to JSONL
save_jsonl(os.path.join(save_dir, queries_file), queries)
save_jsonl(os.path.join(save_dir, corpus_file), corpus)
save_jsonl(os.path.join(save_dir, qrels_file), qrels)
adhoc/visual_doc/vidoseek.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build a page-level retrieval benchmark from the ViDoSeek annotations.

Renders each referenced PDF to per-page images and writes BEIR-style
queries/corpus/qrels JSONL files. Scoring convention: every page of the
query's source PDF gets score 2, and the annotated reference pages
additionally get score 3.
"""
from datasets import load_dataset  # NOTE: unused here (data comes from JSON); kept for parity with sibling scripts
import fitz  # PyMuPDF
from PIL import Image
import os
import json
import base64
from io import BytesIO

# Load dataset
file_path = "/fsx/sfr/data/MMEB/Visual_Doc/ViDoSeek/vidoseek.json"
with open(file_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)


def encode_image(image):
    # Serialize a PIL image to base64-encoded PNG bytes for JSONL storage.
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()


pdf_dir = "/fsx/sfr/data/MMEB/Visual_Doc/ViDoSeek/vidoseek_pdf_document"

all_images = {}
processed_pdfs = {}  # pdf_file_name -> list of rendered per-page PIL images
pdf_corpus_mapping = {}  # Mapping from pdf_file_name to base corpus_id
existing_corpus_ids = set()  # Track already added corpus-ids

queries = []
corpus = []
qrels = []
corpus_counter = 0  # next free corpus-id; advanced by page count per new PDF

# Process each PDF
for qid, doc in enumerate(dataset['examples']):
    pdf_file_name = doc["meta_info"]['file_name']
    pdf_path = os.path.join(pdf_dir, pdf_file_name)

    if doc['meta_info']['reference_page'] == []:
        continue

    if pdf_file_name not in processed_pdfs:
        # BUGFIX: previously a missing PDF fell through with `images`
        # undefined (NameError on the first miss, stale pages afterwards)
        # and no pdf_corpus_mapping entry (KeyError below). Skip missing
        # files up front, matching the mmlongbench scripts.
        if not os.path.exists(pdf_path):
            print(f"Warning: PDF file {pdf_file_name} not found. Skipping.")
            continue

        # Open the PDF and rasterize every page.
        pdf_document = fitz.open(pdf_path)
        images = []
        for page_number in range(len(pdf_document)):
            page = pdf_document[page_number]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)

        processed_pdfs[pdf_file_name] = images
        pdf_corpus_mapping[pdf_file_name] = corpus_counter
        corpus_counter += len(images)
    else:
        images = processed_pdfs[pdf_file_name]

    base_corpus_id = pdf_corpus_mapping[pdf_file_name]
    all_images[pdf_file_name] = images

    queries.append({
        "query-id": qid,
        "query": doc["query"],
        "corpus_range": list(range(base_corpus_id, base_corpus_id + len(images)))
    })

    # Assign qrels for pages in the same PDF (score = 2)
    for img_id, _ in enumerate(images):
        qrels.append({
            'query-id': qid,
            'corpus-id': base_corpus_id + img_id,
            'score': 2
        })

    # Assign qrels for reference pages (score = 3)
    # NOTE(review): assumes reference_page values are 0-based page offsets --
    # confirm against the ViDoSeek annotation format.
    for page_number in doc['meta_info']['reference_page']:
        qrels.append({
            'query-id': qid,
            'corpus-id': base_corpus_id + int(page_number),
            'score': 3
        })

    # Store encoded images in corpus if not already added
    for img_id, image in enumerate(images):
        corpus_id = base_corpus_id + img_id
        if corpus_id not in existing_corpus_ids:
            corpus.append({
                "corpus-id": corpus_id,
                "image": encode_image(image)
            })
            existing_corpus_ids.add(corpus_id)


# Function to save data in JSONL format
def save_jsonl(filename, data):
    # Write one JSON object per line.
    with open(filename, "w", encoding="utf-8") as f:
        for entry in data:
            json.dump(entry, f)
            f.write("\n")


print('size of qrels', len(qrels))
print('size of queries', len(queries))
print('size of corpus', len(corpus))

save_dir = "/fsx/sfr/data/MMEB/Visual_Doc/ViDoSeek/test/"
os.makedirs(save_dir, exist_ok=True)
queries_file = "queries.jsonl"
corpus_file = "corpus.jsonl"
qrels_file = "qrels.jsonl"

# Save to JSONL (os.path.join is robust regardless of the trailing slash)
save_jsonl(os.path.join(save_dir, queries_file), queries)
save_jsonl(os.path.join(save_dir, corpus_file), corpus)
save_jsonl(os.path.join(save_dir, qrels_file), qrels)
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2_5_VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 1003520,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2_5_VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "max_pixels": 1003520,
26
+ "min_pixels": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "processor_class": "Qwen2_5_VLProcessor",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,734 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.05202913631633715,
6
+ "eval_steps": 500,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0005202913631633715,
14
+ "grad_norm": 7.347542762756348,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.428,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.001040582726326743,
21
+ "grad_norm": 8.964370727539062,
22
+ "learning_rate": 5e-06,
23
+ "loss": 1.3459,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.0015608740894901144,
28
+ "grad_norm": 10.382317543029785,
29
+ "learning_rate": 1e-05,
30
+ "loss": 1.54,
31
+ "step": 3
32
+ },
33
+ {
34
+ "epoch": 0.002081165452653486,
35
+ "grad_norm": 9.52104663848877,
36
+ "learning_rate": 1.5e-05,
37
+ "loss": 1.5728,
38
+ "step": 4
39
+ },
40
+ {
41
+ "epoch": 0.0026014568158168575,
42
+ "grad_norm": 8.74624252319336,
43
+ "learning_rate": 2e-05,
44
+ "loss": 1.5368,
45
+ "step": 5
46
+ },
47
+ {
48
+ "epoch": 0.003121748178980229,
49
+ "grad_norm": 7.444849491119385,
50
+ "learning_rate": 2.5e-05,
51
+ "loss": 1.2919,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.0036420395421436005,
56
+ "grad_norm": 8.439070701599121,
57
+ "learning_rate": 3e-05,
58
+ "loss": 1.1753,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.004162330905306972,
63
+ "grad_norm": 8.195757865905762,
64
+ "learning_rate": 3.5000000000000004e-05,
65
+ "loss": 1.2146,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 0.004682622268470343,
70
+ "grad_norm": 9.419265747070312,
71
+ "learning_rate": 4e-05,
72
+ "loss": 1.4365,
73
+ "step": 9
74
+ },
75
+ {
76
+ "epoch": 0.005202913631633715,
77
+ "grad_norm": 9.609909057617188,
78
+ "learning_rate": 4.4999999999999996e-05,
79
+ "loss": 1.3843,
80
+ "step": 10
81
+ },
82
+ {
83
+ "epoch": 0.005723204994797087,
84
+ "grad_norm": 9.44714069366455,
85
+ "learning_rate": 5e-05,
86
+ "loss": 1.2305,
87
+ "step": 11
88
+ },
89
+ {
90
+ "epoch": 0.006243496357960458,
91
+ "grad_norm": 7.349897384643555,
92
+ "learning_rate": 5.5e-05,
93
+ "loss": 1.0253,
94
+ "step": 12
95
+ },
96
+ {
97
+ "epoch": 0.006763787721123829,
98
+ "grad_norm": 8.391256332397461,
99
+ "learning_rate": 6e-05,
100
+ "loss": 1.2242,
101
+ "step": 13
102
+ },
103
+ {
104
+ "epoch": 0.007284079084287201,
105
+ "grad_norm": 8.2301025390625,
106
+ "learning_rate": 6.500000000000001e-05,
107
+ "loss": 1.3285,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.007804370447450572,
112
+ "grad_norm": 7.3472981452941895,
113
+ "learning_rate": 7.000000000000001e-05,
114
+ "loss": 1.2109,
115
+ "step": 15
116
+ },
117
+ {
118
+ "epoch": 0.008324661810613945,
119
+ "grad_norm": 6.808696746826172,
120
+ "learning_rate": 7.5e-05,
121
+ "loss": 0.8487,
122
+ "step": 16
123
+ },
124
+ {
125
+ "epoch": 0.008844953173777315,
126
+ "grad_norm": 7.667227268218994,
127
+ "learning_rate": 8e-05,
128
+ "loss": 1.1392,
129
+ "step": 17
130
+ },
131
+ {
132
+ "epoch": 0.009365244536940686,
133
+ "grad_norm": 7.13895845413208,
134
+ "learning_rate": 8.5e-05,
135
+ "loss": 1.0382,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 0.009885535900104058,
140
+ "grad_norm": 8.155549049377441,
141
+ "learning_rate": 8.999999999999999e-05,
142
+ "loss": 1.0287,
143
+ "step": 19
144
+ },
145
+ {
146
+ "epoch": 0.01040582726326743,
147
+ "grad_norm": 6.322030544281006,
148
+ "learning_rate": 9.5e-05,
149
+ "loss": 0.8726,
150
+ "step": 20
151
+ },
152
+ {
153
+ "epoch": 0.010926118626430802,
154
+ "grad_norm": 6.219326019287109,
155
+ "learning_rate": 0.0001,
156
+ "loss": 0.8133,
157
+ "step": 21
158
+ },
159
+ {
160
+ "epoch": 0.011446409989594173,
161
+ "grad_norm": 3.4698593616485596,
162
+ "learning_rate": 0.000105,
163
+ "loss": 0.7479,
164
+ "step": 22
165
+ },
166
+ {
167
+ "epoch": 0.011966701352757543,
168
+ "grad_norm": 3.6907284259796143,
169
+ "learning_rate": 0.00011,
170
+ "loss": 0.8183,
171
+ "step": 23
172
+ },
173
+ {
174
+ "epoch": 0.012486992715920915,
175
+ "grad_norm": 5.981033802032471,
176
+ "learning_rate": 0.000115,
177
+ "loss": 0.587,
178
+ "step": 24
179
+ },
180
+ {
181
+ "epoch": 0.013007284079084287,
182
+ "grad_norm": 4.62821626663208,
183
+ "learning_rate": 0.00012,
184
+ "loss": 0.6687,
185
+ "step": 25
186
+ },
187
+ {
188
+ "epoch": 0.013527575442247659,
189
+ "grad_norm": 4.285324573516846,
190
+ "learning_rate": 0.000125,
191
+ "loss": 0.6252,
192
+ "step": 26
193
+ },
194
+ {
195
+ "epoch": 0.01404786680541103,
196
+ "grad_norm": 4.518625736236572,
197
+ "learning_rate": 0.00013000000000000002,
198
+ "loss": 0.5654,
199
+ "step": 27
200
+ },
201
+ {
202
+ "epoch": 0.014568158168574402,
203
+ "grad_norm": 3.4108848571777344,
204
+ "learning_rate": 0.000135,
205
+ "loss": 0.6086,
206
+ "step": 28
207
+ },
208
+ {
209
+ "epoch": 0.015088449531737774,
210
+ "grad_norm": 2.748203754425049,
211
+ "learning_rate": 0.00014000000000000001,
212
+ "loss": 0.552,
213
+ "step": 29
214
+ },
215
+ {
216
+ "epoch": 0.015608740894901144,
217
+ "grad_norm": 2.817368507385254,
218
+ "learning_rate": 0.000145,
219
+ "loss": 0.6438,
220
+ "step": 30
221
+ },
222
+ {
223
+ "epoch": 0.016129032258064516,
224
+ "grad_norm": 2.5259974002838135,
225
+ "learning_rate": 0.00015,
226
+ "loss": 0.7379,
227
+ "step": 31
228
+ },
229
+ {
230
+ "epoch": 0.01664932362122789,
231
+ "grad_norm": 2.2101669311523438,
232
+ "learning_rate": 0.000155,
233
+ "loss": 0.4164,
234
+ "step": 32
235
+ },
236
+ {
237
+ "epoch": 0.01716961498439126,
238
+ "grad_norm": 1.9261822700500488,
239
+ "learning_rate": 0.00016,
240
+ "loss": 0.2381,
241
+ "step": 33
242
+ },
243
+ {
244
+ "epoch": 0.01768990634755463,
245
+ "grad_norm": 3.6622889041900635,
246
+ "learning_rate": 0.000165,
247
+ "loss": 0.868,
248
+ "step": 34
249
+ },
250
+ {
251
+ "epoch": 0.018210197710718003,
252
+ "grad_norm": 3.7180657386779785,
253
+ "learning_rate": 0.00017,
254
+ "loss": 0.6459,
255
+ "step": 35
256
+ },
257
+ {
258
+ "epoch": 0.018730489073881373,
259
+ "grad_norm": 1.89342200756073,
260
+ "learning_rate": 0.000175,
261
+ "loss": 0.3684,
262
+ "step": 36
263
+ },
264
+ {
265
+ "epoch": 0.019250780437044746,
266
+ "grad_norm": 2.9859375953674316,
267
+ "learning_rate": 0.00017999999999999998,
268
+ "loss": 0.6406,
269
+ "step": 37
270
+ },
271
+ {
272
+ "epoch": 0.019771071800208116,
273
+ "grad_norm": 2.1704893112182617,
274
+ "learning_rate": 0.000185,
275
+ "loss": 0.399,
276
+ "step": 38
277
+ },
278
+ {
279
+ "epoch": 0.02029136316337149,
280
+ "grad_norm": 1.5741156339645386,
281
+ "learning_rate": 0.00019,
282
+ "loss": 0.2802,
283
+ "step": 39
284
+ },
285
+ {
286
+ "epoch": 0.02081165452653486,
287
+ "grad_norm": 1.5053398609161377,
288
+ "learning_rate": 0.00019500000000000002,
289
+ "loss": 0.2899,
290
+ "step": 40
291
+ },
292
+ {
293
+ "epoch": 0.02133194588969823,
294
+ "grad_norm": 2.4964590072631836,
295
+ "learning_rate": 0.0002,
296
+ "loss": 0.4765,
297
+ "step": 41
298
+ },
299
+ {
300
+ "epoch": 0.021852237252861603,
301
+ "grad_norm": 1.7406848669052124,
302
+ "learning_rate": 0.000205,
303
+ "loss": 0.3226,
304
+ "step": 42
305
+ },
306
+ {
307
+ "epoch": 0.022372528616024973,
308
+ "grad_norm": 4.920353412628174,
309
+ "learning_rate": 0.00021,
310
+ "loss": 0.8643,
311
+ "step": 43
312
+ },
313
+ {
314
+ "epoch": 0.022892819979188347,
315
+ "grad_norm": 5.375717639923096,
316
+ "learning_rate": 0.000215,
317
+ "loss": 0.654,
318
+ "step": 44
319
+ },
320
+ {
321
+ "epoch": 0.023413111342351717,
322
+ "grad_norm": 4.912171840667725,
323
+ "learning_rate": 0.00022,
324
+ "loss": 0.5138,
325
+ "step": 45
326
+ },
327
+ {
328
+ "epoch": 0.023933402705515087,
329
+ "grad_norm": 1.8745571374893188,
330
+ "learning_rate": 0.00022500000000000002,
331
+ "loss": 0.194,
332
+ "step": 46
333
+ },
334
+ {
335
+ "epoch": 0.02445369406867846,
336
+ "grad_norm": 3.949474811553955,
337
+ "learning_rate": 0.00023,
338
+ "loss": 0.642,
339
+ "step": 47
340
+ },
341
+ {
342
+ "epoch": 0.02497398543184183,
343
+ "grad_norm": 3.1853504180908203,
344
+ "learning_rate": 0.000235,
345
+ "loss": 0.5319,
346
+ "step": 48
347
+ },
348
+ {
349
+ "epoch": 0.025494276795005204,
350
+ "grad_norm": 1.6487188339233398,
351
+ "learning_rate": 0.00024,
352
+ "loss": 0.2386,
353
+ "step": 49
354
+ },
355
+ {
356
+ "epoch": 0.026014568158168574,
357
+ "grad_norm": 2.2893128395080566,
358
+ "learning_rate": 0.000245,
359
+ "loss": 0.3759,
360
+ "step": 50
361
+ },
362
+ {
363
+ "epoch": 0.026534859521331947,
364
+ "grad_norm": 1.7786861658096313,
365
+ "learning_rate": 0.00025,
366
+ "loss": 0.4172,
367
+ "step": 51
368
+ },
369
+ {
370
+ "epoch": 0.027055150884495317,
371
+ "grad_norm": 2.229330062866211,
372
+ "learning_rate": 0.000255,
373
+ "loss": 0.48,
374
+ "step": 52
375
+ },
376
+ {
377
+ "epoch": 0.027575442247658687,
378
+ "grad_norm": 3.2765936851501465,
379
+ "learning_rate": 0.00026000000000000003,
380
+ "loss": 0.5127,
381
+ "step": 53
382
+ },
383
+ {
384
+ "epoch": 0.02809573361082206,
385
+ "grad_norm": 2.407878875732422,
386
+ "learning_rate": 0.00026500000000000004,
387
+ "loss": 0.4979,
388
+ "step": 54
389
+ },
390
+ {
391
+ "epoch": 0.02861602497398543,
392
+ "grad_norm": 2.218383312225342,
393
+ "learning_rate": 0.00027,
394
+ "loss": 0.4228,
395
+ "step": 55
396
+ },
397
+ {
398
+ "epoch": 0.029136316337148804,
399
+ "grad_norm": 1.7399003505706787,
400
+ "learning_rate": 0.000275,
401
+ "loss": 0.3607,
402
+ "step": 56
403
+ },
404
+ {
405
+ "epoch": 0.029656607700312174,
406
+ "grad_norm": 1.4118911027908325,
407
+ "learning_rate": 0.00028000000000000003,
408
+ "loss": 0.2743,
409
+ "step": 57
410
+ },
411
+ {
412
+ "epoch": 0.030176899063475548,
413
+ "grad_norm": 2.2282633781433105,
414
+ "learning_rate": 0.000285,
415
+ "loss": 0.3152,
416
+ "step": 58
417
+ },
418
+ {
419
+ "epoch": 0.030697190426638918,
420
+ "grad_norm": 1.9690927267074585,
421
+ "learning_rate": 0.00029,
422
+ "loss": 0.236,
423
+ "step": 59
424
+ },
425
+ {
426
+ "epoch": 0.031217481789802288,
427
+ "grad_norm": 1.8251880407333374,
428
+ "learning_rate": 0.000295,
429
+ "loss": 0.2945,
430
+ "step": 60
431
+ },
432
+ {
433
+ "epoch": 0.03173777315296566,
434
+ "grad_norm": 2.371242046356201,
435
+ "learning_rate": 0.0003,
436
+ "loss": 0.3196,
437
+ "step": 61
438
+ },
439
+ {
440
+ "epoch": 0.03225806451612903,
441
+ "grad_norm": 2.302980899810791,
442
+ "learning_rate": 0.000305,
443
+ "loss": 0.2548,
444
+ "step": 62
445
+ },
446
+ {
447
+ "epoch": 0.032778355879292405,
448
+ "grad_norm": 1.5861401557922363,
449
+ "learning_rate": 0.00031,
450
+ "loss": 0.3465,
451
+ "step": 63
452
+ },
453
+ {
454
+ "epoch": 0.03329864724245578,
455
+ "grad_norm": 2.5026137828826904,
456
+ "learning_rate": 0.000315,
457
+ "loss": 0.3962,
458
+ "step": 64
459
+ },
460
+ {
461
+ "epoch": 0.033818938605619145,
462
+ "grad_norm": 2.0949132442474365,
463
+ "learning_rate": 0.00032,
464
+ "loss": 0.3963,
465
+ "step": 65
466
+ },
467
+ {
468
+ "epoch": 0.03433922996878252,
469
+ "grad_norm": 1.5639926195144653,
470
+ "learning_rate": 0.00032500000000000004,
471
+ "loss": 0.1823,
472
+ "step": 66
473
+ },
474
+ {
475
+ "epoch": 0.03485952133194589,
476
+ "grad_norm": 2.0358474254608154,
477
+ "learning_rate": 0.00033,
478
+ "loss": 0.2778,
479
+ "step": 67
480
+ },
481
+ {
482
+ "epoch": 0.03537981269510926,
483
+ "grad_norm": 1.1801868677139282,
484
+ "learning_rate": 0.000335,
485
+ "loss": 0.197,
486
+ "step": 68
487
+ },
488
+ {
489
+ "epoch": 0.03590010405827263,
490
+ "grad_norm": 1.996211290359497,
491
+ "learning_rate": 0.00034,
492
+ "loss": 0.3872,
493
+ "step": 69
494
+ },
495
+ {
496
+ "epoch": 0.036420395421436005,
497
+ "grad_norm": 1.555777668952942,
498
+ "learning_rate": 0.000345,
499
+ "loss": 0.2224,
500
+ "step": 70
501
+ },
502
+ {
503
+ "epoch": 0.03694068678459938,
504
+ "grad_norm": 1.497721791267395,
505
+ "learning_rate": 0.00035,
506
+ "loss": 0.2542,
507
+ "step": 71
508
+ },
509
+ {
510
+ "epoch": 0.037460978147762745,
511
+ "grad_norm": 1.0776859521865845,
512
+ "learning_rate": 0.000355,
513
+ "loss": 0.1237,
514
+ "step": 72
515
+ },
516
+ {
517
+ "epoch": 0.03798126951092612,
518
+ "grad_norm": 1.9728138446807861,
519
+ "learning_rate": 0.00035999999999999997,
520
+ "loss": 0.2316,
521
+ "step": 73
522
+ },
523
+ {
524
+ "epoch": 0.03850156087408949,
525
+ "grad_norm": 1.0327483415603638,
526
+ "learning_rate": 0.000365,
527
+ "loss": 0.1536,
528
+ "step": 74
529
+ },
530
+ {
531
+ "epoch": 0.03902185223725286,
532
+ "grad_norm": 3.0641462802886963,
533
+ "learning_rate": 0.00037,
534
+ "loss": 0.3374,
535
+ "step": 75
536
+ },
537
+ {
538
+ "epoch": 0.03954214360041623,
539
+ "grad_norm": 1.374601125717163,
540
+ "learning_rate": 0.000375,
541
+ "loss": 0.1633,
542
+ "step": 76
543
+ },
544
+ {
545
+ "epoch": 0.040062434963579606,
546
+ "grad_norm": 3.1142971515655518,
547
+ "learning_rate": 0.00038,
548
+ "loss": 0.3083,
549
+ "step": 77
550
+ },
551
+ {
552
+ "epoch": 0.04058272632674298,
553
+ "grad_norm": 1.792457103729248,
554
+ "learning_rate": 0.00038500000000000003,
555
+ "loss": 0.2413,
556
+ "step": 78
557
+ },
558
+ {
559
+ "epoch": 0.041103017689906346,
560
+ "grad_norm": 1.4155240058898926,
561
+ "learning_rate": 0.00039000000000000005,
562
+ "loss": 0.334,
563
+ "step": 79
564
+ },
565
+ {
566
+ "epoch": 0.04162330905306972,
567
+ "grad_norm": 2.6872141361236572,
568
+ "learning_rate": 0.000395,
569
+ "loss": 0.328,
570
+ "step": 80
571
+ },
572
+ {
573
+ "epoch": 0.04214360041623309,
574
+ "grad_norm": 1.301841378211975,
575
+ "learning_rate": 0.0004,
576
+ "loss": 0.3391,
577
+ "step": 81
578
+ },
579
+ {
580
+ "epoch": 0.04266389177939646,
581
+ "grad_norm": 1.5664141178131104,
582
+ "learning_rate": 0.00040500000000000003,
583
+ "loss": 0.2262,
584
+ "step": 82
585
+ },
586
+ {
587
+ "epoch": 0.04318418314255983,
588
+ "grad_norm": 1.6563435792922974,
589
+ "learning_rate": 0.00041,
590
+ "loss": 0.3451,
591
+ "step": 83
592
+ },
593
+ {
594
+ "epoch": 0.043704474505723206,
595
+ "grad_norm": 1.2458600997924805,
596
+ "learning_rate": 0.000415,
597
+ "loss": 0.186,
598
+ "step": 84
599
+ },
600
+ {
601
+ "epoch": 0.04422476586888657,
602
+ "grad_norm": 1.3530123233795166,
603
+ "learning_rate": 0.00042,
604
+ "loss": 0.2447,
605
+ "step": 85
606
+ },
607
+ {
608
+ "epoch": 0.044745057232049947,
609
+ "grad_norm": 1.341471791267395,
610
+ "learning_rate": 0.000425,
611
+ "loss": 0.293,
612
+ "step": 86
613
+ },
614
+ {
615
+ "epoch": 0.04526534859521332,
616
+ "grad_norm": 1.2903335094451904,
617
+ "learning_rate": 0.00043,
618
+ "loss": 0.1058,
619
+ "step": 87
620
+ },
621
+ {
622
+ "epoch": 0.045785639958376693,
623
+ "grad_norm": 1.2263115644454956,
624
+ "learning_rate": 0.000435,
625
+ "loss": 0.1733,
626
+ "step": 88
627
+ },
628
+ {
629
+ "epoch": 0.04630593132154006,
630
+ "grad_norm": 2.077279806137085,
631
+ "learning_rate": 0.00044,
632
+ "loss": 0.1786,
633
+ "step": 89
634
+ },
635
+ {
636
+ "epoch": 0.046826222684703434,
637
+ "grad_norm": 1.2153059244155884,
638
+ "learning_rate": 0.00044500000000000003,
639
+ "loss": 0.202,
640
+ "step": 90
641
+ },
642
+ {
643
+ "epoch": 0.04734651404786681,
644
+ "grad_norm": 1.4943453073501587,
645
+ "learning_rate": 0.00045000000000000004,
646
+ "loss": 0.2266,
647
+ "step": 91
648
+ },
649
+ {
650
+ "epoch": 0.047866805411030174,
651
+ "grad_norm": 1.2306129932403564,
652
+ "learning_rate": 0.000455,
653
+ "loss": 0.1646,
654
+ "step": 92
655
+ },
656
+ {
657
+ "epoch": 0.04838709677419355,
658
+ "grad_norm": 0.9076014757156372,
659
+ "learning_rate": 0.00046,
660
+ "loss": 0.1725,
661
+ "step": 93
662
+ },
663
+ {
664
+ "epoch": 0.04890738813735692,
665
+ "grad_norm": 1.7097628116607666,
666
+ "learning_rate": 0.000465,
667
+ "loss": 0.2515,
668
+ "step": 94
669
+ },
670
+ {
671
+ "epoch": 0.049427679500520294,
672
+ "grad_norm": 1.1039310693740845,
673
+ "learning_rate": 0.00047,
674
+ "loss": 0.173,
675
+ "step": 95
676
+ },
677
+ {
678
+ "epoch": 0.04994797086368366,
679
+ "grad_norm": 1.1415642499923706,
680
+ "learning_rate": 0.000475,
681
+ "loss": 0.1644,
682
+ "step": 96
683
+ },
684
+ {
685
+ "epoch": 0.050468262226847034,
686
+ "grad_norm": 1.2579185962677002,
687
+ "learning_rate": 0.00048,
688
+ "loss": 0.1811,
689
+ "step": 97
690
+ },
691
+ {
692
+ "epoch": 0.05098855359001041,
693
+ "grad_norm": 1.0912175178527832,
694
+ "learning_rate": 0.00048499999999999997,
695
+ "loss": 0.1661,
696
+ "step": 98
697
+ },
698
+ {
699
+ "epoch": 0.051508844953173774,
700
+ "grad_norm": 1.124626874923706,
701
+ "learning_rate": 0.00049,
702
+ "loss": 0.204,
703
+ "step": 99
704
+ },
705
+ {
706
+ "epoch": 0.05202913631633715,
707
+ "grad_norm": 0.776817262172699,
708
+ "learning_rate": 0.000495,
709
+ "loss": 0.1299,
710
+ "step": 100
711
+ }
712
+ ],
713
+ "logging_steps": 1,
714
+ "max_steps": 1000,
715
+ "num_input_tokens_seen": 0,
716
+ "num_train_epochs": 1,
717
+ "save_steps": 100,
718
+ "stateful_callbacks": {
719
+ "TrainerControl": {
720
+ "args": {
721
+ "should_epoch_stop": false,
722
+ "should_evaluate": false,
723
+ "should_log": false,
724
+ "should_save": true,
725
+ "should_training_stop": false
726
+ },
727
+ "attributes": {}
728
+ }
729
+ },
730
+ "total_flos": 0.0,
731
+ "train_batch_size": 128,
732
+ "trial_name": null,
733
+ "trial_params": null
734
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-400/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2_5_VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 1003520,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2_5_VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "max_pixels": 1003520,
26
+ "min_pixels": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "processor_class": "Qwen2_5_VLProcessor",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/trainer_state.json ADDED
@@ -0,0 +1,3534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.2601456815816857,
6
+ "eval_steps": 500,
7
+ "global_step": 500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0005202913631633715,
14
+ "grad_norm": 7.347542762756348,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.428,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.001040582726326743,
21
+ "grad_norm": 8.964370727539062,
22
+ "learning_rate": 5e-06,
23
+ "loss": 1.3459,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.0015608740894901144,
28
+ "grad_norm": 10.382317543029785,
29
+ "learning_rate": 1e-05,
30
+ "loss": 1.54,
31
+ "step": 3
32
+ },
33
+ {
34
+ "epoch": 0.002081165452653486,
35
+ "grad_norm": 9.52104663848877,
36
+ "learning_rate": 1.5e-05,
37
+ "loss": 1.5728,
38
+ "step": 4
39
+ },
40
+ {
41
+ "epoch": 0.0026014568158168575,
42
+ "grad_norm": 8.74624252319336,
43
+ "learning_rate": 2e-05,
44
+ "loss": 1.5368,
45
+ "step": 5
46
+ },
47
+ {
48
+ "epoch": 0.003121748178980229,
49
+ "grad_norm": 7.444849491119385,
50
+ "learning_rate": 2.5e-05,
51
+ "loss": 1.2919,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.0036420395421436005,
56
+ "grad_norm": 8.439070701599121,
57
+ "learning_rate": 3e-05,
58
+ "loss": 1.1753,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.004162330905306972,
63
+ "grad_norm": 8.195757865905762,
64
+ "learning_rate": 3.5000000000000004e-05,
65
+ "loss": 1.2146,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 0.004682622268470343,
70
+ "grad_norm": 9.419265747070312,
71
+ "learning_rate": 4e-05,
72
+ "loss": 1.4365,
73
+ "step": 9
74
+ },
75
+ {
76
+ "epoch": 0.005202913631633715,
77
+ "grad_norm": 9.609909057617188,
78
+ "learning_rate": 4.4999999999999996e-05,
79
+ "loss": 1.3843,
80
+ "step": 10
81
+ },
82
+ {
83
+ "epoch": 0.005723204994797087,
84
+ "grad_norm": 9.44714069366455,
85
+ "learning_rate": 5e-05,
86
+ "loss": 1.2305,
87
+ "step": 11
88
+ },
89
+ {
90
+ "epoch": 0.006243496357960458,
91
+ "grad_norm": 7.349897384643555,
92
+ "learning_rate": 5.5e-05,
93
+ "loss": 1.0253,
94
+ "step": 12
95
+ },
96
+ {
97
+ "epoch": 0.006763787721123829,
98
+ "grad_norm": 8.391256332397461,
99
+ "learning_rate": 6e-05,
100
+ "loss": 1.2242,
101
+ "step": 13
102
+ },
103
+ {
104
+ "epoch": 0.007284079084287201,
105
+ "grad_norm": 8.2301025390625,
106
+ "learning_rate": 6.500000000000001e-05,
107
+ "loss": 1.3285,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.007804370447450572,
112
+ "grad_norm": 7.3472981452941895,
113
+ "learning_rate": 7.000000000000001e-05,
114
+ "loss": 1.2109,
115
+ "step": 15
116
+ },
117
+ {
118
+ "epoch": 0.008324661810613945,
119
+ "grad_norm": 6.808696746826172,
120
+ "learning_rate": 7.5e-05,
121
+ "loss": 0.8487,
122
+ "step": 16
123
+ },
124
+ {
125
+ "epoch": 0.008844953173777315,
126
+ "grad_norm": 7.667227268218994,
127
+ "learning_rate": 8e-05,
128
+ "loss": 1.1392,
129
+ "step": 17
130
+ },
131
+ {
132
+ "epoch": 0.009365244536940686,
133
+ "grad_norm": 7.13895845413208,
134
+ "learning_rate": 8.5e-05,
135
+ "loss": 1.0382,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 0.009885535900104058,
140
+ "grad_norm": 8.155549049377441,
141
+ "learning_rate": 8.999999999999999e-05,
142
+ "loss": 1.0287,
143
+ "step": 19
144
+ },
145
+ {
146
+ "epoch": 0.01040582726326743,
147
+ "grad_norm": 6.322030544281006,
148
+ "learning_rate": 9.5e-05,
149
+ "loss": 0.8726,
150
+ "step": 20
151
+ },
152
+ {
153
+ "epoch": 0.010926118626430802,
154
+ "grad_norm": 6.219326019287109,
155
+ "learning_rate": 0.0001,
156
+ "loss": 0.8133,
157
+ "step": 21
158
+ },
159
+ {
160
+ "epoch": 0.011446409989594173,
161
+ "grad_norm": 3.4698593616485596,
162
+ "learning_rate": 0.000105,
163
+ "loss": 0.7479,
164
+ "step": 22
165
+ },
166
+ {
167
+ "epoch": 0.011966701352757543,
168
+ "grad_norm": 3.6907284259796143,
169
+ "learning_rate": 0.00011,
170
+ "loss": 0.8183,
171
+ "step": 23
172
+ },
173
+ {
174
+ "epoch": 0.012486992715920915,
175
+ "grad_norm": 5.981033802032471,
176
+ "learning_rate": 0.000115,
177
+ "loss": 0.587,
178
+ "step": 24
179
+ },
180
+ {
181
+ "epoch": 0.013007284079084287,
182
+ "grad_norm": 4.62821626663208,
183
+ "learning_rate": 0.00012,
184
+ "loss": 0.6687,
185
+ "step": 25
186
+ },
187
+ {
188
+ "epoch": 0.013527575442247659,
189
+ "grad_norm": 4.285324573516846,
190
+ "learning_rate": 0.000125,
191
+ "loss": 0.6252,
192
+ "step": 26
193
+ },
194
+ {
195
+ "epoch": 0.01404786680541103,
196
+ "grad_norm": 4.518625736236572,
197
+ "learning_rate": 0.00013000000000000002,
198
+ "loss": 0.5654,
199
+ "step": 27
200
+ },
201
+ {
202
+ "epoch": 0.014568158168574402,
203
+ "grad_norm": 3.4108848571777344,
204
+ "learning_rate": 0.000135,
205
+ "loss": 0.6086,
206
+ "step": 28
207
+ },
208
+ {
209
+ "epoch": 0.015088449531737774,
210
+ "grad_norm": 2.748203754425049,
211
+ "learning_rate": 0.00014000000000000001,
212
+ "loss": 0.552,
213
+ "step": 29
214
+ },
215
+ {
216
+ "epoch": 0.015608740894901144,
217
+ "grad_norm": 2.817368507385254,
218
+ "learning_rate": 0.000145,
219
+ "loss": 0.6438,
220
+ "step": 30
221
+ },
222
+ {
223
+ "epoch": 0.016129032258064516,
224
+ "grad_norm": 2.5259974002838135,
225
+ "learning_rate": 0.00015,
226
+ "loss": 0.7379,
227
+ "step": 31
228
+ },
229
+ {
230
+ "epoch": 0.01664932362122789,
231
+ "grad_norm": 2.2101669311523438,
232
+ "learning_rate": 0.000155,
233
+ "loss": 0.4164,
234
+ "step": 32
235
+ },
236
+ {
237
+ "epoch": 0.01716961498439126,
238
+ "grad_norm": 1.9261822700500488,
239
+ "learning_rate": 0.00016,
240
+ "loss": 0.2381,
241
+ "step": 33
242
+ },
243
+ {
244
+ "epoch": 0.01768990634755463,
245
+ "grad_norm": 3.6622889041900635,
246
+ "learning_rate": 0.000165,
247
+ "loss": 0.868,
248
+ "step": 34
249
+ },
250
+ {
251
+ "epoch": 0.018210197710718003,
252
+ "grad_norm": 3.7180657386779785,
253
+ "learning_rate": 0.00017,
254
+ "loss": 0.6459,
255
+ "step": 35
256
+ },
257
+ {
258
+ "epoch": 0.018730489073881373,
259
+ "grad_norm": 1.89342200756073,
260
+ "learning_rate": 0.000175,
261
+ "loss": 0.3684,
262
+ "step": 36
263
+ },
264
+ {
265
+ "epoch": 0.019250780437044746,
266
+ "grad_norm": 2.9859375953674316,
267
+ "learning_rate": 0.00017999999999999998,
268
+ "loss": 0.6406,
269
+ "step": 37
270
+ },
271
+ {
272
+ "epoch": 0.019771071800208116,
273
+ "grad_norm": 2.1704893112182617,
274
+ "learning_rate": 0.000185,
275
+ "loss": 0.399,
276
+ "step": 38
277
+ },
278
+ {
279
+ "epoch": 0.02029136316337149,
280
+ "grad_norm": 1.5741156339645386,
281
+ "learning_rate": 0.00019,
282
+ "loss": 0.2802,
283
+ "step": 39
284
+ },
285
+ {
286
+ "epoch": 0.02081165452653486,
287
+ "grad_norm": 1.5053398609161377,
288
+ "learning_rate": 0.00019500000000000002,
289
+ "loss": 0.2899,
290
+ "step": 40
291
+ },
292
+ {
293
+ "epoch": 0.02133194588969823,
294
+ "grad_norm": 2.4964590072631836,
295
+ "learning_rate": 0.0002,
296
+ "loss": 0.4765,
297
+ "step": 41
298
+ },
299
+ {
300
+ "epoch": 0.021852237252861603,
301
+ "grad_norm": 1.7406848669052124,
302
+ "learning_rate": 0.000205,
303
+ "loss": 0.3226,
304
+ "step": 42
305
+ },
306
+ {
307
+ "epoch": 0.022372528616024973,
308
+ "grad_norm": 4.920353412628174,
309
+ "learning_rate": 0.00021,
310
+ "loss": 0.8643,
311
+ "step": 43
312
+ },
313
+ {
314
+ "epoch": 0.022892819979188347,
315
+ "grad_norm": 5.375717639923096,
316
+ "learning_rate": 0.000215,
317
+ "loss": 0.654,
318
+ "step": 44
319
+ },
320
+ {
321
+ "epoch": 0.023413111342351717,
322
+ "grad_norm": 4.912171840667725,
323
+ "learning_rate": 0.00022,
324
+ "loss": 0.5138,
325
+ "step": 45
326
+ },
327
+ {
328
+ "epoch": 0.023933402705515087,
329
+ "grad_norm": 1.8745571374893188,
330
+ "learning_rate": 0.00022500000000000002,
331
+ "loss": 0.194,
332
+ "step": 46
333
+ },
334
+ {
335
+ "epoch": 0.02445369406867846,
336
+ "grad_norm": 3.949474811553955,
337
+ "learning_rate": 0.00023,
338
+ "loss": 0.642,
339
+ "step": 47
340
+ },
341
+ {
342
+ "epoch": 0.02497398543184183,
343
+ "grad_norm": 3.1853504180908203,
344
+ "learning_rate": 0.000235,
345
+ "loss": 0.5319,
346
+ "step": 48
347
+ },
348
+ {
349
+ "epoch": 0.025494276795005204,
350
+ "grad_norm": 1.6487188339233398,
351
+ "learning_rate": 0.00024,
352
+ "loss": 0.2386,
353
+ "step": 49
354
+ },
355
+ {
356
+ "epoch": 0.026014568158168574,
357
+ "grad_norm": 2.2893128395080566,
358
+ "learning_rate": 0.000245,
359
+ "loss": 0.3759,
360
+ "step": 50
361
+ },
362
+ {
363
+ "epoch": 0.026534859521331947,
364
+ "grad_norm": 1.7786861658096313,
365
+ "learning_rate": 0.00025,
366
+ "loss": 0.4172,
367
+ "step": 51
368
+ },
369
+ {
370
+ "epoch": 0.027055150884495317,
371
+ "grad_norm": 2.229330062866211,
372
+ "learning_rate": 0.000255,
373
+ "loss": 0.48,
374
+ "step": 52
375
+ },
376
+ {
377
+ "epoch": 0.027575442247658687,
378
+ "grad_norm": 3.2765936851501465,
379
+ "learning_rate": 0.00026000000000000003,
380
+ "loss": 0.5127,
381
+ "step": 53
382
+ },
383
+ {
384
+ "epoch": 0.02809573361082206,
385
+ "grad_norm": 2.407878875732422,
386
+ "learning_rate": 0.00026500000000000004,
387
+ "loss": 0.4979,
388
+ "step": 54
389
+ },
390
+ {
391
+ "epoch": 0.02861602497398543,
392
+ "grad_norm": 2.218383312225342,
393
+ "learning_rate": 0.00027,
394
+ "loss": 0.4228,
395
+ "step": 55
396
+ },
397
+ {
398
+ "epoch": 0.029136316337148804,
399
+ "grad_norm": 1.7399003505706787,
400
+ "learning_rate": 0.000275,
401
+ "loss": 0.3607,
402
+ "step": 56
403
+ },
404
+ {
405
+ "epoch": 0.029656607700312174,
406
+ "grad_norm": 1.4118911027908325,
407
+ "learning_rate": 0.00028000000000000003,
408
+ "loss": 0.2743,
409
+ "step": 57
410
+ },
411
+ {
412
+ "epoch": 0.030176899063475548,
413
+ "grad_norm": 2.2282633781433105,
414
+ "learning_rate": 0.000285,
415
+ "loss": 0.3152,
416
+ "step": 58
417
+ },
418
+ {
419
+ "epoch": 0.030697190426638918,
420
+ "grad_norm": 1.9690927267074585,
421
+ "learning_rate": 0.00029,
422
+ "loss": 0.236,
423
+ "step": 59
424
+ },
425
+ {
426
+ "epoch": 0.031217481789802288,
427
+ "grad_norm": 1.8251880407333374,
428
+ "learning_rate": 0.000295,
429
+ "loss": 0.2945,
430
+ "step": 60
431
+ },
432
+ {
433
+ "epoch": 0.03173777315296566,
434
+ "grad_norm": 2.371242046356201,
435
+ "learning_rate": 0.0003,
436
+ "loss": 0.3196,
437
+ "step": 61
438
+ },
439
+ {
440
+ "epoch": 0.03225806451612903,
441
+ "grad_norm": 2.302980899810791,
442
+ "learning_rate": 0.000305,
443
+ "loss": 0.2548,
444
+ "step": 62
445
+ },
446
+ {
447
+ "epoch": 0.032778355879292405,
448
+ "grad_norm": 1.5861401557922363,
449
+ "learning_rate": 0.00031,
450
+ "loss": 0.3465,
451
+ "step": 63
452
+ },
453
+ {
454
+ "epoch": 0.03329864724245578,
455
+ "grad_norm": 2.5026137828826904,
456
+ "learning_rate": 0.000315,
457
+ "loss": 0.3962,
458
+ "step": 64
459
+ },
460
+ {
461
+ "epoch": 0.033818938605619145,
462
+ "grad_norm": 2.0949132442474365,
463
+ "learning_rate": 0.00032,
464
+ "loss": 0.3963,
465
+ "step": 65
466
+ },
467
+ {
468
+ "epoch": 0.03433922996878252,
469
+ "grad_norm": 1.5639926195144653,
470
+ "learning_rate": 0.00032500000000000004,
471
+ "loss": 0.1823,
472
+ "step": 66
473
+ },
474
+ {
475
+ "epoch": 0.03485952133194589,
476
+ "grad_norm": 2.0358474254608154,
477
+ "learning_rate": 0.00033,
478
+ "loss": 0.2778,
479
+ "step": 67
480
+ },
481
+ {
482
+ "epoch": 0.03537981269510926,
483
+ "grad_norm": 1.1801868677139282,
484
+ "learning_rate": 0.000335,
485
+ "loss": 0.197,
486
+ "step": 68
487
+ },
488
+ {
489
+ "epoch": 0.03590010405827263,
490
+ "grad_norm": 1.996211290359497,
491
+ "learning_rate": 0.00034,
492
+ "loss": 0.3872,
493
+ "step": 69
494
+ },
495
+ {
496
+ "epoch": 0.036420395421436005,
497
+ "grad_norm": 1.555777668952942,
498
+ "learning_rate": 0.000345,
499
+ "loss": 0.2224,
500
+ "step": 70
501
+ },
502
+ {
503
+ "epoch": 0.03694068678459938,
504
+ "grad_norm": 1.497721791267395,
505
+ "learning_rate": 0.00035,
506
+ "loss": 0.2542,
507
+ "step": 71
508
+ },
509
+ {
510
+ "epoch": 0.037460978147762745,
511
+ "grad_norm": 1.0776859521865845,
512
+ "learning_rate": 0.000355,
513
+ "loss": 0.1237,
514
+ "step": 72
515
+ },
516
+ {
517
+ "epoch": 0.03798126951092612,
518
+ "grad_norm": 1.9728138446807861,
519
+ "learning_rate": 0.00035999999999999997,
520
+ "loss": 0.2316,
521
+ "step": 73
522
+ },
523
+ {
524
+ "epoch": 0.03850156087408949,
525
+ "grad_norm": 1.0327483415603638,
526
+ "learning_rate": 0.000365,
527
+ "loss": 0.1536,
528
+ "step": 74
529
+ },
530
+ {
531
+ "epoch": 0.03902185223725286,
532
+ "grad_norm": 3.0641462802886963,
533
+ "learning_rate": 0.00037,
534
+ "loss": 0.3374,
535
+ "step": 75
536
+ },
537
+ {
538
+ "epoch": 0.03954214360041623,
539
+ "grad_norm": 1.374601125717163,
540
+ "learning_rate": 0.000375,
541
+ "loss": 0.1633,
542
+ "step": 76
543
+ },
544
+ {
545
+ "epoch": 0.040062434963579606,
546
+ "grad_norm": 3.1142971515655518,
547
+ "learning_rate": 0.00038,
548
+ "loss": 0.3083,
549
+ "step": 77
550
+ },
551
+ {
552
+ "epoch": 0.04058272632674298,
553
+ "grad_norm": 1.792457103729248,
554
+ "learning_rate": 0.00038500000000000003,
555
+ "loss": 0.2413,
556
+ "step": 78
557
+ },
558
+ {
559
+ "epoch": 0.041103017689906346,
560
+ "grad_norm": 1.4155240058898926,
561
+ "learning_rate": 0.00039000000000000005,
562
+ "loss": 0.334,
563
+ "step": 79
564
+ },
565
+ {
566
+ "epoch": 0.04162330905306972,
567
+ "grad_norm": 2.6872141361236572,
568
+ "learning_rate": 0.000395,
569
+ "loss": 0.328,
570
+ "step": 80
571
+ },
572
+ {
573
+ "epoch": 0.04214360041623309,
574
+ "grad_norm": 1.301841378211975,
575
+ "learning_rate": 0.0004,
576
+ "loss": 0.3391,
577
+ "step": 81
578
+ },
579
+ {
580
+ "epoch": 0.04266389177939646,
581
+ "grad_norm": 1.5664141178131104,
582
+ "learning_rate": 0.00040500000000000003,
583
+ "loss": 0.2262,
584
+ "step": 82
585
+ },
586
+ {
587
+ "epoch": 0.04318418314255983,
588
+ "grad_norm": 1.6563435792922974,
589
+ "learning_rate": 0.00041,
590
+ "loss": 0.3451,
591
+ "step": 83
592
+ },
593
+ {
594
+ "epoch": 0.043704474505723206,
595
+ "grad_norm": 1.2458600997924805,
596
+ "learning_rate": 0.000415,
597
+ "loss": 0.186,
598
+ "step": 84
599
+ },
600
+ {
601
+ "epoch": 0.04422476586888657,
602
+ "grad_norm": 1.3530123233795166,
603
+ "learning_rate": 0.00042,
604
+ "loss": 0.2447,
605
+ "step": 85
606
+ },
607
+ {
608
+ "epoch": 0.044745057232049947,
609
+ "grad_norm": 1.341471791267395,
610
+ "learning_rate": 0.000425,
611
+ "loss": 0.293,
612
+ "step": 86
613
+ },
614
+ {
615
+ "epoch": 0.04526534859521332,
616
+ "grad_norm": 1.2903335094451904,
617
+ "learning_rate": 0.00043,
618
+ "loss": 0.1058,
619
+ "step": 87
620
+ },
621
+ {
622
+ "epoch": 0.045785639958376693,
623
+ "grad_norm": 1.2263115644454956,
624
+ "learning_rate": 0.000435,
625
+ "loss": 0.1733,
626
+ "step": 88
627
+ },
628
+ {
629
+ "epoch": 0.04630593132154006,
630
+ "grad_norm": 2.077279806137085,
631
+ "learning_rate": 0.00044,
632
+ "loss": 0.1786,
633
+ "step": 89
634
+ },
635
+ {
636
+ "epoch": 0.046826222684703434,
637
+ "grad_norm": 1.2153059244155884,
638
+ "learning_rate": 0.00044500000000000003,
639
+ "loss": 0.202,
640
+ "step": 90
641
+ },
642
+ {
643
+ "epoch": 0.04734651404786681,
644
+ "grad_norm": 1.4943453073501587,
645
+ "learning_rate": 0.00045000000000000004,
646
+ "loss": 0.2266,
647
+ "step": 91
648
+ },
649
+ {
650
+ "epoch": 0.047866805411030174,
651
+ "grad_norm": 1.2306129932403564,
652
+ "learning_rate": 0.000455,
653
+ "loss": 0.1646,
654
+ "step": 92
655
+ },
656
+ {
657
+ "epoch": 0.04838709677419355,
658
+ "grad_norm": 0.9076014757156372,
659
+ "learning_rate": 0.00046,
660
+ "loss": 0.1725,
661
+ "step": 93
662
+ },
663
+ {
664
+ "epoch": 0.04890738813735692,
665
+ "grad_norm": 1.7097628116607666,
666
+ "learning_rate": 0.000465,
667
+ "loss": 0.2515,
668
+ "step": 94
669
+ },
670
+ {
671
+ "epoch": 0.049427679500520294,
672
+ "grad_norm": 1.1039310693740845,
673
+ "learning_rate": 0.00047,
674
+ "loss": 0.173,
675
+ "step": 95
676
+ },
677
+ {
678
+ "epoch": 0.04994797086368366,
679
+ "grad_norm": 1.1415642499923706,
680
+ "learning_rate": 0.000475,
681
+ "loss": 0.1644,
682
+ "step": 96
683
+ },
684
+ {
685
+ "epoch": 0.050468262226847034,
686
+ "grad_norm": 1.2579185962677002,
687
+ "learning_rate": 0.00048,
688
+ "loss": 0.1811,
689
+ "step": 97
690
+ },
691
+ {
692
+ "epoch": 0.05098855359001041,
693
+ "grad_norm": 1.0912175178527832,
694
+ "learning_rate": 0.00048499999999999997,
695
+ "loss": 0.1661,
696
+ "step": 98
697
+ },
698
+ {
699
+ "epoch": 0.051508844953173774,
700
+ "grad_norm": 1.124626874923706,
701
+ "learning_rate": 0.00049,
702
+ "loss": 0.204,
703
+ "step": 99
704
+ },
705
+ {
706
+ "epoch": 0.05202913631633715,
707
+ "grad_norm": 0.776817262172699,
708
+ "learning_rate": 0.000495,
709
+ "loss": 0.1299,
710
+ "step": 100
711
+ },
712
+ {
713
+ "epoch": 0.05254942767950052,
714
+ "grad_norm": 1.7208999395370483,
715
+ "learning_rate": 0.0005,
716
+ "loss": 0.2042,
717
+ "step": 101
718
+ },
719
+ {
720
+ "epoch": 0.053069719042663895,
721
+ "grad_norm": 0.8992323875427246,
722
+ "learning_rate": 0.0004994444444444445,
723
+ "loss": 0.133,
724
+ "step": 102
725
+ },
726
+ {
727
+ "epoch": 0.05359001040582726,
728
+ "grad_norm": 1.1753864288330078,
729
+ "learning_rate": 0.0004988888888888889,
730
+ "loss": 0.1929,
731
+ "step": 103
732
+ },
733
+ {
734
+ "epoch": 0.054110301768990635,
735
+ "grad_norm": 0.8430110812187195,
736
+ "learning_rate": 0.0004983333333333334,
737
+ "loss": 0.1562,
738
+ "step": 104
739
+ },
740
+ {
741
+ "epoch": 0.05463059313215401,
742
+ "grad_norm": 0.7993983626365662,
743
+ "learning_rate": 0.0004977777777777778,
744
+ "loss": 0.1191,
745
+ "step": 105
746
+ },
747
+ {
748
+ "epoch": 0.055150884495317375,
749
+ "grad_norm": 0.7009360194206238,
750
+ "learning_rate": 0.0004972222222222222,
751
+ "loss": 0.0774,
752
+ "step": 106
753
+ },
754
+ {
755
+ "epoch": 0.05567117585848075,
756
+ "grad_norm": 1.1701024770736694,
757
+ "learning_rate": 0.0004966666666666666,
758
+ "loss": 0.1479,
759
+ "step": 107
760
+ },
761
+ {
762
+ "epoch": 0.05619146722164412,
763
+ "grad_norm": 1.1719233989715576,
764
+ "learning_rate": 0.0004961111111111111,
765
+ "loss": 0.1862,
766
+ "step": 108
767
+ },
768
+ {
769
+ "epoch": 0.056711758584807495,
770
+ "grad_norm": 0.7088543772697449,
771
+ "learning_rate": 0.0004955555555555556,
772
+ "loss": 0.089,
773
+ "step": 109
774
+ },
775
+ {
776
+ "epoch": 0.05723204994797086,
777
+ "grad_norm": 0.7498027086257935,
778
+ "learning_rate": 0.000495,
779
+ "loss": 0.1664,
780
+ "step": 110
781
+ },
782
+ {
783
+ "epoch": 0.057752341311134235,
784
+ "grad_norm": 1.3316175937652588,
785
+ "learning_rate": 0.0004944444444444445,
786
+ "loss": 0.1582,
787
+ "step": 111
788
+ },
789
+ {
790
+ "epoch": 0.05827263267429761,
791
+ "grad_norm": 1.1741178035736084,
792
+ "learning_rate": 0.0004938888888888889,
793
+ "loss": 0.1409,
794
+ "step": 112
795
+ },
796
+ {
797
+ "epoch": 0.058792924037460975,
798
+ "grad_norm": 0.8257745504379272,
799
+ "learning_rate": 0.0004933333333333334,
800
+ "loss": 0.1443,
801
+ "step": 113
802
+ },
803
+ {
804
+ "epoch": 0.05931321540062435,
805
+ "grad_norm": 0.7418781518936157,
806
+ "learning_rate": 0.0004927777777777777,
807
+ "loss": 0.0851,
808
+ "step": 114
809
+ },
810
+ {
811
+ "epoch": 0.05983350676378772,
812
+ "grad_norm": 0.7079729437828064,
813
+ "learning_rate": 0.0004922222222222222,
814
+ "loss": 0.1046,
815
+ "step": 115
816
+ },
817
+ {
818
+ "epoch": 0.060353798126951096,
819
+ "grad_norm": 0.8635478019714355,
820
+ "learning_rate": 0.0004916666666666666,
821
+ "loss": 0.1176,
822
+ "step": 116
823
+ },
824
+ {
825
+ "epoch": 0.06087408949011446,
826
+ "grad_norm": 0.9280768632888794,
827
+ "learning_rate": 0.0004911111111111111,
828
+ "loss": 0.1064,
829
+ "step": 117
830
+ },
831
+ {
832
+ "epoch": 0.061394380853277836,
833
+ "grad_norm": 1.0225319862365723,
834
+ "learning_rate": 0.0004905555555555556,
835
+ "loss": 0.1482,
836
+ "step": 118
837
+ },
838
+ {
839
+ "epoch": 0.06191467221644121,
840
+ "grad_norm": 0.6103273630142212,
841
+ "learning_rate": 0.00049,
842
+ "loss": 0.0657,
843
+ "step": 119
844
+ },
845
+ {
846
+ "epoch": 0.062434963579604576,
847
+ "grad_norm": 0.7268538475036621,
848
+ "learning_rate": 0.0004894444444444445,
849
+ "loss": 0.147,
850
+ "step": 120
851
+ },
852
+ {
853
+ "epoch": 0.06295525494276795,
854
+ "grad_norm": 0.7652425765991211,
855
+ "learning_rate": 0.0004888888888888889,
856
+ "loss": 0.1118,
857
+ "step": 121
858
+ },
859
+ {
860
+ "epoch": 0.06347554630593132,
861
+ "grad_norm": 0.7623610496520996,
862
+ "learning_rate": 0.0004883333333333333,
863
+ "loss": 0.1252,
864
+ "step": 122
865
+ },
866
+ {
867
+ "epoch": 0.0639958376690947,
868
+ "grad_norm": 0.9734853506088257,
869
+ "learning_rate": 0.0004877777777777778,
870
+ "loss": 0.1418,
871
+ "step": 123
872
+ },
873
+ {
874
+ "epoch": 0.06451612903225806,
875
+ "grad_norm": 0.8588402271270752,
876
+ "learning_rate": 0.0004872222222222222,
877
+ "loss": 0.0848,
878
+ "step": 124
879
+ },
880
+ {
881
+ "epoch": 0.06503642039542143,
882
+ "grad_norm": 0.5615188479423523,
883
+ "learning_rate": 0.0004866666666666667,
884
+ "loss": 0.1006,
885
+ "step": 125
886
+ },
887
+ {
888
+ "epoch": 0.06555671175858481,
889
+ "grad_norm": 0.9584555625915527,
890
+ "learning_rate": 0.0004861111111111111,
891
+ "loss": 0.1728,
892
+ "step": 126
893
+ },
894
+ {
895
+ "epoch": 0.06607700312174818,
896
+ "grad_norm": 0.6202451586723328,
897
+ "learning_rate": 0.0004855555555555556,
898
+ "loss": 0.0931,
899
+ "step": 127
900
+ },
901
+ {
902
+ "epoch": 0.06659729448491156,
903
+ "grad_norm": 0.6236227750778198,
904
+ "learning_rate": 0.00048499999999999997,
905
+ "loss": 0.0674,
906
+ "step": 128
907
+ },
908
+ {
909
+ "epoch": 0.06711758584807492,
910
+ "grad_norm": 0.66746985912323,
911
+ "learning_rate": 0.00048444444444444446,
912
+ "loss": 0.1218,
913
+ "step": 129
914
+ },
915
+ {
916
+ "epoch": 0.06763787721123829,
917
+ "grad_norm": 0.5942522883415222,
918
+ "learning_rate": 0.0004838888888888889,
919
+ "loss": 0.0748,
920
+ "step": 130
921
+ },
922
+ {
923
+ "epoch": 0.06815816857440167,
924
+ "grad_norm": 0.6593474745750427,
925
+ "learning_rate": 0.00048333333333333334,
926
+ "loss": 0.1078,
927
+ "step": 131
928
+ },
929
+ {
930
+ "epoch": 0.06867845993756504,
931
+ "grad_norm": 0.9823837876319885,
932
+ "learning_rate": 0.0004827777777777778,
933
+ "loss": 0.143,
934
+ "step": 132
935
+ },
936
+ {
937
+ "epoch": 0.0691987513007284,
938
+ "grad_norm": 0.6464436054229736,
939
+ "learning_rate": 0.0004822222222222222,
940
+ "loss": 0.1289,
941
+ "step": 133
942
+ },
943
+ {
944
+ "epoch": 0.06971904266389178,
945
+ "grad_norm": 0.8930130004882812,
946
+ "learning_rate": 0.0004816666666666667,
947
+ "loss": 0.1437,
948
+ "step": 134
949
+ },
950
+ {
951
+ "epoch": 0.07023933402705515,
952
+ "grad_norm": 0.6195514798164368,
953
+ "learning_rate": 0.0004811111111111111,
954
+ "loss": 0.0743,
955
+ "step": 135
956
+ },
957
+ {
958
+ "epoch": 0.07075962539021852,
959
+ "grad_norm": 0.5456336736679077,
960
+ "learning_rate": 0.0004805555555555556,
961
+ "loss": 0.1139,
962
+ "step": 136
963
+ },
964
+ {
965
+ "epoch": 0.0712799167533819,
966
+ "grad_norm": 0.5359215140342712,
967
+ "learning_rate": 0.00048,
968
+ "loss": 0.0969,
969
+ "step": 137
970
+ },
971
+ {
972
+ "epoch": 0.07180020811654526,
973
+ "grad_norm": 0.8201822638511658,
974
+ "learning_rate": 0.00047944444444444445,
975
+ "loss": 0.0993,
976
+ "step": 138
977
+ },
978
+ {
979
+ "epoch": 0.07232049947970863,
980
+ "grad_norm": 0.6110750436782837,
981
+ "learning_rate": 0.0004788888888888889,
982
+ "loss": 0.1011,
983
+ "step": 139
984
+ },
985
+ {
986
+ "epoch": 0.07284079084287201,
987
+ "grad_norm": 0.48351359367370605,
988
+ "learning_rate": 0.0004783333333333333,
989
+ "loss": 0.0869,
990
+ "step": 140
991
+ },
992
+ {
993
+ "epoch": 0.07336108220603538,
994
+ "grad_norm": 0.6683951020240784,
995
+ "learning_rate": 0.0004777777777777778,
996
+ "loss": 0.0814,
997
+ "step": 141
998
+ },
999
+ {
1000
+ "epoch": 0.07388137356919876,
1001
+ "grad_norm": 0.742268443107605,
1002
+ "learning_rate": 0.00047722222222222225,
1003
+ "loss": 0.0894,
1004
+ "step": 142
1005
+ },
1006
+ {
1007
+ "epoch": 0.07440166493236212,
1008
+ "grad_norm": 0.6042747497558594,
1009
+ "learning_rate": 0.0004766666666666667,
1010
+ "loss": 0.0794,
1011
+ "step": 143
1012
+ },
1013
+ {
1014
+ "epoch": 0.07492195629552549,
1015
+ "grad_norm": 0.6750574111938477,
1016
+ "learning_rate": 0.0004761111111111111,
1017
+ "loss": 0.0801,
1018
+ "step": 144
1019
+ },
1020
+ {
1021
+ "epoch": 0.07544224765868887,
1022
+ "grad_norm": 0.6264745593070984,
1023
+ "learning_rate": 0.00047555555555555556,
1024
+ "loss": 0.1127,
1025
+ "step": 145
1026
+ },
1027
+ {
1028
+ "epoch": 0.07596253902185224,
1029
+ "grad_norm": 0.7027119994163513,
1030
+ "learning_rate": 0.000475,
1031
+ "loss": 0.1043,
1032
+ "step": 146
1033
+ },
1034
+ {
1035
+ "epoch": 0.0764828303850156,
1036
+ "grad_norm": 0.5967740416526794,
1037
+ "learning_rate": 0.00047444444444444444,
1038
+ "loss": 0.1012,
1039
+ "step": 147
1040
+ },
1041
+ {
1042
+ "epoch": 0.07700312174817898,
1043
+ "grad_norm": 0.6070584058761597,
1044
+ "learning_rate": 0.00047388888888888893,
1045
+ "loss": 0.1279,
1046
+ "step": 148
1047
+ },
1048
+ {
1049
+ "epoch": 0.07752341311134235,
1050
+ "grad_norm": 0.5560263991355896,
1051
+ "learning_rate": 0.00047333333333333336,
1052
+ "loss": 0.0862,
1053
+ "step": 149
1054
+ },
1055
+ {
1056
+ "epoch": 0.07804370447450572,
1057
+ "grad_norm": 0.5680839419364929,
1058
+ "learning_rate": 0.0004727777777777778,
1059
+ "loss": 0.0542,
1060
+ "step": 150
1061
+ },
1062
+ {
1063
+ "epoch": 0.0785639958376691,
1064
+ "grad_norm": 0.7852948904037476,
1065
+ "learning_rate": 0.00047222222222222224,
1066
+ "loss": 0.1563,
1067
+ "step": 151
1068
+ },
1069
+ {
1070
+ "epoch": 0.07908428720083246,
1071
+ "grad_norm": 0.6082651019096375,
1072
+ "learning_rate": 0.0004716666666666667,
1073
+ "loss": 0.0897,
1074
+ "step": 152
1075
+ },
1076
+ {
1077
+ "epoch": 0.07960457856399583,
1078
+ "grad_norm": 0.43691495060920715,
1079
+ "learning_rate": 0.0004711111111111111,
1080
+ "loss": 0.0923,
1081
+ "step": 153
1082
+ },
1083
+ {
1084
+ "epoch": 0.08012486992715921,
1085
+ "grad_norm": 0.5423274040222168,
1086
+ "learning_rate": 0.00047055555555555555,
1087
+ "loss": 0.1048,
1088
+ "step": 154
1089
+ },
1090
+ {
1091
+ "epoch": 0.08064516129032258,
1092
+ "grad_norm": 0.5422453284263611,
1093
+ "learning_rate": 0.00047,
1094
+ "loss": 0.0818,
1095
+ "step": 155
1096
+ },
1097
+ {
1098
+ "epoch": 0.08116545265348596,
1099
+ "grad_norm": 0.3782746493816376,
1100
+ "learning_rate": 0.0004694444444444445,
1101
+ "loss": 0.0763,
1102
+ "step": 156
1103
+ },
1104
+ {
1105
+ "epoch": 0.08168574401664933,
1106
+ "grad_norm": 0.735381543636322,
1107
+ "learning_rate": 0.0004688888888888889,
1108
+ "loss": 0.1249,
1109
+ "step": 157
1110
+ },
1111
+ {
1112
+ "epoch": 0.08220603537981269,
1113
+ "grad_norm": 0.43137192726135254,
1114
+ "learning_rate": 0.00046833333333333335,
1115
+ "loss": 0.0509,
1116
+ "step": 158
1117
+ },
1118
+ {
1119
+ "epoch": 0.08272632674297607,
1120
+ "grad_norm": 0.49553734064102173,
1121
+ "learning_rate": 0.0004677777777777778,
1122
+ "loss": 0.059,
1123
+ "step": 159
1124
+ },
1125
+ {
1126
+ "epoch": 0.08324661810613944,
1127
+ "grad_norm": 0.8710311651229858,
1128
+ "learning_rate": 0.0004672222222222222,
1129
+ "loss": 0.1079,
1130
+ "step": 160
1131
+ },
1132
+ {
1133
+ "epoch": 0.0837669094693028,
1134
+ "grad_norm": 0.3895374536514282,
1135
+ "learning_rate": 0.00046666666666666666,
1136
+ "loss": 0.0761,
1137
+ "step": 161
1138
+ },
1139
+ {
1140
+ "epoch": 0.08428720083246619,
1141
+ "grad_norm": 0.6220502257347107,
1142
+ "learning_rate": 0.0004661111111111111,
1143
+ "loss": 0.1077,
1144
+ "step": 162
1145
+ },
1146
+ {
1147
+ "epoch": 0.08480749219562955,
1148
+ "grad_norm": 0.43123388290405273,
1149
+ "learning_rate": 0.0004655555555555556,
1150
+ "loss": 0.0809,
1151
+ "step": 163
1152
+ },
1153
+ {
1154
+ "epoch": 0.08532778355879292,
1155
+ "grad_norm": 0.5482419729232788,
1156
+ "learning_rate": 0.000465,
1157
+ "loss": 0.0887,
1158
+ "step": 164
1159
+ },
1160
+ {
1161
+ "epoch": 0.0858480749219563,
1162
+ "grad_norm": 0.3709481358528137,
1163
+ "learning_rate": 0.00046444444444444446,
1164
+ "loss": 0.0492,
1165
+ "step": 165
1166
+ },
1167
+ {
1168
+ "epoch": 0.08636836628511967,
1169
+ "grad_norm": 0.3871099650859833,
1170
+ "learning_rate": 0.0004638888888888889,
1171
+ "loss": 0.0525,
1172
+ "step": 166
1173
+ },
1174
+ {
1175
+ "epoch": 0.08688865764828303,
1176
+ "grad_norm": 0.49930575489997864,
1177
+ "learning_rate": 0.00046333333333333334,
1178
+ "loss": 0.0456,
1179
+ "step": 167
1180
+ },
1181
+ {
1182
+ "epoch": 0.08740894901144641,
1183
+ "grad_norm": 0.35331490635871887,
1184
+ "learning_rate": 0.0004627777777777778,
1185
+ "loss": 0.0556,
1186
+ "step": 168
1187
+ },
1188
+ {
1189
+ "epoch": 0.08792924037460978,
1190
+ "grad_norm": 0.3593418300151825,
1191
+ "learning_rate": 0.0004622222222222222,
1192
+ "loss": 0.0664,
1193
+ "step": 169
1194
+ },
1195
+ {
1196
+ "epoch": 0.08844953173777315,
1197
+ "grad_norm": 0.38897961378097534,
1198
+ "learning_rate": 0.0004616666666666667,
1199
+ "loss": 0.0849,
1200
+ "step": 170
1201
+ },
1202
+ {
1203
+ "epoch": 0.08896982310093653,
1204
+ "grad_norm": 0.4496786296367645,
1205
+ "learning_rate": 0.00046111111111111114,
1206
+ "loss": 0.0768,
1207
+ "step": 171
1208
+ },
1209
+ {
1210
+ "epoch": 0.08949011446409989,
1211
+ "grad_norm": 0.43698763847351074,
1212
+ "learning_rate": 0.0004605555555555556,
1213
+ "loss": 0.062,
1214
+ "step": 172
1215
+ },
1216
+ {
1217
+ "epoch": 0.09001040582726327,
1218
+ "grad_norm": 0.3045942485332489,
1219
+ "learning_rate": 0.00046,
1220
+ "loss": 0.0466,
1221
+ "step": 173
1222
+ },
1223
+ {
1224
+ "epoch": 0.09053069719042664,
1225
+ "grad_norm": 0.3364112079143524,
1226
+ "learning_rate": 0.00045944444444444445,
1227
+ "loss": 0.051,
1228
+ "step": 174
1229
+ },
1230
+ {
1231
+ "epoch": 0.09105098855359,
1232
+ "grad_norm": 0.7610157132148743,
1233
+ "learning_rate": 0.0004588888888888889,
1234
+ "loss": 0.0752,
1235
+ "step": 175
1236
+ },
1237
+ {
1238
+ "epoch": 0.09157127991675339,
1239
+ "grad_norm": 0.4646570682525635,
1240
+ "learning_rate": 0.0004583333333333333,
1241
+ "loss": 0.1027,
1242
+ "step": 176
1243
+ },
1244
+ {
1245
+ "epoch": 0.09209157127991675,
1246
+ "grad_norm": 0.8062249422073364,
1247
+ "learning_rate": 0.0004577777777777778,
1248
+ "loss": 0.1622,
1249
+ "step": 177
1250
+ },
1251
+ {
1252
+ "epoch": 0.09261186264308012,
1253
+ "grad_norm": 0.3921089470386505,
1254
+ "learning_rate": 0.0004572222222222222,
1255
+ "loss": 0.042,
1256
+ "step": 178
1257
+ },
1258
+ {
1259
+ "epoch": 0.0931321540062435,
1260
+ "grad_norm": 0.3350071310997009,
1261
+ "learning_rate": 0.0004566666666666667,
1262
+ "loss": 0.0461,
1263
+ "step": 179
1264
+ },
1265
+ {
1266
+ "epoch": 0.09365244536940687,
1267
+ "grad_norm": 0.272399365901947,
1268
+ "learning_rate": 0.0004561111111111111,
1269
+ "loss": 0.0443,
1270
+ "step": 180
1271
+ },
1272
+ {
1273
+ "epoch": 0.09417273673257023,
1274
+ "grad_norm": 0.33471840620040894,
1275
+ "learning_rate": 0.00045555555555555556,
1276
+ "loss": 0.0504,
1277
+ "step": 181
1278
+ },
1279
+ {
1280
+ "epoch": 0.09469302809573361,
1281
+ "grad_norm": 0.3427852392196655,
1282
+ "learning_rate": 0.000455,
1283
+ "loss": 0.0475,
1284
+ "step": 182
1285
+ },
1286
+ {
1287
+ "epoch": 0.09521331945889698,
1288
+ "grad_norm": 0.40719184279441833,
1289
+ "learning_rate": 0.00045444444444444444,
1290
+ "loss": 0.0595,
1291
+ "step": 183
1292
+ },
1293
+ {
1294
+ "epoch": 0.09573361082206035,
1295
+ "grad_norm": 0.35792386531829834,
1296
+ "learning_rate": 0.00045388888888888893,
1297
+ "loss": 0.0593,
1298
+ "step": 184
1299
+ },
1300
+ {
1301
+ "epoch": 0.09625390218522373,
1302
+ "grad_norm": 0.47860586643218994,
1303
+ "learning_rate": 0.0004533333333333333,
1304
+ "loss": 0.0787,
1305
+ "step": 185
1306
+ },
1307
+ {
1308
+ "epoch": 0.0967741935483871,
1309
+ "grad_norm": 0.5289556980133057,
1310
+ "learning_rate": 0.0004527777777777778,
1311
+ "loss": 0.0756,
1312
+ "step": 186
1313
+ },
1314
+ {
1315
+ "epoch": 0.09729448491155047,
1316
+ "grad_norm": 0.4445546567440033,
1317
+ "learning_rate": 0.00045222222222222224,
1318
+ "loss": 0.0611,
1319
+ "step": 187
1320
+ },
1321
+ {
1322
+ "epoch": 0.09781477627471384,
1323
+ "grad_norm": 0.4470248222351074,
1324
+ "learning_rate": 0.0004516666666666667,
1325
+ "loss": 0.0784,
1326
+ "step": 188
1327
+ },
1328
+ {
1329
+ "epoch": 0.09833506763787721,
1330
+ "grad_norm": 0.4186774790287018,
1331
+ "learning_rate": 0.0004511111111111111,
1332
+ "loss": 0.0421,
1333
+ "step": 189
1334
+ },
1335
+ {
1336
+ "epoch": 0.09885535900104059,
1337
+ "grad_norm": 0.28850093483924866,
1338
+ "learning_rate": 0.00045055555555555555,
1339
+ "loss": 0.0414,
1340
+ "step": 190
1341
+ },
1342
+ {
1343
+ "epoch": 0.09937565036420395,
1344
+ "grad_norm": 0.3566621243953705,
1345
+ "learning_rate": 0.00045000000000000004,
1346
+ "loss": 0.0511,
1347
+ "step": 191
1348
+ },
1349
+ {
1350
+ "epoch": 0.09989594172736732,
1351
+ "grad_norm": 0.4454294741153717,
1352
+ "learning_rate": 0.0004494444444444444,
1353
+ "loss": 0.0995,
1354
+ "step": 192
1355
+ },
1356
+ {
1357
+ "epoch": 0.1004162330905307,
1358
+ "grad_norm": 0.45749202370643616,
1359
+ "learning_rate": 0.0004488888888888889,
1360
+ "loss": 0.0946,
1361
+ "step": 193
1362
+ },
1363
+ {
1364
+ "epoch": 0.10093652445369407,
1365
+ "grad_norm": 0.2874762713909149,
1366
+ "learning_rate": 0.0004483333333333333,
1367
+ "loss": 0.0546,
1368
+ "step": 194
1369
+ },
1370
+ {
1371
+ "epoch": 0.10145681581685743,
1372
+ "grad_norm": 0.26859250664711,
1373
+ "learning_rate": 0.0004477777777777778,
1374
+ "loss": 0.0329,
1375
+ "step": 195
1376
+ },
1377
+ {
1378
+ "epoch": 0.10197710718002082,
1379
+ "grad_norm": 0.3758945167064667,
1380
+ "learning_rate": 0.0004472222222222222,
1381
+ "loss": 0.0721,
1382
+ "step": 196
1383
+ },
1384
+ {
1385
+ "epoch": 0.10249739854318418,
1386
+ "grad_norm": 0.3250490725040436,
1387
+ "learning_rate": 0.00044666666666666666,
1388
+ "loss": 0.0454,
1389
+ "step": 197
1390
+ },
1391
+ {
1392
+ "epoch": 0.10301768990634755,
1393
+ "grad_norm": 0.43297529220581055,
1394
+ "learning_rate": 0.00044611111111111115,
1395
+ "loss": 0.0705,
1396
+ "step": 198
1397
+ },
1398
+ {
1399
+ "epoch": 0.10353798126951093,
1400
+ "grad_norm": 0.2871391773223877,
1401
+ "learning_rate": 0.00044555555555555554,
1402
+ "loss": 0.0389,
1403
+ "step": 199
1404
+ },
1405
+ {
1406
+ "epoch": 0.1040582726326743,
1407
+ "grad_norm": 0.3059026896953583,
1408
+ "learning_rate": 0.00044500000000000003,
1409
+ "loss": 0.0532,
1410
+ "step": 200
1411
+ },
1412
+ {
1413
+ "epoch": 0.10457856399583768,
1414
+ "grad_norm": 0.37971311807632446,
1415
+ "learning_rate": 0.0004444444444444444,
1416
+ "loss": 0.0805,
1417
+ "step": 201
1418
+ },
1419
+ {
1420
+ "epoch": 0.10509885535900104,
1421
+ "grad_norm": 0.3776862621307373,
1422
+ "learning_rate": 0.0004438888888888889,
1423
+ "loss": 0.0936,
1424
+ "step": 202
1425
+ },
1426
+ {
1427
+ "epoch": 0.10561914672216441,
1428
+ "grad_norm": 0.33026885986328125,
1429
+ "learning_rate": 0.00044333333333333334,
1430
+ "loss": 0.0453,
1431
+ "step": 203
1432
+ },
1433
+ {
1434
+ "epoch": 0.10613943808532779,
1435
+ "grad_norm": 0.36573582887649536,
1436
+ "learning_rate": 0.0004427777777777778,
1437
+ "loss": 0.0375,
1438
+ "step": 204
1439
+ },
1440
+ {
1441
+ "epoch": 0.10665972944849116,
1442
+ "grad_norm": 0.5324421525001526,
1443
+ "learning_rate": 0.00044222222222222227,
1444
+ "loss": 0.0572,
1445
+ "step": 205
1446
+ },
1447
+ {
1448
+ "epoch": 0.10718002081165452,
1449
+ "grad_norm": 0.2825300395488739,
1450
+ "learning_rate": 0.00044166666666666665,
1451
+ "loss": 0.043,
1452
+ "step": 206
1453
+ },
1454
+ {
1455
+ "epoch": 0.1077003121748179,
1456
+ "grad_norm": 0.5899777412414551,
1457
+ "learning_rate": 0.00044111111111111114,
1458
+ "loss": 0.0601,
1459
+ "step": 207
1460
+ },
1461
+ {
1462
+ "epoch": 0.10822060353798127,
1463
+ "grad_norm": 0.4580536186695099,
1464
+ "learning_rate": 0.0004405555555555555,
1465
+ "loss": 0.0962,
1466
+ "step": 208
1467
+ },
1468
+ {
1469
+ "epoch": 0.10874089490114464,
1470
+ "grad_norm": 0.28349611163139343,
1471
+ "learning_rate": 0.00044,
1472
+ "loss": 0.0499,
1473
+ "step": 209
1474
+ },
1475
+ {
1476
+ "epoch": 0.10926118626430802,
1477
+ "grad_norm": 0.35658761858940125,
1478
+ "learning_rate": 0.0004394444444444445,
1479
+ "loss": 0.063,
1480
+ "step": 210
1481
+ },
1482
+ {
1483
+ "epoch": 0.10978147762747138,
1484
+ "grad_norm": 0.28881627321243286,
1485
+ "learning_rate": 0.0004388888888888889,
1486
+ "loss": 0.0468,
1487
+ "step": 211
1488
+ },
1489
+ {
1490
+ "epoch": 0.11030176899063475,
1491
+ "grad_norm": 0.3207852840423584,
1492
+ "learning_rate": 0.0004383333333333334,
1493
+ "loss": 0.0437,
1494
+ "step": 212
1495
+ },
1496
+ {
1497
+ "epoch": 0.11082206035379813,
1498
+ "grad_norm": 0.3225831985473633,
1499
+ "learning_rate": 0.00043777777777777776,
1500
+ "loss": 0.045,
1501
+ "step": 213
1502
+ },
1503
+ {
1504
+ "epoch": 0.1113423517169615,
1505
+ "grad_norm": 0.3248918056488037,
1506
+ "learning_rate": 0.00043722222222222225,
1507
+ "loss": 0.0725,
1508
+ "step": 214
1509
+ },
1510
+ {
1511
+ "epoch": 0.11186264308012488,
1512
+ "grad_norm": 0.45690080523490906,
1513
+ "learning_rate": 0.00043666666666666664,
1514
+ "loss": 0.0485,
1515
+ "step": 215
1516
+ },
1517
+ {
1518
+ "epoch": 0.11238293444328824,
1519
+ "grad_norm": 0.41606688499450684,
1520
+ "learning_rate": 0.00043611111111111113,
1521
+ "loss": 0.0902,
1522
+ "step": 216
1523
+ },
1524
+ {
1525
+ "epoch": 0.11290322580645161,
1526
+ "grad_norm": 0.2519379258155823,
1527
+ "learning_rate": 0.0004355555555555555,
1528
+ "loss": 0.0347,
1529
+ "step": 217
1530
+ },
1531
+ {
1532
+ "epoch": 0.11342351716961499,
1533
+ "grad_norm": 0.2908113896846771,
1534
+ "learning_rate": 0.000435,
1535
+ "loss": 0.0557,
1536
+ "step": 218
1537
+ },
1538
+ {
1539
+ "epoch": 0.11394380853277836,
1540
+ "grad_norm": 0.3034886121749878,
1541
+ "learning_rate": 0.0004344444444444445,
1542
+ "loss": 0.0774,
1543
+ "step": 219
1544
+ },
1545
+ {
1546
+ "epoch": 0.11446409989594172,
1547
+ "grad_norm": 0.3579472005367279,
1548
+ "learning_rate": 0.0004338888888888889,
1549
+ "loss": 0.0717,
1550
+ "step": 220
1551
+ },
1552
+ {
1553
+ "epoch": 0.1149843912591051,
1554
+ "grad_norm": 0.33985862135887146,
1555
+ "learning_rate": 0.00043333333333333337,
1556
+ "loss": 0.0611,
1557
+ "step": 221
1558
+ },
1559
+ {
1560
+ "epoch": 0.11550468262226847,
1561
+ "grad_norm": 0.42294999957084656,
1562
+ "learning_rate": 0.00043277777777777775,
1563
+ "loss": 0.0824,
1564
+ "step": 222
1565
+ },
1566
+ {
1567
+ "epoch": 0.11602497398543184,
1568
+ "grad_norm": 0.33317992091178894,
1569
+ "learning_rate": 0.00043222222222222224,
1570
+ "loss": 0.0631,
1571
+ "step": 223
1572
+ },
1573
+ {
1574
+ "epoch": 0.11654526534859522,
1575
+ "grad_norm": 0.347391813993454,
1576
+ "learning_rate": 0.0004316666666666667,
1577
+ "loss": 0.0726,
1578
+ "step": 224
1579
+ },
1580
+ {
1581
+ "epoch": 0.11706555671175858,
1582
+ "grad_norm": 0.4332979917526245,
1583
+ "learning_rate": 0.0004311111111111111,
1584
+ "loss": 0.0493,
1585
+ "step": 225
1586
+ },
1587
+ {
1588
+ "epoch": 0.11758584807492195,
1589
+ "grad_norm": 0.2794676721096039,
1590
+ "learning_rate": 0.0004305555555555556,
1591
+ "loss": 0.0428,
1592
+ "step": 226
1593
+ },
1594
+ {
1595
+ "epoch": 0.11810613943808533,
1596
+ "grad_norm": 0.2665698826313019,
1597
+ "learning_rate": 0.00043,
1598
+ "loss": 0.0628,
1599
+ "step": 227
1600
+ },
1601
+ {
1602
+ "epoch": 0.1186264308012487,
1603
+ "grad_norm": 0.47581610083580017,
1604
+ "learning_rate": 0.0004294444444444445,
1605
+ "loss": 0.0412,
1606
+ "step": 228
1607
+ },
1608
+ {
1609
+ "epoch": 0.11914672216441206,
1610
+ "grad_norm": 0.356357216835022,
1611
+ "learning_rate": 0.00042888888888888886,
1612
+ "loss": 0.0309,
1613
+ "step": 229
1614
+ },
1615
+ {
1616
+ "epoch": 0.11966701352757544,
1617
+ "grad_norm": 0.2871776819229126,
1618
+ "learning_rate": 0.00042833333333333335,
1619
+ "loss": 0.0702,
1620
+ "step": 230
1621
+ },
1622
+ {
1623
+ "epoch": 0.12018730489073881,
1624
+ "grad_norm": 0.18419012427330017,
1625
+ "learning_rate": 0.0004277777777777778,
1626
+ "loss": 0.0279,
1627
+ "step": 231
1628
+ },
1629
+ {
1630
+ "epoch": 0.12070759625390219,
1631
+ "grad_norm": 0.33823081851005554,
1632
+ "learning_rate": 0.00042722222222222223,
1633
+ "loss": 0.0315,
1634
+ "step": 232
1635
+ },
1636
+ {
1637
+ "epoch": 0.12122788761706556,
1638
+ "grad_norm": 0.33040091395378113,
1639
+ "learning_rate": 0.0004266666666666667,
1640
+ "loss": 0.0699,
1641
+ "step": 233
1642
+ },
1643
+ {
1644
+ "epoch": 0.12174817898022892,
1645
+ "grad_norm": 0.3405701518058777,
1646
+ "learning_rate": 0.0004261111111111111,
1647
+ "loss": 0.0649,
1648
+ "step": 234
1649
+ },
1650
+ {
1651
+ "epoch": 0.1222684703433923,
1652
+ "grad_norm": 0.49750658869743347,
1653
+ "learning_rate": 0.0004255555555555556,
1654
+ "loss": 0.0546,
1655
+ "step": 235
1656
+ },
1657
+ {
1658
+ "epoch": 0.12278876170655567,
1659
+ "grad_norm": 0.4337189495563507,
1660
+ "learning_rate": 0.000425,
1661
+ "loss": 0.04,
1662
+ "step": 236
1663
+ },
1664
+ {
1665
+ "epoch": 0.12330905306971904,
1666
+ "grad_norm": 0.4933618903160095,
1667
+ "learning_rate": 0.00042444444444444447,
1668
+ "loss": 0.0754,
1669
+ "step": 237
1670
+ },
1671
+ {
1672
+ "epoch": 0.12382934443288242,
1673
+ "grad_norm": 0.2554134130477905,
1674
+ "learning_rate": 0.0004238888888888889,
1675
+ "loss": 0.0466,
1676
+ "step": 238
1677
+ },
1678
+ {
1679
+ "epoch": 0.12434963579604578,
1680
+ "grad_norm": 0.34308987855911255,
1681
+ "learning_rate": 0.00042333333333333334,
1682
+ "loss": 0.0621,
1683
+ "step": 239
1684
+ },
1685
+ {
1686
+ "epoch": 0.12486992715920915,
1687
+ "grad_norm": 0.3505181670188904,
1688
+ "learning_rate": 0.0004227777777777778,
1689
+ "loss": 0.0646,
1690
+ "step": 240
1691
+ },
1692
+ {
1693
+ "epoch": 0.12539021852237253,
1694
+ "grad_norm": 0.36588212847709656,
1695
+ "learning_rate": 0.0004222222222222222,
1696
+ "loss": 0.0528,
1697
+ "step": 241
1698
+ },
1699
+ {
1700
+ "epoch": 0.1259105098855359,
1701
+ "grad_norm": 0.4307107627391815,
1702
+ "learning_rate": 0.0004216666666666667,
1703
+ "loss": 0.0573,
1704
+ "step": 242
1705
+ },
1706
+ {
1707
+ "epoch": 0.12643080124869926,
1708
+ "grad_norm": 0.37300869822502136,
1709
+ "learning_rate": 0.0004211111111111111,
1710
+ "loss": 0.0486,
1711
+ "step": 243
1712
+ },
1713
+ {
1714
+ "epoch": 0.12695109261186263,
1715
+ "grad_norm": 0.30476123094558716,
1716
+ "learning_rate": 0.0004205555555555556,
1717
+ "loss": 0.0508,
1718
+ "step": 244
1719
+ },
1720
+ {
1721
+ "epoch": 0.12747138397502603,
1722
+ "grad_norm": 0.3410852551460266,
1723
+ "learning_rate": 0.00042,
1724
+ "loss": 0.0678,
1725
+ "step": 245
1726
+ },
1727
+ {
1728
+ "epoch": 0.1279916753381894,
1729
+ "grad_norm": 0.2417326420545578,
1730
+ "learning_rate": 0.00041944444444444445,
1731
+ "loss": 0.0359,
1732
+ "step": 246
1733
+ },
1734
+ {
1735
+ "epoch": 0.12851196670135276,
1736
+ "grad_norm": 0.27660486102104187,
1737
+ "learning_rate": 0.0004188888888888889,
1738
+ "loss": 0.0445,
1739
+ "step": 247
1740
+ },
1741
+ {
1742
+ "epoch": 0.12903225806451613,
1743
+ "grad_norm": 0.2319687157869339,
1744
+ "learning_rate": 0.00041833333333333333,
1745
+ "loss": 0.0486,
1746
+ "step": 248
1747
+ },
1748
+ {
1749
+ "epoch": 0.1295525494276795,
1750
+ "grad_norm": 0.3000936210155487,
1751
+ "learning_rate": 0.0004177777777777778,
1752
+ "loss": 0.05,
1753
+ "step": 249
1754
+ },
1755
+ {
1756
+ "epoch": 0.13007284079084286,
1757
+ "grad_norm": 0.31069836020469666,
1758
+ "learning_rate": 0.0004172222222222222,
1759
+ "loss": 0.0412,
1760
+ "step": 250
1761
+ },
1762
+ {
1763
+ "epoch": 0.13059313215400625,
1764
+ "grad_norm": 0.24123403429985046,
1765
+ "learning_rate": 0.0004166666666666667,
1766
+ "loss": 0.0573,
1767
+ "step": 251
1768
+ },
1769
+ {
1770
+ "epoch": 0.13111342351716962,
1771
+ "grad_norm": 0.2845012843608856,
1772
+ "learning_rate": 0.00041611111111111113,
1773
+ "loss": 0.0322,
1774
+ "step": 252
1775
+ },
1776
+ {
1777
+ "epoch": 0.13163371488033299,
1778
+ "grad_norm": 0.3060798943042755,
1779
+ "learning_rate": 0.00041555555555555557,
1780
+ "loss": 0.0328,
1781
+ "step": 253
1782
+ },
1783
+ {
1784
+ "epoch": 0.13215400624349635,
1785
+ "grad_norm": 0.3751870393753052,
1786
+ "learning_rate": 0.000415,
1787
+ "loss": 0.0481,
1788
+ "step": 254
1789
+ },
1790
+ {
1791
+ "epoch": 0.13267429760665972,
1792
+ "grad_norm": 0.27469107508659363,
1793
+ "learning_rate": 0.00041444444444444444,
1794
+ "loss": 0.0415,
1795
+ "step": 255
1796
+ },
1797
+ {
1798
+ "epoch": 0.1331945889698231,
1799
+ "grad_norm": 0.32122480869293213,
1800
+ "learning_rate": 0.0004138888888888889,
1801
+ "loss": 0.0641,
1802
+ "step": 256
1803
+ },
1804
+ {
1805
+ "epoch": 0.13371488033298648,
1806
+ "grad_norm": 0.34307950735092163,
1807
+ "learning_rate": 0.0004133333333333333,
1808
+ "loss": 0.0623,
1809
+ "step": 257
1810
+ },
1811
+ {
1812
+ "epoch": 0.13423517169614985,
1813
+ "grad_norm": 0.25482696294784546,
1814
+ "learning_rate": 0.0004127777777777778,
1815
+ "loss": 0.044,
1816
+ "step": 258
1817
+ },
1818
+ {
1819
+ "epoch": 0.1347554630593132,
1820
+ "grad_norm": 0.4288344383239746,
1821
+ "learning_rate": 0.00041222222222222224,
1822
+ "loss": 0.0757,
1823
+ "step": 259
1824
+ },
1825
+ {
1826
+ "epoch": 0.13527575442247658,
1827
+ "grad_norm": 0.24957087635993958,
1828
+ "learning_rate": 0.0004116666666666667,
1829
+ "loss": 0.0328,
1830
+ "step": 260
1831
+ },
1832
+ {
1833
+ "epoch": 0.13579604578563995,
1834
+ "grad_norm": 0.14633908867835999,
1835
+ "learning_rate": 0.0004111111111111111,
1836
+ "loss": 0.0279,
1837
+ "step": 261
1838
+ },
1839
+ {
1840
+ "epoch": 0.13631633714880334,
1841
+ "grad_norm": 0.2976965606212616,
1842
+ "learning_rate": 0.00041055555555555555,
1843
+ "loss": 0.0452,
1844
+ "step": 262
1845
+ },
1846
+ {
1847
+ "epoch": 0.1368366285119667,
1848
+ "grad_norm": 0.3640998601913452,
1849
+ "learning_rate": 0.00041,
1850
+ "loss": 0.0493,
1851
+ "step": 263
1852
+ },
1853
+ {
1854
+ "epoch": 0.13735691987513007,
1855
+ "grad_norm": 0.5083469748497009,
1856
+ "learning_rate": 0.00040944444444444443,
1857
+ "loss": 0.0788,
1858
+ "step": 264
1859
+ },
1860
+ {
1861
+ "epoch": 0.13787721123829344,
1862
+ "grad_norm": 0.24888603389263153,
1863
+ "learning_rate": 0.0004088888888888889,
1864
+ "loss": 0.0641,
1865
+ "step": 265
1866
+ },
1867
+ {
1868
+ "epoch": 0.1383975026014568,
1869
+ "grad_norm": 0.2294796109199524,
1870
+ "learning_rate": 0.00040833333333333336,
1871
+ "loss": 0.0532,
1872
+ "step": 266
1873
+ },
1874
+ {
1875
+ "epoch": 0.1389177939646202,
1876
+ "grad_norm": 0.2386179268360138,
1877
+ "learning_rate": 0.0004077777777777778,
1878
+ "loss": 0.0292,
1879
+ "step": 267
1880
+ },
1881
+ {
1882
+ "epoch": 0.13943808532778357,
1883
+ "grad_norm": 0.23145556449890137,
1884
+ "learning_rate": 0.00040722222222222223,
1885
+ "loss": 0.0499,
1886
+ "step": 268
1887
+ },
1888
+ {
1889
+ "epoch": 0.13995837669094693,
1890
+ "grad_norm": 0.23750337958335876,
1891
+ "learning_rate": 0.00040666666666666667,
1892
+ "loss": 0.0393,
1893
+ "step": 269
1894
+ },
1895
+ {
1896
+ "epoch": 0.1404786680541103,
1897
+ "grad_norm": 0.2392527312040329,
1898
+ "learning_rate": 0.0004061111111111111,
1899
+ "loss": 0.0486,
1900
+ "step": 270
1901
+ },
1902
+ {
1903
+ "epoch": 0.14099895941727367,
1904
+ "grad_norm": 0.26626136898994446,
1905
+ "learning_rate": 0.00040555555555555554,
1906
+ "loss": 0.0385,
1907
+ "step": 271
1908
+ },
1909
+ {
1910
+ "epoch": 0.14151925078043703,
1911
+ "grad_norm": 0.2984270751476288,
1912
+ "learning_rate": 0.00040500000000000003,
1913
+ "loss": 0.049,
1914
+ "step": 272
1915
+ },
1916
+ {
1917
+ "epoch": 0.14203954214360043,
1918
+ "grad_norm": 0.2629552483558655,
1919
+ "learning_rate": 0.00040444444444444447,
1920
+ "loss": 0.0639,
1921
+ "step": 273
1922
+ },
1923
+ {
1924
+ "epoch": 0.1425598335067638,
1925
+ "grad_norm": 0.2580653429031372,
1926
+ "learning_rate": 0.0004038888888888889,
1927
+ "loss": 0.0397,
1928
+ "step": 274
1929
+ },
1930
+ {
1931
+ "epoch": 0.14308012486992716,
1932
+ "grad_norm": 0.2550220787525177,
1933
+ "learning_rate": 0.00040333333333333334,
1934
+ "loss": 0.058,
1935
+ "step": 275
1936
+ },
1937
+ {
1938
+ "epoch": 0.14360041623309053,
1939
+ "grad_norm": 0.15206913650035858,
1940
+ "learning_rate": 0.0004027777777777778,
1941
+ "loss": 0.0293,
1942
+ "step": 276
1943
+ },
1944
+ {
1945
+ "epoch": 0.1441207075962539,
1946
+ "grad_norm": 0.26907533407211304,
1947
+ "learning_rate": 0.0004022222222222222,
1948
+ "loss": 0.0474,
1949
+ "step": 277
1950
+ },
1951
+ {
1952
+ "epoch": 0.14464099895941726,
1953
+ "grad_norm": 0.32724031805992126,
1954
+ "learning_rate": 0.00040166666666666665,
1955
+ "loss": 0.0489,
1956
+ "step": 278
1957
+ },
1958
+ {
1959
+ "epoch": 0.14516129032258066,
1960
+ "grad_norm": 0.298304945230484,
1961
+ "learning_rate": 0.0004011111111111111,
1962
+ "loss": 0.0502,
1963
+ "step": 279
1964
+ },
1965
+ {
1966
+ "epoch": 0.14568158168574402,
1967
+ "grad_norm": 0.24465495347976685,
1968
+ "learning_rate": 0.0004005555555555556,
1969
+ "loss": 0.0375,
1970
+ "step": 280
1971
+ },
1972
+ {
1973
+ "epoch": 0.1462018730489074,
1974
+ "grad_norm": 0.3101779818534851,
1975
+ "learning_rate": 0.0004,
1976
+ "loss": 0.0486,
1977
+ "step": 281
1978
+ },
1979
+ {
1980
+ "epoch": 0.14672216441207075,
1981
+ "grad_norm": 0.3175954520702362,
1982
+ "learning_rate": 0.00039944444444444446,
1983
+ "loss": 0.0456,
1984
+ "step": 282
1985
+ },
1986
+ {
1987
+ "epoch": 0.14724245577523412,
1988
+ "grad_norm": 0.27732956409454346,
1989
+ "learning_rate": 0.0003988888888888889,
1990
+ "loss": 0.0372,
1991
+ "step": 283
1992
+ },
1993
+ {
1994
+ "epoch": 0.14776274713839752,
1995
+ "grad_norm": 0.5436331033706665,
1996
+ "learning_rate": 0.00039833333333333333,
1997
+ "loss": 0.0913,
1998
+ "step": 284
1999
+ },
2000
+ {
2001
+ "epoch": 0.14828303850156088,
2002
+ "grad_norm": 0.38064879179000854,
2003
+ "learning_rate": 0.00039777777777777777,
2004
+ "loss": 0.0879,
2005
+ "step": 285
2006
+ },
2007
+ {
2008
+ "epoch": 0.14880332986472425,
2009
+ "grad_norm": 0.20538319647312164,
2010
+ "learning_rate": 0.0003972222222222222,
2011
+ "loss": 0.0379,
2012
+ "step": 286
2013
+ },
2014
+ {
2015
+ "epoch": 0.14932362122788762,
2016
+ "grad_norm": 0.3068322241306305,
2017
+ "learning_rate": 0.0003966666666666667,
2018
+ "loss": 0.0614,
2019
+ "step": 287
2020
+ },
2021
+ {
2022
+ "epoch": 0.14984391259105098,
2023
+ "grad_norm": 0.2988760769367218,
2024
+ "learning_rate": 0.00039611111111111113,
2025
+ "loss": 0.0347,
2026
+ "step": 288
2027
+ },
2028
+ {
2029
+ "epoch": 0.15036420395421435,
2030
+ "grad_norm": 0.24667970836162567,
2031
+ "learning_rate": 0.00039555555555555557,
2032
+ "loss": 0.0481,
2033
+ "step": 289
2034
+ },
2035
+ {
2036
+ "epoch": 0.15088449531737774,
2037
+ "grad_norm": 0.3291466236114502,
2038
+ "learning_rate": 0.000395,
2039
+ "loss": 0.0585,
2040
+ "step": 290
2041
+ },
2042
+ {
2043
+ "epoch": 0.1514047866805411,
2044
+ "grad_norm": 0.2097170203924179,
2045
+ "learning_rate": 0.00039444444444444444,
2046
+ "loss": 0.0267,
2047
+ "step": 291
2048
+ },
2049
+ {
2050
+ "epoch": 0.15192507804370448,
2051
+ "grad_norm": 0.33159908652305603,
2052
+ "learning_rate": 0.00039388888888888893,
2053
+ "loss": 0.0328,
2054
+ "step": 292
2055
+ },
2056
+ {
2057
+ "epoch": 0.15244536940686784,
2058
+ "grad_norm": 0.2823585867881775,
2059
+ "learning_rate": 0.0003933333333333333,
2060
+ "loss": 0.0368,
2061
+ "step": 293
2062
+ },
2063
+ {
2064
+ "epoch": 0.1529656607700312,
2065
+ "grad_norm": 0.1939367949962616,
2066
+ "learning_rate": 0.0003927777777777778,
2067
+ "loss": 0.0338,
2068
+ "step": 294
2069
+ },
2070
+ {
2071
+ "epoch": 0.15348595213319458,
2072
+ "grad_norm": 0.23737141489982605,
2073
+ "learning_rate": 0.00039222222222222225,
2074
+ "loss": 0.0522,
2075
+ "step": 295
2076
+ },
2077
+ {
2078
+ "epoch": 0.15400624349635797,
2079
+ "grad_norm": 0.29729461669921875,
2080
+ "learning_rate": 0.0003916666666666667,
2081
+ "loss": 0.0409,
2082
+ "step": 296
2083
+ },
2084
+ {
2085
+ "epoch": 0.15452653485952134,
2086
+ "grad_norm": 0.186125710606575,
2087
+ "learning_rate": 0.0003911111111111111,
2088
+ "loss": 0.035,
2089
+ "step": 297
2090
+ },
2091
+ {
2092
+ "epoch": 0.1550468262226847,
2093
+ "grad_norm": 0.23367059230804443,
2094
+ "learning_rate": 0.00039055555555555556,
2095
+ "loss": 0.0471,
2096
+ "step": 298
2097
+ },
2098
+ {
2099
+ "epoch": 0.15556711758584807,
2100
+ "grad_norm": 0.2210577130317688,
2101
+ "learning_rate": 0.00039000000000000005,
2102
+ "loss": 0.0349,
2103
+ "step": 299
2104
+ },
2105
+ {
2106
+ "epoch": 0.15608740894901144,
2107
+ "grad_norm": 0.5184774994850159,
2108
+ "learning_rate": 0.00038944444444444443,
2109
+ "loss": 0.091,
2110
+ "step": 300
2111
+ },
2112
+ {
2113
+ "epoch": 0.15660770031217483,
2114
+ "grad_norm": 0.2522483170032501,
2115
+ "learning_rate": 0.0003888888888888889,
2116
+ "loss": 0.0491,
2117
+ "step": 301
2118
+ },
2119
+ {
2120
+ "epoch": 0.1571279916753382,
2121
+ "grad_norm": 0.21878671646118164,
2122
+ "learning_rate": 0.0003883333333333333,
2123
+ "loss": 0.0393,
2124
+ "step": 302
2125
+ },
2126
+ {
2127
+ "epoch": 0.15764828303850156,
2128
+ "grad_norm": 0.23364581167697906,
2129
+ "learning_rate": 0.0003877777777777778,
2130
+ "loss": 0.044,
2131
+ "step": 303
2132
+ },
2133
+ {
2134
+ "epoch": 0.15816857440166493,
2135
+ "grad_norm": 0.2022484838962555,
2136
+ "learning_rate": 0.00038722222222222223,
2137
+ "loss": 0.0479,
2138
+ "step": 304
2139
+ },
2140
+ {
2141
+ "epoch": 0.1586888657648283,
2142
+ "grad_norm": 0.2595768868923187,
2143
+ "learning_rate": 0.00038666666666666667,
2144
+ "loss": 0.0393,
2145
+ "step": 305
2146
+ },
2147
+ {
2148
+ "epoch": 0.15920915712799166,
2149
+ "grad_norm": 0.44060519337654114,
2150
+ "learning_rate": 0.00038611111111111116,
2151
+ "loss": 0.0543,
2152
+ "step": 306
2153
+ },
2154
+ {
2155
+ "epoch": 0.15972944849115506,
2156
+ "grad_norm": 0.2032414823770523,
2157
+ "learning_rate": 0.00038555555555555554,
2158
+ "loss": 0.0346,
2159
+ "step": 307
2160
+ },
2161
+ {
2162
+ "epoch": 0.16024973985431842,
2163
+ "grad_norm": 0.3000059723854065,
2164
+ "learning_rate": 0.00038500000000000003,
2165
+ "loss": 0.0427,
2166
+ "step": 308
2167
+ },
2168
+ {
2169
+ "epoch": 0.1607700312174818,
2170
+ "grad_norm": 0.18585479259490967,
2171
+ "learning_rate": 0.0003844444444444444,
2172
+ "loss": 0.0272,
2173
+ "step": 309
2174
+ },
2175
+ {
2176
+ "epoch": 0.16129032258064516,
2177
+ "grad_norm": 0.22494898736476898,
2178
+ "learning_rate": 0.0003838888888888889,
2179
+ "loss": 0.0232,
2180
+ "step": 310
2181
+ },
2182
+ {
2183
+ "epoch": 0.16181061394380852,
2184
+ "grad_norm": 0.19582317769527435,
2185
+ "learning_rate": 0.00038333333333333334,
2186
+ "loss": 0.04,
2187
+ "step": 311
2188
+ },
2189
+ {
2190
+ "epoch": 0.16233090530697192,
2191
+ "grad_norm": 0.4270728826522827,
2192
+ "learning_rate": 0.0003827777777777778,
2193
+ "loss": 0.0619,
2194
+ "step": 312
2195
+ },
2196
+ {
2197
+ "epoch": 0.16285119667013528,
2198
+ "grad_norm": 0.20349858701229095,
2199
+ "learning_rate": 0.0003822222222222223,
2200
+ "loss": 0.0421,
2201
+ "step": 313
2202
+ },
2203
+ {
2204
+ "epoch": 0.16337148803329865,
2205
+ "grad_norm": 0.3676919639110565,
2206
+ "learning_rate": 0.00038166666666666666,
2207
+ "loss": 0.0472,
2208
+ "step": 314
2209
+ },
2210
+ {
2211
+ "epoch": 0.16389177939646202,
2212
+ "grad_norm": 0.32425612211227417,
2213
+ "learning_rate": 0.00038111111111111115,
2214
+ "loss": 0.0663,
2215
+ "step": 315
2216
+ },
2217
+ {
2218
+ "epoch": 0.16441207075962538,
2219
+ "grad_norm": 0.26070570945739746,
2220
+ "learning_rate": 0.00038055555555555553,
2221
+ "loss": 0.0551,
2222
+ "step": 316
2223
+ },
2224
+ {
2225
+ "epoch": 0.16493236212278875,
2226
+ "grad_norm": 0.2385774403810501,
2227
+ "learning_rate": 0.00038,
2228
+ "loss": 0.0508,
2229
+ "step": 317
2230
+ },
2231
+ {
2232
+ "epoch": 0.16545265348595214,
2233
+ "grad_norm": 0.2036454826593399,
2234
+ "learning_rate": 0.0003794444444444444,
2235
+ "loss": 0.0393,
2236
+ "step": 318
2237
+ },
2238
+ {
2239
+ "epoch": 0.1659729448491155,
2240
+ "grad_norm": 0.21891064941883087,
2241
+ "learning_rate": 0.0003788888888888889,
2242
+ "loss": 0.0347,
2243
+ "step": 319
2244
+ },
2245
+ {
2246
+ "epoch": 0.16649323621227888,
2247
+ "grad_norm": 0.18101496994495392,
2248
+ "learning_rate": 0.0003783333333333334,
2249
+ "loss": 0.0295,
2250
+ "step": 320
2251
+ },
2252
+ {
2253
+ "epoch": 0.16701352757544224,
2254
+ "grad_norm": 0.19484540820121765,
2255
+ "learning_rate": 0.00037777777777777777,
2256
+ "loss": 0.0313,
2257
+ "step": 321
2258
+ },
2259
+ {
2260
+ "epoch": 0.1675338189386056,
2261
+ "grad_norm": 0.22532738745212555,
2262
+ "learning_rate": 0.00037722222222222226,
2263
+ "loss": 0.0382,
2264
+ "step": 322
2265
+ },
2266
+ {
2267
+ "epoch": 0.16805411030176898,
2268
+ "grad_norm": 0.2155781388282776,
2269
+ "learning_rate": 0.00037666666666666664,
2270
+ "loss": 0.0352,
2271
+ "step": 323
2272
+ },
2273
+ {
2274
+ "epoch": 0.16857440166493237,
2275
+ "grad_norm": 0.22214792668819427,
2276
+ "learning_rate": 0.00037611111111111113,
2277
+ "loss": 0.0425,
2278
+ "step": 324
2279
+ },
2280
+ {
2281
+ "epoch": 0.16909469302809574,
2282
+ "grad_norm": 0.2648473083972931,
2283
+ "learning_rate": 0.0003755555555555555,
2284
+ "loss": 0.0422,
2285
+ "step": 325
2286
+ },
2287
+ {
2288
+ "epoch": 0.1696149843912591,
2289
+ "grad_norm": 0.22539383172988892,
2290
+ "learning_rate": 0.000375,
2291
+ "loss": 0.0366,
2292
+ "step": 326
2293
+ },
2294
+ {
2295
+ "epoch": 0.17013527575442247,
2296
+ "grad_norm": 0.19195836782455444,
2297
+ "learning_rate": 0.0003744444444444445,
2298
+ "loss": 0.0279,
2299
+ "step": 327
2300
+ },
2301
+ {
2302
+ "epoch": 0.17065556711758584,
2303
+ "grad_norm": 0.2254018783569336,
2304
+ "learning_rate": 0.0003738888888888889,
2305
+ "loss": 0.0455,
2306
+ "step": 328
2307
+ },
2308
+ {
2309
+ "epoch": 0.17117585848074923,
2310
+ "grad_norm": 0.2259969264268875,
2311
+ "learning_rate": 0.0003733333333333334,
2312
+ "loss": 0.0301,
2313
+ "step": 329
2314
+ },
2315
+ {
2316
+ "epoch": 0.1716961498439126,
2317
+ "grad_norm": 0.20350702106952667,
2318
+ "learning_rate": 0.00037277777777777776,
2319
+ "loss": 0.0459,
2320
+ "step": 330
2321
+ },
2322
+ {
2323
+ "epoch": 0.17221644120707597,
2324
+ "grad_norm": 0.17894725501537323,
2325
+ "learning_rate": 0.00037222222222222225,
2326
+ "loss": 0.0325,
2327
+ "step": 331
2328
+ },
2329
+ {
2330
+ "epoch": 0.17273673257023933,
2331
+ "grad_norm": 0.22505900263786316,
2332
+ "learning_rate": 0.00037166666666666663,
2333
+ "loss": 0.0404,
2334
+ "step": 332
2335
+ },
2336
+ {
2337
+ "epoch": 0.1732570239334027,
2338
+ "grad_norm": 0.10483799874782562,
2339
+ "learning_rate": 0.0003711111111111111,
2340
+ "loss": 0.0153,
2341
+ "step": 333
2342
+ },
2343
+ {
2344
+ "epoch": 0.17377731529656607,
2345
+ "grad_norm": 0.1504441499710083,
2346
+ "learning_rate": 0.0003705555555555556,
2347
+ "loss": 0.0281,
2348
+ "step": 334
2349
+ },
2350
+ {
2351
+ "epoch": 0.17429760665972946,
2352
+ "grad_norm": 0.22857385873794556,
2353
+ "learning_rate": 0.00037,
2354
+ "loss": 0.0257,
2355
+ "step": 335
2356
+ },
2357
+ {
2358
+ "epoch": 0.17481789802289283,
2359
+ "grad_norm": 0.19890117645263672,
2360
+ "learning_rate": 0.0003694444444444445,
2361
+ "loss": 0.0275,
2362
+ "step": 336
2363
+ },
2364
+ {
2365
+ "epoch": 0.1753381893860562,
2366
+ "grad_norm": 0.17106270790100098,
2367
+ "learning_rate": 0.00036888888888888887,
2368
+ "loss": 0.0371,
2369
+ "step": 337
2370
+ },
2371
+ {
2372
+ "epoch": 0.17585848074921956,
2373
+ "grad_norm": 0.3300045430660248,
2374
+ "learning_rate": 0.00036833333333333336,
2375
+ "loss": 0.0481,
2376
+ "step": 338
2377
+ },
2378
+ {
2379
+ "epoch": 0.17637877211238293,
2380
+ "grad_norm": 0.26582735776901245,
2381
+ "learning_rate": 0.00036777777777777774,
2382
+ "loss": 0.0398,
2383
+ "step": 339
2384
+ },
2385
+ {
2386
+ "epoch": 0.1768990634755463,
2387
+ "grad_norm": 0.20054687559604645,
2388
+ "learning_rate": 0.00036722222222222223,
2389
+ "loss": 0.0288,
2390
+ "step": 340
2391
+ },
2392
+ {
2393
+ "epoch": 0.1774193548387097,
2394
+ "grad_norm": 0.15207038819789886,
2395
+ "learning_rate": 0.00036666666666666667,
2396
+ "loss": 0.0352,
2397
+ "step": 341
2398
+ },
2399
+ {
2400
+ "epoch": 0.17793964620187305,
2401
+ "grad_norm": 0.13785234093666077,
2402
+ "learning_rate": 0.0003661111111111111,
2403
+ "loss": 0.027,
2404
+ "step": 342
2405
+ },
2406
+ {
2407
+ "epoch": 0.17845993756503642,
2408
+ "grad_norm": 0.16440491378307343,
2409
+ "learning_rate": 0.0003655555555555556,
2410
+ "loss": 0.0327,
2411
+ "step": 343
2412
+ },
2413
+ {
2414
+ "epoch": 0.17898022892819979,
2415
+ "grad_norm": 0.15854951739311218,
2416
+ "learning_rate": 0.000365,
2417
+ "loss": 0.0254,
2418
+ "step": 344
2419
+ },
2420
+ {
2421
+ "epoch": 0.17950052029136315,
2422
+ "grad_norm": 0.1805776059627533,
2423
+ "learning_rate": 0.00036444444444444447,
2424
+ "loss": 0.0425,
2425
+ "step": 345
2426
+ },
2427
+ {
2428
+ "epoch": 0.18002081165452655,
2429
+ "grad_norm": 0.4681404232978821,
2430
+ "learning_rate": 0.00036388888888888886,
2431
+ "loss": 0.0588,
2432
+ "step": 346
2433
+ },
2434
+ {
2435
+ "epoch": 0.1805411030176899,
2436
+ "grad_norm": 0.16028301417827606,
2437
+ "learning_rate": 0.00036333333333333335,
2438
+ "loss": 0.0215,
2439
+ "step": 347
2440
+ },
2441
+ {
2442
+ "epoch": 0.18106139438085328,
2443
+ "grad_norm": 0.16450455784797668,
2444
+ "learning_rate": 0.0003627777777777778,
2445
+ "loss": 0.0245,
2446
+ "step": 348
2447
+ },
2448
+ {
2449
+ "epoch": 0.18158168574401665,
2450
+ "grad_norm": 0.2902337312698364,
2451
+ "learning_rate": 0.0003622222222222222,
2452
+ "loss": 0.0475,
2453
+ "step": 349
2454
+ },
2455
+ {
2456
+ "epoch": 0.18210197710718,
2457
+ "grad_norm": 0.27946949005126953,
2458
+ "learning_rate": 0.0003616666666666667,
2459
+ "loss": 0.0449,
2460
+ "step": 350
2461
+ },
2462
+ {
2463
+ "epoch": 0.18262226847034338,
2464
+ "grad_norm": 0.17264722287654877,
2465
+ "learning_rate": 0.0003611111111111111,
2466
+ "loss": 0.0331,
2467
+ "step": 351
2468
+ },
2469
+ {
2470
+ "epoch": 0.18314255983350677,
2471
+ "grad_norm": 0.24759423732757568,
2472
+ "learning_rate": 0.0003605555555555556,
2473
+ "loss": 0.0385,
2474
+ "step": 352
2475
+ },
2476
+ {
2477
+ "epoch": 0.18366285119667014,
2478
+ "grad_norm": 0.14519743621349335,
2479
+ "learning_rate": 0.00035999999999999997,
2480
+ "loss": 0.0209,
2481
+ "step": 353
2482
+ },
2483
+ {
2484
+ "epoch": 0.1841831425598335,
2485
+ "grad_norm": 0.27116432785987854,
2486
+ "learning_rate": 0.00035944444444444446,
2487
+ "loss": 0.0419,
2488
+ "step": 354
2489
+ },
2490
+ {
2491
+ "epoch": 0.18470343392299687,
2492
+ "grad_norm": 0.1809036135673523,
2493
+ "learning_rate": 0.0003588888888888889,
2494
+ "loss": 0.0423,
2495
+ "step": 355
2496
+ },
2497
+ {
2498
+ "epoch": 0.18522372528616024,
2499
+ "grad_norm": 0.23334546387195587,
2500
+ "learning_rate": 0.00035833333333333333,
2501
+ "loss": 0.0483,
2502
+ "step": 356
2503
+ },
2504
+ {
2505
+ "epoch": 0.18574401664932363,
2506
+ "grad_norm": 0.12199573218822479,
2507
+ "learning_rate": 0.00035777777777777777,
2508
+ "loss": 0.0294,
2509
+ "step": 357
2510
+ },
2511
+ {
2512
+ "epoch": 0.186264308012487,
2513
+ "grad_norm": 0.24833369255065918,
2514
+ "learning_rate": 0.0003572222222222222,
2515
+ "loss": 0.0328,
2516
+ "step": 358
2517
+ },
2518
+ {
2519
+ "epoch": 0.18678459937565037,
2520
+ "grad_norm": 0.46454137563705444,
2521
+ "learning_rate": 0.0003566666666666667,
2522
+ "loss": 0.0598,
2523
+ "step": 359
2524
+ },
2525
+ {
2526
+ "epoch": 0.18730489073881373,
2527
+ "grad_norm": 0.19704070687294006,
2528
+ "learning_rate": 0.0003561111111111111,
2529
+ "loss": 0.0278,
2530
+ "step": 360
2531
+ },
2532
+ {
2533
+ "epoch": 0.1878251821019771,
2534
+ "grad_norm": 0.18981118500232697,
2535
+ "learning_rate": 0.00035555555555555557,
2536
+ "loss": 0.0404,
2537
+ "step": 361
2538
+ },
2539
+ {
2540
+ "epoch": 0.18834547346514047,
2541
+ "grad_norm": 0.20858381688594818,
2542
+ "learning_rate": 0.000355,
2543
+ "loss": 0.0343,
2544
+ "step": 362
2545
+ },
2546
+ {
2547
+ "epoch": 0.18886576482830386,
2548
+ "grad_norm": 0.14601057767868042,
2549
+ "learning_rate": 0.00035444444444444445,
2550
+ "loss": 0.0293,
2551
+ "step": 363
2552
+ },
2553
+ {
2554
+ "epoch": 0.18938605619146723,
2555
+ "grad_norm": 0.1576007604598999,
2556
+ "learning_rate": 0.0003538888888888889,
2557
+ "loss": 0.0328,
2558
+ "step": 364
2559
+ },
2560
+ {
2561
+ "epoch": 0.1899063475546306,
2562
+ "grad_norm": 0.18504372239112854,
2563
+ "learning_rate": 0.0003533333333333333,
2564
+ "loss": 0.0315,
2565
+ "step": 365
2566
+ },
2567
+ {
2568
+ "epoch": 0.19042663891779396,
2569
+ "grad_norm": 0.16126742959022522,
2570
+ "learning_rate": 0.0003527777777777778,
2571
+ "loss": 0.0277,
2572
+ "step": 366
2573
+ },
2574
+ {
2575
+ "epoch": 0.19094693028095733,
2576
+ "grad_norm": 0.22791104018688202,
2577
+ "learning_rate": 0.00035222222222222225,
2578
+ "loss": 0.0452,
2579
+ "step": 367
2580
+ },
2581
+ {
2582
+ "epoch": 0.1914672216441207,
2583
+ "grad_norm": 0.2693690359592438,
2584
+ "learning_rate": 0.0003516666666666667,
2585
+ "loss": 0.0427,
2586
+ "step": 368
2587
+ },
2588
+ {
2589
+ "epoch": 0.1919875130072841,
2590
+ "grad_norm": 0.1650257259607315,
2591
+ "learning_rate": 0.0003511111111111111,
2592
+ "loss": 0.0241,
2593
+ "step": 369
2594
+ },
2595
+ {
2596
+ "epoch": 0.19250780437044746,
2597
+ "grad_norm": 0.22772428393363953,
2598
+ "learning_rate": 0.00035055555555555556,
2599
+ "loss": 0.0483,
2600
+ "step": 370
2601
+ },
2602
+ {
2603
+ "epoch": 0.19302809573361082,
2604
+ "grad_norm": 0.24612616002559662,
2605
+ "learning_rate": 0.00035,
2606
+ "loss": 0.042,
2607
+ "step": 371
2608
+ },
2609
+ {
2610
+ "epoch": 0.1935483870967742,
2611
+ "grad_norm": 0.22736461460590363,
2612
+ "learning_rate": 0.00034944444444444443,
2613
+ "loss": 0.0487,
2614
+ "step": 372
2615
+ },
2616
+ {
2617
+ "epoch": 0.19406867845993755,
2618
+ "grad_norm": 0.23257088661193848,
2619
+ "learning_rate": 0.0003488888888888889,
2620
+ "loss": 0.0515,
2621
+ "step": 373
2622
+ },
2623
+ {
2624
+ "epoch": 0.19458896982310095,
2625
+ "grad_norm": 0.2097531259059906,
2626
+ "learning_rate": 0.00034833333333333336,
2627
+ "loss": 0.0352,
2628
+ "step": 374
2629
+ },
2630
+ {
2631
+ "epoch": 0.19510926118626432,
2632
+ "grad_norm": 0.28301218152046204,
2633
+ "learning_rate": 0.0003477777777777778,
2634
+ "loss": 0.042,
2635
+ "step": 375
2636
+ },
2637
+ {
2638
+ "epoch": 0.19562955254942768,
2639
+ "grad_norm": 0.18338818848133087,
2640
+ "learning_rate": 0.00034722222222222224,
2641
+ "loss": 0.0285,
2642
+ "step": 376
2643
+ },
2644
+ {
2645
+ "epoch": 0.19614984391259105,
2646
+ "grad_norm": 0.21453578770160675,
2647
+ "learning_rate": 0.00034666666666666667,
2648
+ "loss": 0.0557,
2649
+ "step": 377
2650
+ },
2651
+ {
2652
+ "epoch": 0.19667013527575442,
2653
+ "grad_norm": 0.16289933025836945,
2654
+ "learning_rate": 0.0003461111111111111,
2655
+ "loss": 0.0337,
2656
+ "step": 378
2657
+ },
2658
+ {
2659
+ "epoch": 0.19719042663891778,
2660
+ "grad_norm": 0.19443009793758392,
2661
+ "learning_rate": 0.00034555555555555555,
2662
+ "loss": 0.0314,
2663
+ "step": 379
2664
+ },
2665
+ {
2666
+ "epoch": 0.19771071800208118,
2667
+ "grad_norm": 0.24147702753543854,
2668
+ "learning_rate": 0.000345,
2669
+ "loss": 0.051,
2670
+ "step": 380
2671
+ },
2672
+ {
2673
+ "epoch": 0.19823100936524454,
2674
+ "grad_norm": 0.19166404008865356,
2675
+ "learning_rate": 0.0003444444444444445,
2676
+ "loss": 0.041,
2677
+ "step": 381
2678
+ },
2679
+ {
2680
+ "epoch": 0.1987513007284079,
2681
+ "grad_norm": 0.26511725783348083,
2682
+ "learning_rate": 0.0003438888888888889,
2683
+ "loss": 0.043,
2684
+ "step": 382
2685
+ },
2686
+ {
2687
+ "epoch": 0.19927159209157128,
2688
+ "grad_norm": 0.19884304702281952,
2689
+ "learning_rate": 0.00034333333333333335,
2690
+ "loss": 0.0388,
2691
+ "step": 383
2692
+ },
2693
+ {
2694
+ "epoch": 0.19979188345473464,
2695
+ "grad_norm": 0.1897716373205185,
2696
+ "learning_rate": 0.0003427777777777778,
2697
+ "loss": 0.0288,
2698
+ "step": 384
2699
+ },
2700
+ {
2701
+ "epoch": 0.20031217481789804,
2702
+ "grad_norm": 0.228108212351799,
2703
+ "learning_rate": 0.0003422222222222222,
2704
+ "loss": 0.0208,
2705
+ "step": 385
2706
+ },
2707
+ {
2708
+ "epoch": 0.2008324661810614,
2709
+ "grad_norm": 0.22205443680286407,
2710
+ "learning_rate": 0.00034166666666666666,
2711
+ "loss": 0.0449,
2712
+ "step": 386
2713
+ },
2714
+ {
2715
+ "epoch": 0.20135275754422477,
2716
+ "grad_norm": 0.2547477185726166,
2717
+ "learning_rate": 0.0003411111111111111,
2718
+ "loss": 0.0419,
2719
+ "step": 387
2720
+ },
2721
+ {
2722
+ "epoch": 0.20187304890738814,
2723
+ "grad_norm": 0.26517170667648315,
2724
+ "learning_rate": 0.0003405555555555556,
2725
+ "loss": 0.0419,
2726
+ "step": 388
2727
+ },
2728
+ {
2729
+ "epoch": 0.2023933402705515,
2730
+ "grad_norm": 0.37391191720962524,
2731
+ "learning_rate": 0.00034,
2732
+ "loss": 0.0593,
2733
+ "step": 389
2734
+ },
2735
+ {
2736
+ "epoch": 0.20291363163371487,
2737
+ "grad_norm": 0.18347249925136566,
2738
+ "learning_rate": 0.00033944444444444446,
2739
+ "loss": 0.0283,
2740
+ "step": 390
2741
+ },
2742
+ {
2743
+ "epoch": 0.20343392299687826,
2744
+ "grad_norm": 0.20623968541622162,
2745
+ "learning_rate": 0.0003388888888888889,
2746
+ "loss": 0.0308,
2747
+ "step": 391
2748
+ },
2749
+ {
2750
+ "epoch": 0.20395421436004163,
2751
+ "grad_norm": 0.25673940777778625,
2752
+ "learning_rate": 0.00033833333333333334,
2753
+ "loss": 0.0571,
2754
+ "step": 392
2755
+ },
2756
+ {
2757
+ "epoch": 0.204474505723205,
2758
+ "grad_norm": 0.12756018340587616,
2759
+ "learning_rate": 0.00033777777777777777,
2760
+ "loss": 0.0291,
2761
+ "step": 393
2762
+ },
2763
+ {
2764
+ "epoch": 0.20499479708636836,
2765
+ "grad_norm": 0.18630138039588928,
2766
+ "learning_rate": 0.0003372222222222222,
2767
+ "loss": 0.0353,
2768
+ "step": 394
2769
+ },
2770
+ {
2771
+ "epoch": 0.20551508844953173,
2772
+ "grad_norm": 0.1463075578212738,
2773
+ "learning_rate": 0.0003366666666666667,
2774
+ "loss": 0.0274,
2775
+ "step": 395
2776
+ },
2777
+ {
2778
+ "epoch": 0.2060353798126951,
2779
+ "grad_norm": 0.21908532083034515,
2780
+ "learning_rate": 0.00033611111111111114,
2781
+ "loss": 0.0366,
2782
+ "step": 396
2783
+ },
2784
+ {
2785
+ "epoch": 0.2065556711758585,
2786
+ "grad_norm": 0.23834265768527985,
2787
+ "learning_rate": 0.0003355555555555556,
2788
+ "loss": 0.0228,
2789
+ "step": 397
2790
+ },
2791
+ {
2792
+ "epoch": 0.20707596253902186,
2793
+ "grad_norm": 0.264460951089859,
2794
+ "learning_rate": 0.000335,
2795
+ "loss": 0.0412,
2796
+ "step": 398
2797
+ },
2798
+ {
2799
+ "epoch": 0.20759625390218522,
2800
+ "grad_norm": 0.29063087701797485,
2801
+ "learning_rate": 0.00033444444444444445,
2802
+ "loss": 0.0493,
2803
+ "step": 399
2804
+ },
2805
+ {
2806
+ "epoch": 0.2081165452653486,
2807
+ "grad_norm": 0.1892634779214859,
2808
+ "learning_rate": 0.0003338888888888889,
2809
+ "loss": 0.0336,
2810
+ "step": 400
2811
+ },
2812
+ {
2813
+ "epoch": 0.20863683662851196,
2814
+ "grad_norm": 0.2367408573627472,
2815
+ "learning_rate": 0.0003333333333333333,
2816
+ "loss": 0.0332,
2817
+ "step": 401
2818
+ },
2819
+ {
2820
+ "epoch": 0.20915712799167535,
2821
+ "grad_norm": 0.1723126769065857,
2822
+ "learning_rate": 0.0003327777777777778,
2823
+ "loss": 0.0319,
2824
+ "step": 402
2825
+ },
2826
+ {
2827
+ "epoch": 0.20967741935483872,
2828
+ "grad_norm": 0.14627283811569214,
2829
+ "learning_rate": 0.0003322222222222222,
2830
+ "loss": 0.0266,
2831
+ "step": 403
2832
+ },
2833
+ {
2834
+ "epoch": 0.21019771071800208,
2835
+ "grad_norm": 0.1599954068660736,
2836
+ "learning_rate": 0.0003316666666666667,
2837
+ "loss": 0.0388,
2838
+ "step": 404
2839
+ },
2840
+ {
2841
+ "epoch": 0.21071800208116545,
2842
+ "grad_norm": 0.37434253096580505,
2843
+ "learning_rate": 0.0003311111111111111,
2844
+ "loss": 0.0653,
2845
+ "step": 405
2846
+ },
2847
+ {
2848
+ "epoch": 0.21123829344432882,
2849
+ "grad_norm": 0.1519968956708908,
2850
+ "learning_rate": 0.00033055555555555556,
2851
+ "loss": 0.0289,
2852
+ "step": 406
2853
+ },
2854
+ {
2855
+ "epoch": 0.21175858480749218,
2856
+ "grad_norm": 0.14485976099967957,
2857
+ "learning_rate": 0.00033,
2858
+ "loss": 0.0161,
2859
+ "step": 407
2860
+ },
2861
+ {
2862
+ "epoch": 0.21227887617065558,
2863
+ "grad_norm": 0.3291303217411041,
2864
+ "learning_rate": 0.00032944444444444444,
2865
+ "loss": 0.0527,
2866
+ "step": 408
2867
+ },
2868
+ {
2869
+ "epoch": 0.21279916753381894,
2870
+ "grad_norm": 0.19733606278896332,
2871
+ "learning_rate": 0.0003288888888888889,
2872
+ "loss": 0.0405,
2873
+ "step": 409
2874
+ },
2875
+ {
2876
+ "epoch": 0.2133194588969823,
2877
+ "grad_norm": 0.2552485764026642,
2878
+ "learning_rate": 0.0003283333333333333,
2879
+ "loss": 0.0339,
2880
+ "step": 410
2881
+ },
2882
+ {
2883
+ "epoch": 0.21383975026014568,
2884
+ "grad_norm": 0.14234775304794312,
2885
+ "learning_rate": 0.0003277777777777778,
2886
+ "loss": 0.0294,
2887
+ "step": 411
2888
+ },
2889
+ {
2890
+ "epoch": 0.21436004162330904,
2891
+ "grad_norm": 0.2233223021030426,
2892
+ "learning_rate": 0.00032722222222222224,
2893
+ "loss": 0.0492,
2894
+ "step": 412
2895
+ },
2896
+ {
2897
+ "epoch": 0.2148803329864724,
2898
+ "grad_norm": 0.11738775670528412,
2899
+ "learning_rate": 0.0003266666666666667,
2900
+ "loss": 0.0247,
2901
+ "step": 413
2902
+ },
2903
+ {
2904
+ "epoch": 0.2154006243496358,
2905
+ "grad_norm": 0.1777840107679367,
2906
+ "learning_rate": 0.0003261111111111111,
2907
+ "loss": 0.0241,
2908
+ "step": 414
2909
+ },
2910
+ {
2911
+ "epoch": 0.21592091571279917,
2912
+ "grad_norm": 0.20584549009799957,
2913
+ "learning_rate": 0.00032555555555555555,
2914
+ "loss": 0.0251,
2915
+ "step": 415
2916
+ },
2917
+ {
2918
+ "epoch": 0.21644120707596254,
2919
+ "grad_norm": 0.16335804760456085,
2920
+ "learning_rate": 0.00032500000000000004,
2921
+ "loss": 0.0232,
2922
+ "step": 416
2923
+ },
2924
+ {
2925
+ "epoch": 0.2169614984391259,
2926
+ "grad_norm": 0.1476750373840332,
2927
+ "learning_rate": 0.0003244444444444444,
2928
+ "loss": 0.0259,
2929
+ "step": 417
2930
+ },
2931
+ {
2932
+ "epoch": 0.21748178980228927,
2933
+ "grad_norm": 0.25620049238204956,
2934
+ "learning_rate": 0.0003238888888888889,
2935
+ "loss": 0.056,
2936
+ "step": 418
2937
+ },
2938
+ {
2939
+ "epoch": 0.21800208116545267,
2940
+ "grad_norm": 0.2029629945755005,
2941
+ "learning_rate": 0.0003233333333333333,
2942
+ "loss": 0.0318,
2943
+ "step": 419
2944
+ },
2945
+ {
2946
+ "epoch": 0.21852237252861603,
2947
+ "grad_norm": 0.3485390245914459,
2948
+ "learning_rate": 0.0003227777777777778,
2949
+ "loss": 0.0371,
2950
+ "step": 420
2951
+ },
2952
+ {
2953
+ "epoch": 0.2190426638917794,
2954
+ "grad_norm": 0.11944156140089035,
2955
+ "learning_rate": 0.0003222222222222222,
2956
+ "loss": 0.0225,
2957
+ "step": 421
2958
+ },
2959
+ {
2960
+ "epoch": 0.21956295525494277,
2961
+ "grad_norm": 0.1591196358203888,
2962
+ "learning_rate": 0.00032166666666666666,
2963
+ "loss": 0.0312,
2964
+ "step": 422
2965
+ },
2966
+ {
2967
+ "epoch": 0.22008324661810613,
2968
+ "grad_norm": 0.1827545315027237,
2969
+ "learning_rate": 0.00032111111111111115,
2970
+ "loss": 0.0343,
2971
+ "step": 423
2972
+ },
2973
+ {
2974
+ "epoch": 0.2206035379812695,
2975
+ "grad_norm": 0.21761400997638702,
2976
+ "learning_rate": 0.00032055555555555554,
2977
+ "loss": 0.0407,
2978
+ "step": 424
2979
+ },
2980
+ {
2981
+ "epoch": 0.2211238293444329,
2982
+ "grad_norm": 0.18732213973999023,
2983
+ "learning_rate": 0.00032,
2984
+ "loss": 0.0395,
2985
+ "step": 425
2986
+ },
2987
+ {
2988
+ "epoch": 0.22164412070759626,
2989
+ "grad_norm": 0.12878796458244324,
2990
+ "learning_rate": 0.0003194444444444444,
2991
+ "loss": 0.0234,
2992
+ "step": 426
2993
+ },
2994
+ {
2995
+ "epoch": 0.22216441207075963,
2996
+ "grad_norm": 0.29317036271095276,
2997
+ "learning_rate": 0.0003188888888888889,
2998
+ "loss": 0.0436,
2999
+ "step": 427
3000
+ },
3001
+ {
3002
+ "epoch": 0.222684703433923,
3003
+ "grad_norm": 0.27346885204315186,
3004
+ "learning_rate": 0.00031833333333333334,
3005
+ "loss": 0.0359,
3006
+ "step": 428
3007
+ },
3008
+ {
3009
+ "epoch": 0.22320499479708636,
3010
+ "grad_norm": 0.12804454565048218,
3011
+ "learning_rate": 0.0003177777777777778,
3012
+ "loss": 0.0269,
3013
+ "step": 429
3014
+ },
3015
+ {
3016
+ "epoch": 0.22372528616024975,
3017
+ "grad_norm": 0.2954390347003937,
3018
+ "learning_rate": 0.00031722222222222227,
3019
+ "loss": 0.0429,
3020
+ "step": 430
3021
+ },
3022
+ {
3023
+ "epoch": 0.22424557752341312,
3024
+ "grad_norm": 0.12796026468276978,
3025
+ "learning_rate": 0.00031666666666666665,
3026
+ "loss": 0.0346,
3027
+ "step": 431
3028
+ },
3029
+ {
3030
+ "epoch": 0.2247658688865765,
3031
+ "grad_norm": 0.24340416491031647,
3032
+ "learning_rate": 0.00031611111111111114,
3033
+ "loss": 0.0322,
3034
+ "step": 432
3035
+ },
3036
+ {
3037
+ "epoch": 0.22528616024973985,
3038
+ "grad_norm": 0.13648621737957,
3039
+ "learning_rate": 0.0003155555555555555,
3040
+ "loss": 0.0238,
3041
+ "step": 433
3042
+ },
3043
+ {
3044
+ "epoch": 0.22580645161290322,
3045
+ "grad_norm": 0.17342010140419006,
3046
+ "learning_rate": 0.000315,
3047
+ "loss": 0.0222,
3048
+ "step": 434
3049
+ },
3050
+ {
3051
+ "epoch": 0.22632674297606659,
3052
+ "grad_norm": 0.21049854159355164,
3053
+ "learning_rate": 0.0003144444444444445,
3054
+ "loss": 0.022,
3055
+ "step": 435
3056
+ },
3057
+ {
3058
+ "epoch": 0.22684703433922998,
3059
+ "grad_norm": 0.24159543216228485,
3060
+ "learning_rate": 0.0003138888888888889,
3061
+ "loss": 0.0359,
3062
+ "step": 436
3063
+ },
3064
+ {
3065
+ "epoch": 0.22736732570239335,
3066
+ "grad_norm": 0.18714144825935364,
3067
+ "learning_rate": 0.0003133333333333334,
3068
+ "loss": 0.0386,
3069
+ "step": 437
3070
+ },
3071
+ {
3072
+ "epoch": 0.2278876170655567,
3073
+ "grad_norm": 0.24189646542072296,
3074
+ "learning_rate": 0.00031277777777777776,
3075
+ "loss": 0.0382,
3076
+ "step": 438
3077
+ },
3078
+ {
3079
+ "epoch": 0.22840790842872008,
3080
+ "grad_norm": 0.16704939305782318,
3081
+ "learning_rate": 0.00031222222222222225,
3082
+ "loss": 0.0443,
3083
+ "step": 439
3084
+ },
3085
+ {
3086
+ "epoch": 0.22892819979188345,
3087
+ "grad_norm": 0.20545163750648499,
3088
+ "learning_rate": 0.00031166666666666663,
3089
+ "loss": 0.041,
3090
+ "step": 440
3091
+ },
3092
+ {
3093
+ "epoch": 0.2294484911550468,
3094
+ "grad_norm": 0.16772353649139404,
3095
+ "learning_rate": 0.0003111111111111111,
3096
+ "loss": 0.0275,
3097
+ "step": 441
3098
+ },
3099
+ {
3100
+ "epoch": 0.2299687825182102,
3101
+ "grad_norm": 0.22355173528194427,
3102
+ "learning_rate": 0.0003105555555555555,
3103
+ "loss": 0.0352,
3104
+ "step": 442
3105
+ },
3106
+ {
3107
+ "epoch": 0.23048907388137357,
3108
+ "grad_norm": 0.24697473645210266,
3109
+ "learning_rate": 0.00031,
3110
+ "loss": 0.0351,
3111
+ "step": 443
3112
+ },
3113
+ {
3114
+ "epoch": 0.23100936524453694,
3115
+ "grad_norm": 0.17634686827659607,
3116
+ "learning_rate": 0.0003094444444444445,
3117
+ "loss": 0.0274,
3118
+ "step": 444
3119
+ },
3120
+ {
3121
+ "epoch": 0.2315296566077003,
3122
+ "grad_norm": 0.24014054238796234,
3123
+ "learning_rate": 0.0003088888888888889,
3124
+ "loss": 0.0268,
3125
+ "step": 445
3126
+ },
3127
+ {
3128
+ "epoch": 0.23204994797086367,
3129
+ "grad_norm": 0.12158364802598953,
3130
+ "learning_rate": 0.00030833333333333337,
3131
+ "loss": 0.0238,
3132
+ "step": 446
3133
+ },
3134
+ {
3135
+ "epoch": 0.23257023933402707,
3136
+ "grad_norm": 0.24085555970668793,
3137
+ "learning_rate": 0.00030777777777777775,
3138
+ "loss": 0.0332,
3139
+ "step": 447
3140
+ },
3141
+ {
3142
+ "epoch": 0.23309053069719043,
3143
+ "grad_norm": 0.17350415885448456,
3144
+ "learning_rate": 0.00030722222222222224,
3145
+ "loss": 0.0418,
3146
+ "step": 448
3147
+ },
3148
+ {
3149
+ "epoch": 0.2336108220603538,
3150
+ "grad_norm": 0.10665347427129745,
3151
+ "learning_rate": 0.0003066666666666667,
3152
+ "loss": 0.0217,
3153
+ "step": 449
3154
+ },
3155
+ {
3156
+ "epoch": 0.23413111342351717,
3157
+ "grad_norm": 0.23732754588127136,
3158
+ "learning_rate": 0.0003061111111111111,
3159
+ "loss": 0.0406,
3160
+ "step": 450
3161
+ },
3162
+ {
3163
+ "epoch": 0.23465140478668053,
3164
+ "grad_norm": 0.09794217348098755,
3165
+ "learning_rate": 0.0003055555555555556,
3166
+ "loss": 0.0207,
3167
+ "step": 451
3168
+ },
3169
+ {
3170
+ "epoch": 0.2351716961498439,
3171
+ "grad_norm": 0.20581063628196716,
3172
+ "learning_rate": 0.000305,
3173
+ "loss": 0.0339,
3174
+ "step": 452
3175
+ },
3176
+ {
3177
+ "epoch": 0.2356919875130073,
3178
+ "grad_norm": 0.17121030390262604,
3179
+ "learning_rate": 0.0003044444444444445,
3180
+ "loss": 0.0325,
3181
+ "step": 453
3182
+ },
3183
+ {
3184
+ "epoch": 0.23621227887617066,
3185
+ "grad_norm": 0.16894112527370453,
3186
+ "learning_rate": 0.00030388888888888886,
3187
+ "loss": 0.033,
3188
+ "step": 454
3189
+ },
3190
+ {
3191
+ "epoch": 0.23673257023933403,
3192
+ "grad_norm": 0.09503252059221268,
3193
+ "learning_rate": 0.00030333333333333335,
3194
+ "loss": 0.0156,
3195
+ "step": 455
3196
+ },
3197
+ {
3198
+ "epoch": 0.2372528616024974,
3199
+ "grad_norm": 0.2337169647216797,
3200
+ "learning_rate": 0.0003027777777777778,
3201
+ "loss": 0.0208,
3202
+ "step": 456
3203
+ },
3204
+ {
3205
+ "epoch": 0.23777315296566076,
3206
+ "grad_norm": 0.20605909824371338,
3207
+ "learning_rate": 0.0003022222222222222,
3208
+ "loss": 0.034,
3209
+ "step": 457
3210
+ },
3211
+ {
3212
+ "epoch": 0.23829344432882413,
3213
+ "grad_norm": 0.15843386948108673,
3214
+ "learning_rate": 0.0003016666666666667,
3215
+ "loss": 0.0298,
3216
+ "step": 458
3217
+ },
3218
+ {
3219
+ "epoch": 0.23881373569198752,
3220
+ "grad_norm": 0.1802842915058136,
3221
+ "learning_rate": 0.0003011111111111111,
3222
+ "loss": 0.0216,
3223
+ "step": 459
3224
+ },
3225
+ {
3226
+ "epoch": 0.2393340270551509,
3227
+ "grad_norm": 0.13717086613178253,
3228
+ "learning_rate": 0.0003005555555555556,
3229
+ "loss": 0.0249,
3230
+ "step": 460
3231
+ },
3232
+ {
3233
+ "epoch": 0.23985431841831426,
3234
+ "grad_norm": 0.19162088632583618,
3235
+ "learning_rate": 0.0003,
3236
+ "loss": 0.0475,
3237
+ "step": 461
3238
+ },
3239
+ {
3240
+ "epoch": 0.24037460978147762,
3241
+ "grad_norm": 0.23011524975299835,
3242
+ "learning_rate": 0.00029944444444444446,
3243
+ "loss": 0.0313,
3244
+ "step": 462
3245
+ },
3246
+ {
3247
+ "epoch": 0.240894901144641,
3248
+ "grad_norm": 0.18215711414813995,
3249
+ "learning_rate": 0.0002988888888888889,
3250
+ "loss": 0.0378,
3251
+ "step": 463
3252
+ },
3253
+ {
3254
+ "epoch": 0.24141519250780438,
3255
+ "grad_norm": 0.3314879834651947,
3256
+ "learning_rate": 0.00029833333333333334,
3257
+ "loss": 0.0386,
3258
+ "step": 464
3259
+ },
3260
+ {
3261
+ "epoch": 0.24193548387096775,
3262
+ "grad_norm": 0.18399035930633545,
3263
+ "learning_rate": 0.0002977777777777778,
3264
+ "loss": 0.037,
3265
+ "step": 465
3266
+ },
3267
+ {
3268
+ "epoch": 0.24245577523413112,
3269
+ "grad_norm": 0.3071196377277374,
3270
+ "learning_rate": 0.0002972222222222222,
3271
+ "loss": 0.0362,
3272
+ "step": 466
3273
+ },
3274
+ {
3275
+ "epoch": 0.24297606659729448,
3276
+ "grad_norm": 0.13809853792190552,
3277
+ "learning_rate": 0.0002966666666666667,
3278
+ "loss": 0.0276,
3279
+ "step": 467
3280
+ },
3281
+ {
3282
+ "epoch": 0.24349635796045785,
3283
+ "grad_norm": 0.24184127151966095,
3284
+ "learning_rate": 0.0002961111111111111,
3285
+ "loss": 0.0419,
3286
+ "step": 468
3287
+ },
3288
+ {
3289
+ "epoch": 0.24401664932362122,
3290
+ "grad_norm": 0.1667579561471939,
3291
+ "learning_rate": 0.0002955555555555556,
3292
+ "loss": 0.0318,
3293
+ "step": 469
3294
+ },
3295
+ {
3296
+ "epoch": 0.2445369406867846,
3297
+ "grad_norm": 0.1575225442647934,
3298
+ "learning_rate": 0.000295,
3299
+ "loss": 0.0305,
3300
+ "step": 470
3301
+ },
3302
+ {
3303
+ "epoch": 0.24505723204994798,
3304
+ "grad_norm": 0.17671610414981842,
3305
+ "learning_rate": 0.00029444444444444445,
3306
+ "loss": 0.034,
3307
+ "step": 471
3308
+ },
3309
+ {
3310
+ "epoch": 0.24557752341311134,
3311
+ "grad_norm": 0.138526052236557,
3312
+ "learning_rate": 0.0002938888888888889,
3313
+ "loss": 0.0157,
3314
+ "step": 472
3315
+ },
3316
+ {
3317
+ "epoch": 0.2460978147762747,
3318
+ "grad_norm": 0.27597323060035706,
3319
+ "learning_rate": 0.0002933333333333333,
3320
+ "loss": 0.0381,
3321
+ "step": 473
3322
+ },
3323
+ {
3324
+ "epoch": 0.24661810613943808,
3325
+ "grad_norm": 0.15420523285865784,
3326
+ "learning_rate": 0.0002927777777777778,
3327
+ "loss": 0.0226,
3328
+ "step": 474
3329
+ },
3330
+ {
3331
+ "epoch": 0.24713839750260147,
3332
+ "grad_norm": 0.20491866767406464,
3333
+ "learning_rate": 0.0002922222222222222,
3334
+ "loss": 0.0161,
3335
+ "step": 475
3336
+ },
3337
+ {
3338
+ "epoch": 0.24765868886576484,
3339
+ "grad_norm": 0.14067193865776062,
3340
+ "learning_rate": 0.0002916666666666667,
3341
+ "loss": 0.0335,
3342
+ "step": 476
3343
+ },
3344
+ {
3345
+ "epoch": 0.2481789802289282,
3346
+ "grad_norm": 0.19436928629875183,
3347
+ "learning_rate": 0.00029111111111111113,
3348
+ "loss": 0.0435,
3349
+ "step": 477
3350
+ },
3351
+ {
3352
+ "epoch": 0.24869927159209157,
3353
+ "grad_norm": 0.19192419946193695,
3354
+ "learning_rate": 0.00029055555555555556,
3355
+ "loss": 0.031,
3356
+ "step": 478
3357
+ },
3358
+ {
3359
+ "epoch": 0.24921956295525494,
3360
+ "grad_norm": 0.1773335337638855,
3361
+ "learning_rate": 0.00029,
3362
+ "loss": 0.03,
3363
+ "step": 479
3364
+ },
3365
+ {
3366
+ "epoch": 0.2497398543184183,
3367
+ "grad_norm": 0.19989162683486938,
3368
+ "learning_rate": 0.00028944444444444444,
3369
+ "loss": 0.0232,
3370
+ "step": 480
3371
+ },
3372
+ {
3373
+ "epoch": 0.25026014568158167,
3374
+ "grad_norm": 0.1678122878074646,
3375
+ "learning_rate": 0.0002888888888888889,
3376
+ "loss": 0.0324,
3377
+ "step": 481
3378
+ },
3379
+ {
3380
+ "epoch": 0.25078043704474506,
3381
+ "grad_norm": 0.15694859623908997,
3382
+ "learning_rate": 0.0002883333333333333,
3383
+ "loss": 0.0186,
3384
+ "step": 482
3385
+ },
3386
+ {
3387
+ "epoch": 0.2513007284079084,
3388
+ "grad_norm": 0.1930493712425232,
3389
+ "learning_rate": 0.0002877777777777778,
3390
+ "loss": 0.0354,
3391
+ "step": 483
3392
+ },
3393
+ {
3394
+ "epoch": 0.2518210197710718,
3395
+ "grad_norm": 0.29496946930885315,
3396
+ "learning_rate": 0.00028722222222222224,
3397
+ "loss": 0.0499,
3398
+ "step": 484
3399
+ },
3400
+ {
3401
+ "epoch": 0.2523413111342352,
3402
+ "grad_norm": 0.1735425591468811,
3403
+ "learning_rate": 0.0002866666666666667,
3404
+ "loss": 0.0307,
3405
+ "step": 485
3406
+ },
3407
+ {
3408
+ "epoch": 0.25286160249739853,
3409
+ "grad_norm": 0.2830154299736023,
3410
+ "learning_rate": 0.0002861111111111111,
3411
+ "loss": 0.0375,
3412
+ "step": 486
3413
+ },
3414
+ {
3415
+ "epoch": 0.2533818938605619,
3416
+ "grad_norm": 0.13438007235527039,
3417
+ "learning_rate": 0.00028555555555555555,
3418
+ "loss": 0.0327,
3419
+ "step": 487
3420
+ },
3421
+ {
3422
+ "epoch": 0.25390218522372526,
3423
+ "grad_norm": 0.2650485336780548,
3424
+ "learning_rate": 0.000285,
3425
+ "loss": 0.0493,
3426
+ "step": 488
3427
+ },
3428
+ {
3429
+ "epoch": 0.25442247658688866,
3430
+ "grad_norm": 0.17854094505310059,
3431
+ "learning_rate": 0.0002844444444444444,
3432
+ "loss": 0.0325,
3433
+ "step": 489
3434
+ },
3435
+ {
3436
+ "epoch": 0.25494276795005205,
3437
+ "grad_norm": 0.14844731986522675,
3438
+ "learning_rate": 0.0002838888888888889,
3439
+ "loss": 0.0362,
3440
+ "step": 490
3441
+ },
3442
+ {
3443
+ "epoch": 0.2554630593132154,
3444
+ "grad_norm": 0.14285333454608917,
3445
+ "learning_rate": 0.00028333333333333335,
3446
+ "loss": 0.0312,
3447
+ "step": 491
3448
+ },
3449
+ {
3450
+ "epoch": 0.2559833506763788,
3451
+ "grad_norm": 0.3904401361942291,
3452
+ "learning_rate": 0.0002827777777777778,
3453
+ "loss": 0.0506,
3454
+ "step": 492
3455
+ },
3456
+ {
3457
+ "epoch": 0.2565036420395421,
3458
+ "grad_norm": 0.24693432450294495,
3459
+ "learning_rate": 0.00028222222222222223,
3460
+ "loss": 0.0356,
3461
+ "step": 493
3462
+ },
3463
+ {
3464
+ "epoch": 0.2570239334027055,
3465
+ "grad_norm": 0.1814284324645996,
3466
+ "learning_rate": 0.00028166666666666666,
3467
+ "loss": 0.0379,
3468
+ "step": 494
3469
+ },
3470
+ {
3471
+ "epoch": 0.2575442247658689,
3472
+ "grad_norm": 0.1869429349899292,
3473
+ "learning_rate": 0.0002811111111111111,
3474
+ "loss": 0.0384,
3475
+ "step": 495
3476
+ },
3477
+ {
3478
+ "epoch": 0.25806451612903225,
3479
+ "grad_norm": 0.13896095752716064,
3480
+ "learning_rate": 0.00028055555555555554,
3481
+ "loss": 0.0334,
3482
+ "step": 496
3483
+ },
3484
+ {
3485
+ "epoch": 0.25858480749219565,
3486
+ "grad_norm": 0.13905422389507294,
3487
+ "learning_rate": 0.00028000000000000003,
3488
+ "loss": 0.0276,
3489
+ "step": 497
3490
+ },
3491
+ {
3492
+ "epoch": 0.259105098855359,
3493
+ "grad_norm": 0.16455614566802979,
3494
+ "learning_rate": 0.00027944444444444447,
3495
+ "loss": 0.0338,
3496
+ "step": 498
3497
+ },
3498
+ {
3499
+ "epoch": 0.2596253902185224,
3500
+ "grad_norm": 0.22037294507026672,
3501
+ "learning_rate": 0.0002788888888888889,
3502
+ "loss": 0.0492,
3503
+ "step": 499
3504
+ },
3505
+ {
3506
+ "epoch": 0.2601456815816857,
3507
+ "grad_norm": 0.12147378921508789,
3508
+ "learning_rate": 0.00027833333333333334,
3509
+ "loss": 0.033,
3510
+ "step": 500
3511
+ }
3512
+ ],
3513
+ "logging_steps": 1,
3514
+ "max_steps": 1000,
3515
+ "num_input_tokens_seen": 0,
3516
+ "num_train_epochs": 1,
3517
+ "save_steps": 100,
3518
+ "stateful_callbacks": {
3519
+ "TrainerControl": {
3520
+ "args": {
3521
+ "should_epoch_stop": false,
3522
+ "should_evaluate": false,
3523
+ "should_log": false,
3524
+ "should_save": true,
3525
+ "should_training_stop": false
3526
+ },
3527
+ "attributes": {}
3528
+ }
3529
+ },
3530
+ "total_flos": 0.0,
3531
+ "train_batch_size": 128,
3532
+ "trial_name": null,
3533
+ "trial_params": null
3534
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2_5_VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 1003520,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2_5_VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "max_pixels": 1003520,
26
+ "min_pixels": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "processor_class": "Qwen2_5_VLProcessor",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2_5_VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 1003520,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2_5_VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "max_pixels": 1003520,
26
+ "min_pixels": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "processor_class": "Qwen2_5_VLProcessor",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-800/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }