MgGladys committed on
Commit
1ffaeb6
·
verified ·
1 Parent(s): 43feefe

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. adhoc/debug/iterable_dataset_drop_last_batch.py +55 -0
  2. adhoc/eval_mteb/e5mistral_prompt.py +143 -0
  3. adhoc/eval_mteb/merge_cqadupstack.py +80 -0
  4. adhoc/eval_mteb/mteb_utils.py +348 -0
  5. adhoc/eval_mteb/run_mteb.py +198 -0
  6. adhoc/gather_score_byckpt_aws.py +136 -0
  7. adhoc/hf_datasets.py +37 -0
  8. adhoc/merge_checkpoint.py +26 -0
  9. adhoc/plot.py +31 -0
  10. adhoc/plot2.py +47 -0
  11. adhoc/test_ddp.py +24 -0
  12. adhoc/testset_stats.py +66 -0
  13. adhoc/visual_doc/category_colpali_training.py +27 -0
  14. adhoc/visual_doc/category_visrag_training.py +38 -0
  15. adhoc/visual_doc/check_corpus.py +7 -0
  16. adhoc/visual_doc/mmdoclong-doc.py +124 -0
  17. adhoc/visual_doc/mmdoclong.py +124 -0
  18. adhoc/visual_doc/vidoseek.py +117 -0
  19. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/added_tokens.json +24 -0
  20. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/chat_template.jinja +7 -0
  21. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/merges.txt +0 -0
  22. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/preprocessor_config.json +29 -0
  23. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/special_tokens_map.json +31 -0
  24. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/tokenizer_config.json +208 -0
  25. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/trainer_state.json +734 -0
  26. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/vocab.json +0 -0
  27. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-400/special_tokens_map.json +31 -0
  28. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/added_tokens.json +24 -0
  29. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/chat_template.jinja +7 -0
  30. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/merges.txt +0 -0
  31. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/preprocessor_config.json +29 -0
  32. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/special_tokens_map.json +31 -0
  33. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/tokenizer_config.json +208 -0
  34. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/trainer_state.json +3534 -0
  35. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/vocab.json +0 -0
  36. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/added_tokens.json +24 -0
  37. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/chat_template.jinja +7 -0
  38. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/preprocessor_config.json +29 -0
  39. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/special_tokens_map.json +31 -0
  40. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/tokenizer_config.json +208 -0
  41. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/trainer_state.json +0 -0
  42. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/added_tokens.json +24 -0
  43. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/chat_template.jinja +7 -0
  44. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/merges.txt +0 -0
  45. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/preprocessor_config.json +29 -0
  46. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/special_tokens_map.json +31 -0
  47. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/tokenizer_config.json +208 -0
  48. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/trainer_state.json +0 -0
  49. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/vocab.json +0 -0
  50. experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-800/added_tokens.json +24 -0
adhoc/debug/iterable_dataset_drop_last_batch.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import Dataset
2
+ from datasets import interleave_datasets
3
+ from torch.utils.data import DataLoader
4
+
5
def convert_to_str(batch, dataset_name):
    """Prefix every value in the batched column 'a' with the dataset name.

    Mutates and returns the same batch dict (datasets .map contract).
    """
    prefixed = [f"{dataset_name}-{item}" for item in batch['a']]
    batch['a'] = prefixed
    return batch
8
+
9
def gen1():
    """Yield 24 toy rows {"a": 1..24} for the first demo dataset."""
    for value in range(1, 25):
        yield {"a": value}
12
+
13
def gen2():
    """Yield 24 toy rows {"a": 1..24} for the second demo dataset."""
    for value in range(1, 25):
        yield {"a": value}
16
+
17
# https://github.com/huggingface/datasets/issues/6565
if __name__ == '__main__':
    # Two shardable copies of the same 24-row toy dataset, tagged "a" and "b".
    dataset1 = Dataset.from_generator(gen1).to_iterable_dataset(num_shards=2)
    dataset2 = Dataset.from_generator(gen2).to_iterable_dataset(num_shards=2)
    # drop_last_batch=True drops the trailing partial batch of each 24-row
    # stream (batch_size=10), so only 20 rows per dataset survive the map.
    dataset1 = dataset1.map(lambda x: convert_to_str(x, dataset_name="a"), batched=True, batch_size=10, drop_last_batch=True)
    dataset2 = dataset2.map(lambda x: convert_to_str(x, dataset_name="b"), batched=True, batch_size=10, drop_last_batch=True)

    interleaved = interleave_datasets([dataset1, dataset2], stopping_strategy="all_exhausted")

    # The original repeated the same probe four times with num_workers 0..3;
    # fold the copy-pasted loops into one, preserving the exact output order.
    for num_workers in range(4):
        if num_workers > 0:
            print('=-' * 20)
        print(f"num_workers={num_workers}")
        loader = DataLoader(interleaved, batch_size=5, num_workers=num_workers)
        for i, b in enumerate(loader):
            print(i, b['a'])
adhoc/eval_mteb/e5mistral_prompt.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from typing import Dict
3
+
4
def load_e5mistral_prompt(task_name, task_type, *args, **kwargs):
    """Build the E5-Mistral prompt dict for a task.

    Returns {"q_prompt": <instruction prefix>, "d_prompt": ""} — only the
    query side is instructed; documents get no prefix.
    """
    if task_type is None:
        task_type = "Retrieval"
    # Strip size-variant suffixes so e.g. "Foo_small" resolves like "Foo".
    if task_name.endswith(("_small", "_s", "_xs")):
        task_name = task_name[:task_name.rindex("_")]
    # All per-forum CQADupstack subsets share one instruction.
    if task_name.startswith("cqadupstack-"):
        task_name = "cqadupstack"
    instruction = get_task_def_by_task_name_and_type(task_name=task_name, task_type=task_type)
    return {"q_prompt": get_detailed_instruct(instruction), "d_prompt": ""}
15
+
16
+
17
def get_task_def_by_task_name_and_type(task_type: str, task_name: str) -> str:
    """Return the one-line E5-Mistral instruction for a (task_name, task_type) pair.

    NOTE(review): parameter order is (task_type, task_name), the reverse of the
    mteb_utils variant of this function — call with keyword arguments.

    Raises:
        KeyError: if the task_type is handled but task_name is not mapped.
        ValueError: if no instruction is configured for the task_type.
    """
    # @ruimeng added
    if task_name.lower() in ['nli', 'allnli']:
        return "Retrieve a sentence that is semantically entailed by the given sentence."

    if task_type in ['STS', 'sts']:
        return "Retrieve semantically similar text."

    if task_type in ['Summarization', 'summarization']:
        return "Given a news summary, retrieve other semantically similar summaries"

    if task_type in ['BitextMining', 'bitextmining']:
        return "Retrieve parallel sentences."

    if task_type in ['Classification', 'classification']:
        task_name_to_instruct: Dict[str, str] = {
            'AmazonCounterfactualClassification': 'Classify a given Amazon customer review text as either counterfactual or not-counterfactual',
            'AmazonPolarityClassification': 'Classify Amazon reviews into positive or negative sentiment',
            'AmazonReviewsClassification': 'Classify the given Amazon review into its appropriate rating category',
            'AmazonReviewsPairClassification': 'Given an Amazon review, locate reviews within the same rating category',
            'Banking77Classification': 'Given a online banking query, find the corresponding intents',
            'EmotionClassification': 'Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise',
            'EmotionPairClassification': 'Given an Twitter message, locate message within the same emotion category',
            'ImdbClassification': 'Classify the sentiment expressed in the given movie review text from the IMDB dataset',
            'MassiveIntentClassification': 'Given a user utterance as query, find the user intents',
            'MassiveScenarioClassification': 'Given a user utterance as query, find the user scenarios',
            'MTOPDomainClassification': 'Classify the intent domain of the given utterance in task-oriented conversation',
            'MTOPIntentClassification': 'Classify the intent of the given utterance in task-oriented conversation',
            'MTOPIntentPairClassification': 'Given an utterance in task-oriented conversation, locate utterance within the same intent category',
            'ToxicConversationsClassification': 'Classify the given comments as either toxic or not toxic',
            'ToxicConversationsPairClassification': 'Given an comment as toxic or non-toxic, locate comments within the same category',
            'TweetSentimentExtractionClassification': 'Classify the sentiment of a given tweet as either positive, negative, or neutral',
            'TweetSentimentPairClassification': 'Given an comment as either positive, negative, or neutral, locate comments within the same category',
        }
        return task_name_to_instruct[task_name]

    if task_type in ['Clustering', 'clustering']:
        task_name_to_instruct: Dict[str, str] = {
            'ArxivClusteringP2P': 'Identify the main and secondary category of Arxiv papers based on the titles and abstracts',
            'ArxivClusteringS2S': 'Identify the main and secondary category of Arxiv papers based on the titles',
            'BiorxivClusteringP2P': 'Identify the main category of Biorxiv papers based on the titles and abstracts',
            'BiorxivClusteringS2S': 'Identify the main category of Biorxiv papers based on the titles',
            'MedrxivClusteringP2P': 'Identify the main category of Medrxiv papers based on the titles and abstracts',
            'MedrxivClusteringS2S': 'Identify the main category of Medrxiv papers based on the titles',
            'RedditClustering': 'Identify the topic or theme of Reddit posts based on the titles',
            'RedditClusteringP2P': 'Identify the topic or theme of Reddit posts based on the titles and posts',
            'StackExchangeClustering': 'Identify the topic or theme of StackExchange posts based on the titles',
            'StackExchangeClusteringP2P': 'Identify the topic or theme of StackExchange posts based on the given paragraphs',
            'TwentyNewsgroupsClustering': 'Identify the topic or theme of the given news articles',
        }
        return task_name_to_instruct[task_name]

    if task_type in ['Reranking', 'PairClassification', 'reranking', 'pairclassification']:
        task_name_to_instruct: Dict[str, str] = {
            'AskUbuntuDupQuestions': 'Retrieve duplicate questions from AskUbuntu forum',
            'MindSmallReranking': 'Retrieve relevant news articles based on user browsing history',
            'SciDocsRR': 'Given a title of a scientific paper, retrieve the titles of other relevant papers',
            'StackOverflowDupQuestions': 'Retrieve duplicate questions from StackOverflow forum',
            'SprintDuplicateQuestions': 'Retrieve duplicate questions from Sprint forum',
            'TwitterSemEval2015': 'Retrieve tweets that are semantically similar to the given tweet',
            'TwitterURLCorpus': 'Retrieve tweets that are semantically similar to the given tweet',
        }
        return task_name_to_instruct[task_name]

    if task_type in ['Retrieval', 'retrieval']:
        # All CQADupstack subsets (any casing/suffix) share one instruction.
        if task_name.lower().startswith('cqadupstack'):
            return 'Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question'

        task_name_to_instruct: Dict[str, str] = {
            'ArguAna': 'Given a claim, find documents that refute the claim',
            'ClimateFEVER': 'Given a claim about climate change, retrieve documents that support or refute the claim',
            'DBPedia': 'Given a query, retrieve relevant entity descriptions from DBPedia',
            'FEVER': 'Given a claim, retrieve documents that support or refute the claim',
            'FiQA2018': 'Given a financial question, retrieve user replies that best answer the question',
            'HotpotQA': 'Given a multi-hop question, retrieve documents that can help answer the question',
            'MSMARCO': 'Given a web search query, retrieve relevant passages that answer the query',
            'NFCorpus': 'Given a question, retrieve relevant documents that best answer the question',
            'NQ': 'Given a question, retrieve Wikipedia passages that answer the question',
            'QuoraRetrieval': 'Given a question, retrieve questions that are semantically equivalent to the given question',
            'SCIDOCS': 'Given a scientific paper title, retrieve paper abstracts that are cited by the given paper',
            'SciFact': 'Given a scientific claim, retrieve documents that support or refute the claim',
            'Touche2020': 'Given a question, retrieve detailed and persuasive arguments that answer the question',
            'TRECCOVID': 'Given a query on COVID-19, retrieve documents that answer the query',
            'InstructConversation': "Given a question asked by user, the assistant answers",
            'MrTydi': "Given a question, retrieve Wikipedia passages that answer the question",
            "ChatgptShortLong": "Given a query, retrieve passages that answer the query",
            # E5 public training
            "msmarco_document": "Given a web search query, retrieve relevant documents that answer the query",
            "msmarco_passage": "Given a web search query, retrieve relevant passages that answer the query",
            "allnli": "Given a web search query, retrieve relevant documents that answer the query",
            "dureader": "Given a Chinese search query, retrieve web passages that answer the question",
            "eli5_question_answer": "Provided a user question, retrieve the highest voted answers on Reddit ELI5 forum",
            "fever": "Given a claim, retrieve documents that support or refute the claim",
            "hotpot_qa": "Given a multi-hop question, retrieve documents that can help answer the question",
            "miracl": "Given a question, retrieve Wikipedia passages that answer the question",
            "mrtydi": "Given a question, retrieve Wikipedia passages that answer the question",
            "nq": "Given a question, retrieve Wikipedia passages that answer the question",
            "quora_duplicates": "Given a question, retrieve questions that are semantically equivalent to the given question",
            "squad": "Retrieve Wikipedia passages that answer the question",
            "t2ranking": "Given a Chinese search query, retrieve web passages that answer the question",
            # NOTE(review): stray trailing apostrophe inside this instruction —
            # confirm whether intentional before changing (prompts may be baked
            # into trained checkpoints).
            "trivia_qa": "Retrieve Wikipedia passages that answer the question'",
        }

        # add lower case keys to match some beir names
        task_name_to_instruct.update({k.lower(): v for k, v in task_name_to_instruct.items()})
        # other cases where lower case match still doesn't work
        task_name_to_instruct['trec-covid'] = task_name_to_instruct['TRECCOVID']
        task_name_to_instruct['climate-fever'] = task_name_to_instruct['ClimateFEVER']
        task_name_to_instruct['dbpedia-entity'] = task_name_to_instruct['DBPedia']
        task_name_to_instruct['webis-touche2020'] = task_name_to_instruct['Touche2020']
        task_name_to_instruct['fiqa'] = task_name_to_instruct['FiQA2018']
        task_name_to_instruct['quora'] = task_name_to_instruct['QuoraRetrieval']
        task_name_to_instruct['instructed-conversation'] = task_name_to_instruct['InstructConversation']

        # for miracl evaluation
        task_name_to_instruct['miracl'] = 'Given a question, retrieve Wikipedia passages that answer the question'

        return task_name_to_instruct[task_name]

    raise ValueError(f"No instruction config for task {task_name} with type {task_type}")
137
+
138
+
139
def get_detailed_instruct(task_description: str) -> str:
    """Wrap a task description in the E5-Mistral query-instruction template.

    An empty/falsy description yields an empty prompt (no instruction prefix).
    """
    if task_description:
        return 'Instruct: {}\nQuery: '.format(task_description)
    return ''
adhoc/eval_mteb/merge_cqadupstack.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Merges CQADupstack subset results
2
+ Usage: python merge_cqadupstack.py path_to_results_folder
3
+
4
+ Adapted from: https://github.com/embeddings-benchmark/mteb/blob/main/scripts/merge_cqadupstack.py
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import glob
10
+ import json
11
+ import logging
12
+ import os
13
+ import sys
14
+
15
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The 12 per-forum CQADupstack subsets whose scores are merged into one result.
TASK_LIST_CQA = [
    "CQADupstackAndroidRetrieval",
    "CQADupstackEnglishRetrieval",
    "CQADupstackGamingRetrieval",
    "CQADupstackGisRetrieval",
    "CQADupstackMathematicaRetrieval",
    "CQADupstackPhysicsRetrieval",
    "CQADupstackProgrammersRetrieval",
    "CQADupstackStatsRetrieval",
    "CQADupstackTexRetrieval",
    "CQADupstackUnixRetrieval",
    "CQADupstackWebmastersRetrieval",
    "CQADupstackWordpressRetrieval",
]

# Metadata keys copied through rather than averaged across subsets
# (evaluation_time is summed instead — see the merge loop below).
NOAVG_KEYS = [
    "hf_subset",
    "languages",
    "evaluation_time",
    "mteb_version",
    "mteb_dataset_name",
    "dataset_revision",
]
41
+
42
+
43
# The module docstring advertises "python merge_cqadupstack.py path_to_results_folder";
# honor argv[1] when given (sys was imported but unused) and keep the previously
# hard-coded folder as a backward-compatible fallback.
results_folder = sys.argv[1] if len(sys.argv) > 1 else \
    '/export/xgen-embedding/release/SFR-Embedding-Mistral-v2/RC3/eval_output/public_mteb/beir'
# Ensure at least 1 character btw CQADupstack & Retrieval
files = glob.glob(f'{results_folder.rstrip("/")}/CQADupstack*?*Retrieval.json')

logger.info(f"Found CQADupstack files {len(files)}/{len(TASK_LIST_CQA)}: \n{files}")

if len(files) == len(TASK_LIST_CQA):
    # all_results accumulates, per split, the equal-weight average of every
    # metric over the 12 subsets (evaluation_time is summed, NOAVG_KEYS kept).
    all_results = {}
    for file_name in files:
        with open(file_name, "r", encoding="utf-8") as f:
            results = json.load(f)
        for split, split_results in results.items():
            # Non-eval entries (metadata) are copied through as-is.
            if split not in ("train", "validation", "dev", "test"):
                all_results[split] = split_results
                continue
            all_results.setdefault(split, {})
            for metric, score in split_results.items():
                all_results[split].setdefault(metric, 0)
                if metric == "evaluation_time":
                    # Total wall time is the sum over subsets.
                    score = all_results[split][metric] + score
                elif metric not in NOAVG_KEYS:
                    # Each subset contributes 1/len(TASK_LIST_CQA) of its score.
                    score = all_results[split][metric] + score / len(TASK_LIST_CQA)
                all_results[split][metric] = score

    # Copy the last subset's payload instead of aliasing it before overwriting
    # keys (the original mutated `results` in place).
    final_results = dict(results)
    final_results['scores'] = all_results
    final_results["task_name"] = "CQADupstackRetrieval"
    final_results["evaluation_time"] = None

    logger.info(all_results)
    logger.info(f"Saving results to {os.path.join(results_folder, 'CQADupstackRetrieval.json')}")
    with open(os.path.join(results_folder, "CQADupstackRetrieval.json"), "w", encoding="utf-8") as f:
        json.dump(final_results, f, indent=4)
else:
    found_names = {x.split('/')[-1].split('.')[0] for x in files}
    logger.warning(
        f"Got {len(files)}, but expected {len(TASK_LIST_CQA)} files. Missing: {set(TASK_LIST_CQA) - found_names}; Too much: {found_names - set(TASK_LIST_CQA)}"
    )
adhoc/eval_mteb/mteb_utils.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+
3
+ import torch
4
+ import logging
5
+
6
+ from torch import Tensor
7
+ from transformers import PreTrainedTokenizerFast, BatchEncoding
8
+ from typing import Mapping, Dict, List
9
+
10
+ import torch.distributed as dist
11
+
12
+
13
+ def _setup_logger():
14
+ log_format = logging.Formatter("[%(asctime)s %(levelname)s] %(message)s")
15
+ logger = logging.getLogger()
16
+ logger.setLevel(logging.INFO)
17
+
18
+ console_handler = logging.StreamHandler()
19
+ console_handler.setFormatter(log_format)
20
+ logger.handlers = [console_handler]
21
+
22
+ return logger
23
+
24
+
25
+ logger = _setup_logger()
26
+
27
+
28
def str2bool(v):
    """Parse a command-line flag into a bool (argparse `type=` helper).

    Accepts bools unchanged; raises ArgumentTypeError on unrecognized strings.
    """
    if isinstance(v, bool):
        return v
    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
37
+
38
+
39
def move_to_cuda(sample):
    """Recursively move every tensor in a nested structure onto the GPU.

    Dicts/lists/tuples/Mappings are rebuilt with their tensors transferred
    (non-blocking); non-tensor leaves pass through unchanged. An empty input
    returns {} regardless of its original container type.
    """
    if len(sample) == 0:
        return {}

    def _transfer(obj):
        if torch.is_tensor(obj):
            return obj.cuda(non_blocking=True)
        if isinstance(obj, dict):
            return {key: _transfer(val) for key, val in obj.items()}
        if isinstance(obj, list):
            return [_transfer(item) for item in obj]
        if isinstance(obj, tuple):
            return tuple(_transfer(item) for item in obj)
        # Non-dict Mappings are rebuilt as their own type (checked after dict,
        # mirroring the original branch order).
        if isinstance(obj, Mapping):
            return type(obj)({key: _transfer(val) for key, val in obj.items()})
        return obj

    return _transfer(sample)
58
+
59
+
60
def pool(last_hidden_states: Tensor,
         attention_mask: Tensor,
         pool_type: str) -> Tensor:
    """Pool per-token hidden states into one embedding per sequence.

    Args:
        last_hidden_states: (batch, seq, dim) token embeddings.
        attention_mask: (batch, seq) 1 for real tokens, 0 for padding.
        pool_type: "avg", "weightedavg", "cls", "last"/"eos", or "none".

    Returns:
        (batch, dim) pooled embeddings, or the masked (batch, seq, dim)
        hidden states when pool_type is "none".

    Raises:
        ValueError: on an unsupported pool_type.
    """
    # Zero out padding positions so sums below ignore them.
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)

    if pool_type == "avg":
        emb = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
    elif pool_type == "weightedavg":  # position-weighted mean pooling from SGPT (https://arxiv.org/abs/2202.08904)
        # Fix: compute the position weights out-of-place; the original used
        # `attention_mask *= ...`, silently mutating the caller's mask tensor.
        weights = attention_mask * attention_mask.cumsum(dim=1)  # [0,1,1,1,0,0] -> [0,1,2,3,0,0]
        s = torch.sum(last_hidden * weights.unsqueeze(-1).float(), dim=1)
        d = weights.sum(dim=1, keepdim=True).float()
        emb = s / d
    elif pool_type == "cls":
        emb = last_hidden[:, 0]
    elif pool_type == "last" or pool_type == "eos":
        # If every sequence is attended at the final position, the batch is
        # left-padded and the last column holds each sequence's final token.
        left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
        if left_padding:
            emb = last_hidden[:, -1]
        else:
            sequence_lengths = attention_mask.sum(dim=1) - 1
            batch_size = last_hidden.shape[0]
            emb = last_hidden[torch.arange(batch_size, device=last_hidden.device), sequence_lengths]
    elif pool_type.lower() == "none":
        emb = last_hidden
    else:
        raise ValueError(f"pool_type {pool_type} not supported")

    return emb
88
+
89
+
90
def create_batch_dict(tokenizer: PreTrainedTokenizerFast, input_texts: List[str], always_add_eos: bool, max_length: int = 512) -> BatchEncoding:
    """Tokenize input_texts into a padded pt batch, optionally forcing an EOS.

    When always_add_eos is set, texts are truncated to max_length - 1 so the
    appended eos_token_id always fits, then padded in a second pass.
    """
    if always_add_eos:
        # First pass: no padding/attention mask, leave room for the EOS token.
        encoded = tokenizer(
            input_texts,
            max_length=max_length - 1,
            return_token_type_ids=False,
            return_attention_mask=False,
            padding=False,
            truncation=True
        )

        # append eos_token_id to every input_ids
        encoded['input_ids'] = [ids + [tokenizer.eos_token_id] for ids in encoded['input_ids']]

        # Second pass: pad the eos-terminated sequences into tensors.
        return tokenizer.pad(
            encoded,
            padding=True,
            pad_to_multiple_of=8,
            return_attention_mask=True,
            return_tensors="pt",
        )

    return tokenizer(
        input_texts,
        max_length=max_length,
        padding=True,
        pad_to_multiple_of=8,
        return_token_type_ids=False,
        truncation=True,
        return_tensors='pt'
    )
121
+
122
+
123
+ def get_task_def_by_task_name_and_type(task_name: str, task_type: str) -> str:
124
+ if task_type in ['STS']:
125
+ return "Retrieve semantically similar text."
126
+
127
+ if task_type in ['Summarization']:
128
+ return "Given a news summary, retrieve other semantically similar summaries"
129
+
130
+ if task_type in ['BitextMining']:
131
+ return "Retrieve parallel sentences."
132
+
133
+ if task_type in ['Classification']:
134
+ task_name_to_instruct: Dict[str, str] = {
135
+ 'AmazonCounterfactualClassification': 'Classify a given Amazon customer review text as either counterfactual or not-counterfactual',
136
+ 'AmazonPolarityClassification': 'Classify Amazon reviews into positive or negative sentiment',
137
+ 'AmazonReviewsClassification': 'Classify the given Amazon review into its appropriate rating category',
138
+ 'Banking77Classification': 'Given a online banking query, find the corresponding intents',
139
+ 'EmotionClassification': 'Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise',
140
+ 'ImdbClassification': 'Classify the sentiment expressed in the given movie review text from the IMDB dataset',
141
+ 'MassiveIntentClassification': 'Given a user utterance as query, find the user intents',
142
+ 'MassiveScenarioClassification': 'Given a user utterance as query, find the user scenarios',
143
+ 'MTOPDomainClassification': 'Classify the intent domain of the given utterance in task-oriented conversation',
144
+ 'MTOPIntentClassification': 'Classify the intent of the given utterance in task-oriented conversation',
145
+ 'ToxicConversationsClassification': 'Classify the given comments as either toxic or not toxic',
146
+ 'TweetSentimentExtractionClassification': 'Classify the sentiment of a given tweet as either positive, negative, or neutral',
147
+ # C-MTEB eval instructions
148
+ 'TNews': 'Classify the fine-grained category of the given news title',
149
+ 'IFlyTek': 'Given an App description text, find the appropriate fine-grained category',
150
+ 'MultilingualSentiment': 'Classify sentiment of the customer review into positive, neutral, or negative',
151
+ 'JDReview': 'Classify the customer review for iPhone on e-commerce platform into positive or negative',
152
+ 'OnlineShopping': 'Classify the customer review for online shopping into positive or negative',
153
+ 'Waimai': 'Classify the customer review from a food takeaway platform into positive or negative',
154
+ }
155
+ return task_name_to_instruct[task_name]
156
+
157
+ if task_type in ['Clustering']:
158
+ task_name_to_instruct: Dict[str, str] = {
159
+ 'ArxivClusteringP2P': 'Identify the main and secondary category of Arxiv papers based on the titles and abstracts',
160
+ 'ArxivClusteringS2S': 'Identify the main and secondary category of Arxiv papers based on the titles',
161
+ 'BiorxivClusteringP2P': 'Identify the main category of Biorxiv papers based on the titles and abstracts',
162
+ 'BiorxivClusteringS2S': 'Identify the main category of Biorxiv papers based on the titles',
163
+ 'MedrxivClusteringP2P': 'Identify the main category of Medrxiv papers based on the titles and abstracts',
164
+ 'MedrxivClusteringS2S': 'Identify the main category of Medrxiv papers based on the titles',
165
+ 'RedditClustering': 'Identify the topic or theme of Reddit posts based on the titles',
166
+ 'RedditClusteringP2P': 'Identify the topic or theme of Reddit posts based on the titles and posts',
167
+ 'StackExchangeClustering': 'Identify the topic or theme of StackExchange posts based on the titles',
168
+ 'StackExchangeClusteringP2P': 'Identify the topic or theme of StackExchange posts based on the given paragraphs',
169
+ 'TwentyNewsgroupsClustering': 'Identify the topic or theme of the given news articles',
170
+ # C-MTEB eval instructions
171
+ 'CLSClusteringS2S': 'Identify the main category of scholar papers based on the titles',
172
+ 'CLSClusteringP2P': 'Identify the main category of scholar papers based on the titles and abstracts',
173
+ 'ThuNewsClusteringS2S': 'Identify the topic or theme of the given news articles based on the titles',
174
+ 'ThuNewsClusteringP2P': 'Identify the topic or theme of the given news articles based on the titles and contents',
175
+ }
176
+ return task_name_to_instruct[task_name]
177
+
178
+ if task_type in ['Reranking', 'PairClassification']:
179
+ task_name_to_instruct: Dict[str, str] = {
180
+ 'AskUbuntuDupQuestions': 'Retrieve duplicate questions from AskUbuntu forum',
181
+ 'MindSmallReranking': 'Retrieve relevant news articles based on user browsing history',
182
+ 'SciDocsRR': 'Given a title of a scientific paper, retrieve the titles of other relevant papers',
183
+ 'StackOverflowDupQuestions': 'Retrieve duplicate questions from StackOverflow forum',
184
+ 'SprintDuplicateQuestions': 'Retrieve duplicate questions from Sprint forum',
185
+ 'TwitterSemEval2015': 'Retrieve tweets that are semantically similar to the given tweet',
186
+ 'TwitterURLCorpus': 'Retrieve tweets that are semantically similar to the given tweet',
187
+ # C-MTEB eval instructions
188
+ 'T2Reranking': 'Given a Chinese search query, retrieve web passages that answer the question',
189
+ 'MMarcoReranking': 'Given a Chinese search query, retrieve web passages that answer the question',
190
+ 'CMedQAv1': 'Given a Chinese community medical question, retrieve replies that best answer the question',
191
+ 'CMedQAv2': 'Given a Chinese community medical question, retrieve replies that best answer the question',
192
+ 'Ocnli': 'Retrieve semantically similar text.',
193
+ 'Cmnli': 'Retrieve semantically similar text.',
194
+ }
195
+ return task_name_to_instruct[task_name]
196
+
197
+ if task_type in ['Retrieval']:
198
+ if task_name.lower().startswith('cqadupstack'):
199
+ return 'Given a question, retrieve detailed question descriptions from Stackexchange that are duplicates to the given question'
200
+
201
+ task_name_to_instruct: Dict[str, str] = {
202
+ 'ArguAna': 'Given a claim, find documents that refute the claim',
203
+ 'ClimateFEVER': 'Given a claim about climate change, retrieve documents that support or refute the claim',
204
+ 'DBPedia': 'Given a query, retrieve relevant entity descriptions from DBPedia',
205
+ 'FEVER': 'Given a claim, retrieve documents that support or refute the claim',
206
+ 'FiQA2018': 'Given a financial question, retrieve user replies that best answer the question',
207
+ 'HotpotQA': 'Given a multi-hop question, retrieve documents that can help answer the question',
208
+ 'MSMARCO': 'Given a web search query, retrieve relevant passages that answer the query',
209
+ 'NFCorpus': 'Given a question, retrieve relevant documents that best answer the question',
210
+ 'NQ': 'Given a question, retrieve Wikipedia passages that answer the question',
211
+ 'QuoraRetrieval': 'Given a question, retrieve questions that are semantically equivalent to the given question',
212
+ 'SCIDOCS': 'Given a scientific paper title, retrieve paper abstracts that are cited by the given paper',
213
+ 'SciFact': 'Given a scientific claim, retrieve documents that support or refute the claim',
214
+ 'Touche2020': 'Given a question, retrieve detailed and persuasive arguments that answer the question',
215
+ 'TRECCOVID': 'Given a query on COVID-19, retrieve documents that answer the query',
216
+ # C-MTEB eval instructions
217
+ 'T2Retrieval': 'Given a Chinese search query, retrieve web passages that answer the question',
218
+ 'MMarcoRetrieval': 'Given a web search query, retrieve relevant passages that answer the query',
219
+ 'DuRetrieval': 'Given a Chinese search query, retrieve web passages that answer the question',
220
+ 'CovidRetrieval': 'Given a question on COVID-19, retrieve news articles that answer the question',
221
+ 'CmedqaRetrieval': 'Given a Chinese community medical question, retrieve replies that best answer the question',
222
+ 'EcomRetrieval': 'Given a user query from an e-commerce website, retrieve description sentences of relevant products',
223
+ 'MedicalRetrieval': 'Given a medical question, retrieve user replies that best answer the question',
224
+ 'VideoRetrieval': 'Given a video search query, retrieve the titles of relevant videos',
225
+ }
226
+
227
+ # add lower case keys to match some beir names
228
+ task_name_to_instruct.update({k.lower(): v for k, v in task_name_to_instruct.items()})
229
+ # other cases where lower case match still doesn't work
230
+ task_name_to_instruct['trec-covid'] = task_name_to_instruct['TRECCOVID']
231
+ task_name_to_instruct['climate-fever'] = task_name_to_instruct['ClimateFEVER']
232
+ task_name_to_instruct['dbpedia-entity'] = task_name_to_instruct['DBPedia']
233
+ task_name_to_instruct['webis-touche2020'] = task_name_to_instruct['Touche2020']
234
+ task_name_to_instruct['fiqa'] = task_name_to_instruct['FiQA2018']
235
+ task_name_to_instruct['quora'] = task_name_to_instruct['QuoraRetrieval']
236
+
237
+ # for miracl evaluation
238
+ task_name_to_instruct['miracl'] = 'Given a question, retrieve Wikipedia passages that answer the question'
239
+
240
+ return task_name_to_instruct[task_name]
241
+
242
+ raise ValueError(f"No instruction config for task {task_name} with type {task_type}")
243
+
244
+
245
def get_detailed_instruct(task_description: str) -> str:
    """Wrap a task description in the E5-Mistral query-side instruction template.

    A falsy (empty) description produces an empty string, i.e. no prefix at all.
    """
    if task_description:
        return f'Instruct: {task_description}\nQuery: '
    return ''
250
+
251
+
252
def input_transform_func(tokenizer: PreTrainedTokenizerFast,
                         examples: Dict[str, List],
                         always_add_eos: bool,
                         max_length: int,
                         ) -> BatchEncoding:
    """Tokenize a batch of raw texts for embedding.

    With ``always_add_eos=False`` the texts are tokenized with padding and
    truncation in one call. Otherwise tokenization reserves one position of
    headroom (``max_length - 1``), skips padding (left to the collator), and an
    EOS token id is appended to every sequence that does not already end in one.
    """
    texts = examples['input_texts']
    if not always_add_eos:
        return tokenizer(
            texts,
            max_length=max_length if max_length else None,
            padding=True,
            return_token_type_ids=False,
            truncation=True
        )

    batch_dict = tokenizer(
        texts,
        max_length=max_length - 1 if max_length else None,
        return_token_type_ids=False,
        return_attention_mask=False,
        padding=False,
        truncation=True
    )
    # Force a trailing EOS on every sequence; some texts in FiQA are empty
    # and tokenize to an empty id list, which becomes a lone EOS.
    eos = tokenizer.eos_token_id
    fixed_ids = []
    for ids in batch_dict['input_ids']:
        if not ids:
            fixed_ids.append([eos])
        elif ids[-1] != eos:
            fixed_ids.append(ids + [eos])
        else:
            fixed_ids.append(ids)
    batch_dict['input_ids'] = fixed_ids

    return batch_dict
286
+
287
+
288
def get_rank():
    """Return this process's distributed rank, or 0 outside a process group."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return 0
294
+
295
+
296
def is_main():
    """True on the rank-0 (main) process; also True when not running distributed."""
    return not get_rank()
298
+
299
+
300
@torch.no_grad()
def varsize_gather_nograd(x: torch.Tensor):
    """Gather tensors of different first-dimension sizes from all ranks.

    Each rank's ``x`` is right-padded to the largest first-dimension size in
    the group, all-gathered, trimmed back to each rank's true size, and
    concatenated in rank order. Returns ``x`` unchanged when not distributed.
    """
    if not dist.is_initialized():
        return x

    # determine the max first-dim size across all ranks
    size = torch.tensor([x.shape[0]], device=x.device, dtype=torch.int)
    allsizes = [torch.zeros_like(size) for _ in range(dist.get_world_size())]
    dist.all_gather(allsizes, size)
    max_size = max([size.cpu().max() for size in allsizes])

    # pad the local tensor so all_gather sees identically-shaped tensors
    padded = torch.empty(max_size, *x.shape[1:], dtype=x.dtype, device=x.device)
    padded[: x.shape[0]] = x
    output = [torch.zeros_like(padded) for _ in range(dist.get_world_size())]
    dist.all_gather(output, padded)

    # drop each rank's padding rows, then concatenate in rank order
    output = [tensor[: allsizes[k]] for k, tensor in enumerate(output)]
    output = torch.cat(output, dim=0)

    return output
321
+
322
# Special-token string literals keyed by model family. All families define an
# EOS token; most also define BOS, and 'llama' additionally provides pad/mask
# tokens. NOTE(review): keys presumably match backbone identifiers used by the
# callers of this table — confirm against the model-loading code.
SPECIAL_TOKENS = {
    't5': {
        'eos': '</s>',
    },
    'xlm-r': {
        'bos': '<s>',
        'eos': '</s>',
    },
    'mistral': {
        'bos': '<s>',
        'eos': '</s>',
    },
    'llama': {
        'bos': '<|begin_of_text|>',
        'eos': '<|end_of_text|>',
        'pad': '<|finetune_right_pad_id|>',
        'mask': "<|reserved_special_token_0|>",
    },
    'nvidia/NV-Embed-v2': {
        'bos': '<s>',
        'eos': '</s>',
    },
    'qwen2': {
        'bos': '<|im_start|>',
        'eos': '<|im_end|>',
    }
}
adhoc/eval_mteb/run_mteb.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import torch
3
+ import torch.distributed as dist
4
+
5
+ import tqdm
6
+ import numpy as np
7
+ import os
8
+
9
+ from functools import partial
10
+ from torch.utils.data import DataLoader
11
+ from datasets import Dataset
12
+ from transformers import AutoTokenizer, AutoModel, DataCollatorWithPadding
13
+ from mteb import MTEB
14
+
15
+ from adhoc.eval_mteb.e5mistral_prompt import load_e5mistral_prompt
16
+ from src.arguments import ModelArguments, DataArguments, TrainingArguments, MTEBArguments
17
+ from transformers import HfArgumentParser, AutoTokenizer
18
+
19
+ from src.model.model_token_pooling import MMEBModel
20
+ from adhoc.eval_mteb.mteb_utils import logger, pool, move_to_cuda, input_transform_func, varsize_gather_nograd, is_main, str2bool
21
+ from src.model.processor import load_processor
22
+
23
# Cap BLAS/OpenMP thread pools for clustering tasks (otherwise OpenBLAS warns:
# "precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata").
# NOTE: setting these after library import may be ineffective here — prefer
# exporting them as real environment variables.
default_n_threads = 1
for _threads_var in ('OPENBLAS_NUM_THREADS', 'MKL_NUM_THREADS', 'OMP_NUM_THREADS'):
    os.environ[_threads_var] = str(default_n_threads)
28
+
29
+
30
# English MTEB task names supported by this script; used to filter the tasks
# returned by MTEB before evaluation. Grouped by task type below.
MTEB_TASKS_EN = [
    # Classification
    "AmazonCounterfactualClassification", "AmazonPolarityClassification", "AmazonReviewsClassification", "Banking77Classification", "EmotionClassification", "ImdbClassification", "MassiveIntentClassification", "MassiveScenarioClassification", "MTOPDomainClassification", "MTOPIntentClassification", "ToxicConversationsClassification", "TweetSentimentExtractionClassification",
    # Clustering
    "ArxivClusteringP2P", "ArxivClusteringS2S", "BiorxivClusteringP2P", "BiorxivClusteringS2S", "MedrxivClusteringP2P", "MedrxivClusteringS2S", "RedditClustering", "RedditClusteringP2P", "StackExchangeClustering", "StackExchangeClusteringP2P", "TwentyNewsgroupsClustering",
    # Pair classification
    "SprintDuplicateQuestions", "TwitterSemEval2015", "TwitterURLCorpus",
    # Reranking
    "AskUbuntuDupQuestions", "MindSmallReranking", "SciDocsRR", "StackOverflowDupQuestions",
    # Retrieval
    "ArguAna", "ClimateFEVER", "CQADupstackAndroidRetrieval", "DBPedia", "FEVER", "FiQA2018", "HotpotQA", "MSMARCO", "NFCorpus", "NQ", "QuoraRetrieval", "SCIDOCS", "SciFact", "TRECCOVID", "Touche2020",
    # STS
    "BIOSSES", "SICK-R", "STS12", "STS13", "STS14", "STS15", "STS16", "STS17", "STS22", "STSBenchmark",
    # Summarization
    "SummEval"
]
39
+
40
+
41
class DenseEncoder(torch.nn.Module):
    """MTEB-compatible wrapper around an MMEB embedding model.

    Exposes ``encode_queries`` / ``encode_corpus`` (the interface MTEB retrieval
    tasks expect) on top of a single ``encode`` routine that shards inputs
    across DDP ranks, tokenizes with a forced EOS suffix, and gathers the
    per-rank embeddings back into one array.
    """

    def __init__(self, model_args, mteb_args, max_length=512, **kwargs):
        super().__init__()
        self.max_length = max_length
        self.pool_type = model_args.pooling

        processor = load_processor(model_args)
        model = MMEBModel.load(model_args)

        # right padding so the appended EOS stays the last non-pad token
        processor.tokenizer.padding_side = "right"
        model.eval()
        model = model.to(mteb_args.device, dtype=torch.bfloat16)
        self.encoder = model
        self.tokenizer = processor.tokenizer
        self.processor = processor

        self.batch_size_per_device = mteb_args.batch_size_per_device
        self.gpu_count = torch.cuda.device_count()
        self.encoder.eval()
        self.encoder.cuda()
        # per-task instruction prefixes, assigned via set_prompt()
        self.query_prompt = ""
        self.doc_prompt = ""
        # separator inserted between a document's title and body text
        self.sep = ". "

        # Outside DDP, fall back to DataParallel across all visible GPUs.
        if not torch.distributed.is_initialized() and self.gpu_count > 1:
            self.encoder = torch.nn.DataParallel(self.encoder)

    def encode_queries(self, sentences, **kwargs) -> np.ndarray:
        """Encode query-side texts using the query instruction prefix."""
        return self.encode(sentences, self.query_prompt, is_query=True, **kwargs)

    def encode_corpus(self, sentences, **kwargs) -> np.ndarray:
        """Encode corpus entries (dicts with 'text' and optional 'title')."""
        return self.encode(sentences, self.doc_prompt, is_query=False, **kwargs)

    @torch.no_grad()
    def encode(self, inputs, prompt=None, is_query=True, **kwargs) -> np.ndarray:
        """ Returns embeddings for the given sentences.
        Args:
            inputs (`List[str]` or `List[dict]`): sentences, or corpus dicts
                with 'text' and an optional 'title'
            prompt (`str`): instruction prefix prepended to every input
            is_query (`bool`): whether inputs are queries (enables the fallback
                to ``self.query_prompt`` when no prompt is passed)

        Returns:
            `np.ndarray`: one embedding row per input, in input order
        """
        # Corpus entries are dicts: join title and body when a title exists.
        if isinstance(inputs[0], dict):
            input_texts = [(doc["title"] + self.sep + doc["text"]).strip() if "title" in doc else doc["text"].strip() for doc in inputs]
        else:
            input_texts = copy.copy(inputs)
        # Shard inputs across DDP ranks, but only when there is enough work.
        if torch.distributed.is_initialized() and len(input_texts) >= dist.get_world_size():
            idx = np.array_split(range(len(input_texts)), dist.get_world_size())[dist.get_rank()]
        else:
            # in case of non-DDP or not enough sentences, all devices are running the same job, but no gathering in the end
            idx = range(len(input_texts))
        device_sentences = [input_texts[i] for i in idx]
        # for tasks other than Retrieval, the caller may not pass a prompt explicitly
        if is_query and not prompt and self.query_prompt:
            prompt = self.query_prompt
        if prompt:
            device_sentences_with_prompt = [prompt + (s['text'] if isinstance(s, dict) else s) for s in device_sentences]
        else:
            device_sentences_with_prompt = device_sentences

        # Tokenize lazily (EOS always appended); the collator pads each batch.
        dataset: Dataset = Dataset.from_dict({'input_texts': device_sentences_with_prompt})
        dataset.set_transform(partial(input_transform_func, self.tokenizer, max_length=self.max_length, always_add_eos=True))
        data_collator = DataCollatorWithPadding(self.tokenizer, pad_to_multiple_of=1)
        data_loader = DataLoader(
            dataset,
            batch_size=self.batch_size_per_device if torch.distributed.is_initialized() else self.batch_size_per_device * self.gpu_count,
            shuffle=False,
            drop_last=False,
            num_workers=0,
            collate_fn=data_collator,
            pin_memory=True)

        encoded_embeds = []
        # for batch in data_loader:
        for batch in tqdm.tqdm(data_loader, desc="encoding", miniters=10, disable=not is_main()):
            # batch.data['is_causal'] = self.is_causal # only needed for Qwen
            # print(f"batch.data['is_causal']={batch.data['is_causal']}")
            # print(self.tokenizer.decode(batch['input_ids'][0]))
            # print(batch['input_ids'].numpy())
            # print(batch)
            batch = move_to_cuda(batch)
            with torch.cuda.amp.autocast():
                outputs = self.encoder.encode_input(batch)
            encoded_embeds.append(outputs)
        encoded_embeds = torch.cat(encoded_embeds, dim=0)
        # Re-assemble the full embedding matrix from all ranks (rank order
        # matches the np.array_split sharding above, preserving input order).
        if torch.distributed.is_initialized() and len(inputs) >= dist.get_world_size():
            encoded_embeds = varsize_gather_nograd(encoded_embeds)
        encoded_embeds = encoded_embeds.cpu().numpy()

        return encoded_embeds

    def set_prompt(self, query_prompt: str, doc_prompt: str):
        """Set the per-task instruction prefixes consumed by encode()."""
        self.query_prompt = query_prompt
        self.doc_prompt = doc_prompt
136
+
137
+
138
def main():
    """Evaluate an MMEB encoder on a fixed set of English MTEB retrieval tasks.

    Parses HF-style dataclass arguments, builds a DenseEncoder, then runs each
    selected task one at a time so per-task prompts / normalization can be set.
    """
    parser = HfArgumentParser((ModelArguments, DataArguments, MTEBArguments, TrainingArguments))
    model_args, data_args, mteb_args, training_args, remaining_args = parser.parse_args_into_dataclasses(return_remaining_strings=True)
    model_args: ModelArguments
    data_args: DataArguments
    mteb_args: MTEBArguments

    assert mteb_args.eval_output_dir, 'eval_output_dir should be specified'
    os.makedirs(mteb_args.eval_output_dir, exist_ok=True)

    task_types = None
    # Retrieval subset evaluated by default; swap in the commented list for clustering runs.
    tasks = ['NFCorpus', 'FiQA2018', 'ArguAna', 'SciFact', 'SCIDOCS', 'Touche2020', 'TRECCOVID']
    # tasks = ["BiorxivClusteringS2S", "MedrxivClusteringS2S", "RedditClustering", "StackExchangeClustering", "StackExchangeClusteringP2P", "TwentyNewsgroupsClustering"]
    evaluation = MTEB(task_types=task_types, tasks=tasks, task_langs=["eng-Latn", "en"])
    model = DenseEncoder(model_args, mteb_args, max_length=mteb_args.max_length)

    for task_cls in evaluation.tasks:
        task_name: str = task_cls.metadata.name
        task_type: str = task_cls.metadata.type
        print(f"Evaluating MTEB: {task_type} - {task_name}")
        # filter out not supported datasets
        if task_name not in MTEB_TASKS_EN:
            continue

        # Prefer the 'test' split; otherwise fall back to the first available one.
        eval_splits = task_cls.metadata.eval_splits
        if "test" not in eval_splits:
            logger.warning("Test split not found for task: {}, type: {}, eval_splits: {}".format(task_name, task_type, eval_splits))
        eval_splits = ["test" if "test" in eval_splits else eval_splits[0]]

        if mteb_args.prompt_family:
            prompt_data = load_e5mistral_prompt(prompt_family=mteb_args.prompt_family, task_name=task_name, task_type=task_type)
            query_prompt = prompt_data['q_prompt']
            doc_prompt = prompt_data['d_prompt']
            model.set_prompt(query_prompt=query_prompt, doc_prompt=doc_prompt)
            logger.info('Set prompt: query={}, doc={}'.format(query_prompt, doc_prompt))
        else:
            logger.info('No prompt is set')

        # disable l2 normalize for classification tasks, as it achieves slightly better results
        # NOTE(review): l2_normalize is set on the wrapper here; presumably the
        # underlying encoder reads it — confirm against MMEBModel.encode_input.
        if task_type == 'Classification':
            logger.info('Set l2_normalize to False for classification task')
            model.l2_normalize = False
        else:
            model.l2_normalize = True
        logger.info('Set l2_normalize to {}'.format(model.l2_normalize))

        sub_eval = MTEB(tasks=[task_name], task_langs=["eng-Latn", "en"], n_experiments=1)
        logger.info('Running evaluation for task: {}, type: {}'.format(task_name, task_type))
        # Only rank 0 writes result files; other ranks still encode but discard output.
        if (torch.distributed.is_initialized() and torch.distributed.get_rank() == 0) or not torch.distributed.is_initialized():
            mteb_result_folder = mteb_args.eval_output_dir
        else:
            mteb_result_folder = None
        sub_eval.run(
            model, eval_splits=eval_splits,
            output_folder=mteb_result_folder
        )


if __name__ == '__main__':
    main()
adhoc/gather_score_byckpt_aws.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import re
4
+
5
# MMEB evaluation datasets whose per-dataset score files are gathered below,
# grouped roughly as: classification / VQA / retrieval / grounding.
datasets = [
    "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R", "ObjectNet", "Country211",
    "OK-VQA", "A-OKVQA", "DocVQA", "InfographicsVQA", "ChartQA", "Visual7W", "ScienceQA", "VizWiz", "GQA", "TextVQA",
    "VisDial", "CIRR", "VisualNews_t2i", "VisualNews_i2t", "MSCOCO_t2i", "MSCOCO_i2t", "NIGHTS", "WebQA", "FashionIQ", "Wiki-SS-NQ", "OVEN", "EDIS",
    "MSCOCO", "RefCOCO", "RefCOCO-Matching", "Visual7W-Pointing"

]
13
+
14
+
15
# Checkpoint eval directories to gather scores from. Each entry must end in
# .../<experiment>/checkpoint-N/eval (the experiment name is taken from the
# third-from-last path component). Commented entries are earlier runs, kept
# for reference.
checkpoint_paths = [
    # v2 baselines
    # "/fsx/home/ruimeng/runs/v3vec-baseline/gme2b/mmeb/",
    # "/fsx/home/ruimeng/runs/v3vec-baseline/gme7b/mmeb/",
    # "/fsx/home/ruimeng/runs/v3vec-baseline/lamra/mmeb/",
    # "/fsx/home/ruimeng/runs/v3vec-baseline/colpali/mmeb/",

    # unified data, qwenresize
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.video.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+vidore+visrag.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora16.IB128.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora16.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.visdoc.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora16.noIB.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora16.IB32.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/",
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+video.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval/"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora16.IB32.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video_v2.qwenresize.lora16.noIB.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-4000/eval"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb+video_v2+split_visdoc.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100//checkpoint-1000/eval"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_202_2B.mmeb20+visdoc+video.qwenresize.lora32.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video.qwenresize.lora16.IB0.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-2000/eval"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb20+visdoc+video_v2.qwenresize.lora16.noIB.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-5000/eval"
    # "/fsx/home/yeliu/runs/mmeb/qwen2vl_2B.mmeb+video_v2+split_visdoc.qwenresize.lora8.bs1024pergpu128.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-5000/eval"
    # "/fsx/home/ruimeng/runs/mmeb/qwen2vl_2B-002-6.mmeb20_vidore1_videohound2_mteb15-v2-cap100k-rerun.qwenresize.lora16.bs1024pergpu128-ib64-droplast.GCq8p8.NormTemp002.lr5e5.step5kwarm200.maxlen2k.8H100/checkpoint-4000/eval"
    "/fsx/home/yeliu/runs/mmeb/qwen2vl_7B.mmeb+video_v2+split_visdoc.qwenresize.lora16.bs512pergpu64.GCq8p8.NormTemp002.lr2e5.step2kwarm100.8H100/checkpoint-3000/eval"

]
45
+
46
+
47
+ # Function to extract step number from checkpoint directory name
48
+ def extract_step(checkpoint_name):
49
+ match = re.search(r'checkpoint-(\d+)', checkpoint_name)
50
+ return int(match.group(1)) if match else float('inf')
51
+
52
+
53
+ # Dictionary to hold all gathered scores, organized by experiment
54
+ gathered_scores_by_exp = {}
55
+
56
+ # Loop through checkpoint directories
57
+ for checkpoint_path in checkpoint_paths:
58
+ print(checkpoint_path)
59
+ step = extract_step(checkpoint_path)
60
+ experiment_dir = checkpoint_path.split("/")[-3]
61
+
62
+ # Check if it is a checkpoint directory, and a valid checkpoint dir
63
+ if str.isdigit(str(step)):
64
+ # Initialize a dictionary to store scores for this checkpoint
65
+ checkpoint_scores = {"experiment": experiment_dir, "checkpoint": str(step)}
66
+ else:
67
+ checkpoint_scores = {"experiment": experiment_dir, "checkpoint": "default"}
68
+
69
+ # Go through each dataset and check if the corresponding score file exists
70
+ for dataset in datasets:
71
+ score_file = os.path.join(checkpoint_path, f"{dataset}_score.json") # Score file named like DatasetName_score.json
72
+
73
+ # Check if the score file exists
74
+ if os.path.isfile(score_file):
75
+ with open(score_file, "r") as f:
76
+ score_data = json.load(f) # Load the score JSON
77
+ checkpoint_scores[dataset] = score_data.get("acc", "N/A") # Assuming 'acc' is the key for accuracy
78
+ else:
79
+ checkpoint_scores[dataset] = "N/A" # If no score file, set to 'N/A'
80
+ print(checkpoint_scores)
81
+
82
+ # Append the scores for this checkpoint to the respective experiment group
83
+ gathered_scores_by_exp[experiment_dir] = checkpoint_scores
84
+
85
+
86
+
87
print('\n' * 5)
# Print gathered scores in a comma-separated format: one row per experiment/checkpoint
header = ["experiment", "checkpoint"] + datasets
print(",".join(header)) # Print header

for experiment, scores in gathered_scores_by_exp.items():
    row = [scores["experiment"], scores["checkpoint"]] + [str(scores[dataset]) for dataset in datasets]
    print(",".join(row)) # Print each row of scores



header = ["dataset"] + list(gathered_scores_by_exp.keys())
print(",".join(header)) # Print header
# Additional block: print results transposed (dataset per row, experiment per column)
# Dataset names go in the first column, one score column per experiment
for dataset in datasets:
    row = []
    for experiment, scores in gathered_scores_by_exp.items():
        row.append(str(scores[dataset]))
    print(",".join([dataset] + row)) # Print one transposed row
107
+
108
+
109
import pandas as pd

# Collect rows for the transposed (dataset x experiment) table
rows = []
for dataset in datasets:
    row = [dataset]
    for experiment in gathered_scores_by_exp.keys():
        row.append(gathered_scores_by_exp[experiment][dataset])
    rows.append(row)

# Create DataFrame (column names reuse the transposed `header` built above)
df = pd.DataFrame(rows, columns=header)

# Save to CSV
df.to_csv("output_scores.csv", index=False)
print("CSV saved to output_scores.csv")
125
+
126
+
127
+
128
+ # header = ["dataset"] + list(gathered_scores_by_exp.keys())
129
+ # print(",".join(header)) # Print header
130
+ # # Additional Block: Print results per experiment, transposed (dataset per row, step per column)
131
+ # # Print dataset names in the first column, and the scores for each checkpoint in subsequent columns
132
+ # for dataset in datasets:
133
+ # print(",".join([dataset, str(scores[dataset])]))
134
+ # for experiment, scores in gathered_scores_by_exp.items():
135
+ # print(f"\nResults for {experiment}:")
136
+ #
adhoc/hf_datasets.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+
3
# official example from https://huggingface.co/docs/datasets/en/stream
def official_example():
    """Sanity-check iterable-dataset shuffle + batched map on a public dataset.

    Downloads ethz/food101 (network access required) and iterates the mapped
    stream, printing each example.
    """
    dataset = load_dataset("ethz/food101", split="validation")
    dataset = dataset.to_iterable_dataset()
    dataset = dataset.shuffle(buffer_size=1024, seed=42)
    # dataset = dataset.map(add_prefix, remove_columns=["image", "label"]) # non-batched map: this works
    dataset = dataset.map(add_prefix, remove_columns=["image", "label"], drop_last_batch=True, batched=True, batch_size=1024) # batched map: this also works
    # dataset = load_dataset("ethz/food101", streaming=True)
    for batch in dataset:
        print(batch)
    pass
14
+
15
def add_prefix(example):
    """Batched map function: add a 'text' column ("label: <x>") built from 'label'."""
    labels = example['label']
    example['text'] = ['label: {}'.format(lab) for lab in labels]
    return example
18
+
19
def data_prepare(batch_dict, *args, **kwargs):
    """Identity map-function placeholder; extra positional/keyword args are ignored."""
    _ = (args, kwargs)  # accepted for map() compatibility, intentionally unused
    return batch_dict
21
+
22
def load_mmeb():
    """Probe iterable-dataset behavior on MMEB-train: select -> shuffle -> batched map.

    Downloads TIGER-Lab/MMEB-train (network access required) and iterates the
    stream, printing each example.
    """
    dataset = load_dataset("TIGER-Lab/MMEB-train", "OK-VQA", split="original")
    dataset = dataset.select(range(1000)) # step 1 select (works)
    dataset = dataset.to_iterable_dataset()
    dataset = dataset.shuffle(buffer_size=1024 * 16, seed=42) # step 2 shuffle (works)
    dataset = dataset.map(lambda x: data_prepare(x), batched=True, batch_size=1024 * 4) # cannot use drop_last_batch=True
    # dataset = dataset._resolve_features()
    for batch in dataset:
        print(batch)
    pass
32
+
33
+
34
+
35
if __name__ == "__main__":
    # Run the MMEB probe by default; uncomment to run the official streaming demo.
    # official_example()
    load_mmeb()
adhoc/merge_checkpoint.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.arguments import ModelArguments
2
+ from transformers import HfArgumentParser, AutoProcessor
3
+
4
+ from src.model.model_token_pooling import MMEBModel
5
+ from src.model.processor import get_backbone_name, load_processor
6
+
7
+
8
+ def main():
9
+ parser = HfArgumentParser(ModelArguments)
10
+ model_args, = parser.parse_args_into_dataclasses()
11
+ model_args: ModelArguments
12
+
13
+ model = MMEBModel.build(model_args)
14
+ model_backbone = get_backbone_name(hf_config=model.config)
15
+ setattr(model_args, "model_backbone", model_backbone)
16
+ # processor.tokenizer.padding_side = "right"
17
+ model = MMEBModel.load(model_args, is_trainable=False)
18
+ model.config.save_pretrained(f'{model_args.model_name}/full_model/', safe_serialization=False)
19
+ processor = load_processor(model_args)
20
+ processor.save_pretrained(f'{model_args.model_name}/full_model/', safe_serialization=False)
21
+ model.encoder._hf_peft_config_loaded = False
22
+ model.encoder.save_pretrained(f'{model_args.model_name}/full_model/', safe_serialization=False)
23
+
24
+
25
+ if __name__ == "__main__":
26
+ main()
adhoc/plot.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import matplotlib.pyplot as plt

# Ablation data: overall performance (%) vs. batch size, training steps, crop count.
batch_sizes = [128, 256, 512, 1024]
batch_perf = [49.5, 52.1, 54.3, 55.9]
step_sizes = [1000, 2000, 4000, 8000]
step_perf = [49.8, 52.0, 53.8, 55.3]
num_crops = [2, 4, 8, 16]
crop_perf = [47.1, 52.0, 54.2, 54.8]
# Plot: three side-by-side line charts, one per ablation axis
fig, axs = plt.subplots(1, 3, figsize=(10, 3))
# Batch size subplot
axs[0].plot(batch_sizes, batch_perf, marker='o', color='steelblue')
axs[0].set_title('Batch Size Influence on Performance', fontsize=9, fontweight='bold')
axs[0].set_xlabel('Batch Size')
axs[0].set_ylabel('Performance (%)')
# Step size subplot
axs[1].plot(step_sizes, step_perf, marker='s', linestyle='--', color='green')
axs[1].set_title('Step Size Influence on Performance', fontsize=9, fontweight='bold')
axs[1].set_xlabel('Step Size')
axs[1].set_ylabel('Performance (%)')
# Number of crops subplot
axs[2].plot(num_crops, crop_perf, marker='^', linestyle='-.', color='firebrick')
axs[2].set_title('Number of Crops Influence on Performance', fontsize=9, fontweight='bold')
axs[2].set_xlabel('Number of Crops')
axs[2].set_ylabel('Performance (%)')
# Tidy up
for ax in axs:
    ax.grid(True)
plt.tight_layout()
# BUGFIX: save BEFORE show(). plt.show() blocks, and once its window is closed
# the current figure is torn down, so calling savefig afterwards produced an
# empty PDF. (plot2.py already uses the correct save-then-show order.)
plt.savefig("performance_plots_high_res.pdf", format='pdf', dpi=300)
plt.show()
adhoc/plot2.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart: performance per modality at three LoRA ranks.
# Data
modalities = ["Image", "VisDoc", "Video"]
lora_8 = [62.7, 52.5, 32.4]
lora_16 = [63.2, 52.6, 33.5]
lora_32 = [60.0, 52.1, 32.7]

# Bar placement
x = np.array([0, 1, 2]) # modality positions
bar_width = 0.2
offset = 0.24 # control spacing between LoRA bars

# Font settings
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['font.size'] = 14

# Create plot: one bar per LoRA rank around each modality tick
plt.figure(figsize=(7, 6))
bars1 = plt.bar(x - offset, lora_8, bar_width, label='LoRA 8', color='#1f77b4')
bars2 = plt.bar(x, lora_16, bar_width, label='LoRA 16', color='#ff7f0e')
bars3 = plt.bar(x + offset, lora_32, bar_width, label='LoRA 32', color='#2ca02c')

# Axes and labels
plt.xticks(x, modalities, fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel("Modality", fontsize=18)
plt.ylabel("Performance", fontsize=18)
plt.title("Performance under Different LoRA Ranks", fontsize=18)
plt.ylim(30, 70)

# Annotate each bar with its value just above the top
for bars in [bars1, bars2, bars3]:
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2, height + 0.5,
                 f'{height:.1f}', ha='center', va='bottom', fontsize=14)

# Legend without frame
plt.legend(frameon=False, fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()

# Save as PDF (saved before show() so the figure is still alive)
plt.savefig("lora_rank_comparison_y30_wider.pdf", format='pdf', dpi=300)
plt.show()
adhoc/test_ddp.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import torch.distributed as dist
4
+ import socket
5
+
6
+
7
def main():
    """Minimal NCCL DDP smoke test: init the process group, print identity, tear down.

    Expects torchrun-style environment variables: RANK, LOCAL_RANK, WORLD_SIZE,
    MASTER_ADDR, MASTER_PORT.
    """
    print(f"[Rank {os.environ.get('RANK')}] Hostname: {socket.gethostname()} | Master: {os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}")
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    print(f"[rank {rank}] hostname: {os.uname().nodename}, MASTER_ADDR: {os.environ['MASTER_ADDR']}")
    print(f"Starting rank {rank}, local rank {local_rank}, world size {world_size}")

    # NCCL backend: bind this process to its local GPU
    dist.init_process_group("nccl")
    torch.cuda.set_device(local_rank)

    print(f"Hello from rank {rank} out of {world_size}")

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
adhoc/testset_stats.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sys
3
+
4
+ import numpy as np
5
+
6
+ from src.arguments import ModelArguments, DataArguments, TrainingArguments
7
+ from transformers import HfArgumentParser, AutoProcessor
8
+ from src.dataset import EvalDataset
9
+ import re
10
+
11
def main():
    """Print the average target-text token length for each selected eval dataset."""
    # torchrun passes --local-rank=N; rewrite it into the --local_rank form
    # that HfArgumentParser/TrainingArguments expects.
    for arg in sys.argv:
        if arg.startswith("--local-rank="):
            rank = arg.split("=")[1]
            sys.argv.remove(arg)
            sys.argv.append('--local_rank')
            sys.argv.append(rank)
    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    model_args: ModelArguments
    data_args: DataArguments
    training_args: TrainingArguments

    # Only GQA enabled by default; uncomment groups below for the full sweep.
    datasets = [
        "GQA",
        # "ImageNet-1K", "N24News", "HatefulMemes", "VOC2007", "SUN397", "Place365", "ImageNet-A", "ImageNet-R",
        # "ObjectNet", "Country211",
        # "OK-VQA", "A-OKVQA", "DocVQA", "InfographicsVQA", "ChartQA", "Visual7W", "ScienceQA", "VizWiz", "GQA",
        # "TextVQA",
        # "VisDial", "CIRR", "VisualNews_t2i", "VisualNews_i2t", "MSCOCO_t2i", "MSCOCO_i2t", "NIGHTS", "WebQA",
        # "FashionIQ", "Wiki-SS-NQ", "OVEN", "EDIS",
        # "MSCOCO", "RefCOCO", "RefCOCO-Matching", "Visual7W-Pointing"
    ]

    # ToDo: This part of code is a little bit hacky. Need to refactor later.
    for idx, subset in enumerate(datasets):
        eval_qry_dataset = EvalDataset(
            data_args=data_args,
            model_args=model_args,
            subset=subset,
            text_field="qry_text",
            img_path_field="qry_img_path",
        )
        eval_tgt_dataset = EvalDataset(
            data_args=data_args,
            model_args=model_args,
            subset=subset,
            text_field="tgt_text",
            img_path_field="tgt_img_path",
        )
        tgttokens = []
        tgtstr_lens = []
        for tgt in eval_tgt_dataset:
            # print(tgt)
            # NOTE(review): splitting on non-letters also yields empty strings,
            # which are counted here, so lengths overestimate true word counts.
            tokens = re.split('[^a-zA-Z]', tgt[0])
            tgttokens.append(tokens)
            tgtstr_lens.append(len(tokens))
            pass

        print(f'dataset: {subset}')
        print(f'tgt-avg-len: {np.mean(tgtstr_lens)}')
        pass


if __name__ == "__main__":
    main()
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Split the ColPali training set by its 'source' field and write each split
to a standalone Parquet file."""
from datasets import load_dataset, Dataset
from collections import defaultdict
import os
from tqdm import tqdm

# Destination for the per-source Parquet shards.
output_dir = "/fsx/sfr/data/MMEB/Visual_Doc/vidore"
os.makedirs(output_dir, exist_ok=True)

# Pull down the full training split.
dataset = load_dataset("vidore/colpali_train_set", split="train")

# Bucket every example under its originating source.
source_splits = defaultdict(list)
for example in tqdm(dataset):
    source_splits[example['source']].append(example)

# Materialize one Parquet file per source bucket.
for source, examples in source_splits.items():
    print(f"{source}: {len(examples)} examples")
    shard_path = os.path.join(output_dir, f"{source}.parquet")
    Dataset.from_list(examples).to_parquet(shard_path)

print(f"Saved {len(source_splits)} source-based splits as Parquet to {output_dir}/")
adhoc/visual_doc/category_visrag_training.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Split each configured VisRAG training dataset by its 'source' field and
write the splits as Parquet shards under a per-dataset subfolder."""
from datasets import load_dataset, Dataset
from collections import defaultdict
import os
from tqdm import tqdm

# Root directory that receives one subfolder per processed dataset.
base_output_dir = "/fsx/sfr/data/MMEB/Visual_Doc/visrag"

# HuggingFace dataset name -> output subfolder name.
datasets_to_process = {
    'openbmb/VisRAG-Ret-Train-In-domain-data': 'Train_in_domain_data',
}

for data_name, folder_name in datasets_to_process.items():
    print(f"\nProcessing: {data_name}")

    # Download the training split.
    train_split = load_dataset(data_name, split="train")

    # Bucket rows by their originating source.
    buckets = defaultdict(list)
    for row in tqdm(train_split):
        buckets[row['source']].append(row)

    # Ensure the per-dataset output folder exists.
    output_dir = os.path.join(base_output_dir, folder_name)
    os.makedirs(output_dir, exist_ok=True)

    # Write one Parquet shard per source bucket.
    for source, rows in buckets.items():
        print(f"{source}: {len(rows)} examples")
        shard_path = os.path.join(output_dir, f"{source}.parquet")
        Dataset.from_list(rows).to_parquet(shard_path)

    print(f"Saved {len(buckets)} source-based splits to: {output_dir}/")
adhoc/visual_doc/check_corpus.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
"""Spot-check one per-source Parquet shard by printing its first record."""
from datasets import load_dataset

# Path to the shard under inspection.
dataset_path = "/fsx/sfr/data/MMEB/Visual_Doc/vidore/Infographic-VQA.parquet"

shard = load_dataset("parquet", data_files={"train": dataset_path}, split="train")

print(shard[0])
adhoc/visual_doc/mmdoclong-doc.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build a document-level retrieval benchmark from MMLongBench-Doc.

Renders each referenced PDF to per-page images and writes BEIR-style
queries/corpus/qrels JSONL files. Unlike the sibling mmdoclong.py (which
labels only the annotated evidence pages), this variant marks EVERY page of
the query's source document as relevant (score 1).
"""
from datasets import load_dataset
import fitz  # PyMuPDF
from PIL import Image
import os
import json
import base64
from io import BytesIO
import ast

# Load dataset
dataset = load_dataset("yubo2333/MMLongBench-Doc")["train"]

# Directory containing PDFs
pdf_dir = "/fsx/sfr/data/MMEB/Visual_Doc/mmlongbench/documents"

def encode_image(image):
    # Serialize a PIL image to base64-encoded PNG bytes for JSONL storage.
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()

# Dictionary to store images
all_images = {}
processed_pdfs = {}  # pdf_file_name -> list of rendered per-page PIL images
pdf_corpus_mapping = {}  # Mapping from pdf_file_name to base corpus_id
existing_corpus_ids = set()  # Track already added corpus-ids

queries = []
corpus = []
qrels = []
corpus_counter = 0  # next free corpus-id; advanced by page count per new PDF

# Process each PDF
for qid, doc in enumerate(dataset):
    pdf_file_name = doc["doc_id"]
    pdf_path = os.path.join(pdf_dir, pdf_file_name)

    # NOTE(review): `evidence_pages` is parsed with ast.literal_eval below,
    # which implies it is a string; this `== []` comparison therefore likely
    # never matches -- confirm the column type.
    if doc['evidence_pages'] == []:
        continue

    # Ensure the file exists before processing
    if not os.path.exists(pdf_path):
        print(f"Warning: PDF file {pdf_file_name} not found. Skipping.")
        continue

    if pdf_file_name not in processed_pdfs:
        # Open the PDF
        pdf_document = fitz.open(pdf_path)
        images = []

        # Convert each page to an image
        for page_number in range(len(pdf_document)):
            page = pdf_document[page_number]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)

        processed_pdfs[pdf_file_name] = images
        pdf_corpus_mapping[pdf_file_name] = corpus_counter
        corpus_counter += len(images)  # Increment by number of images
    else:
        images = processed_pdfs[pdf_file_name]

    # Ensure pdf_file_name is in pdf_corpus_mapping before access
    if pdf_file_name not in pdf_corpus_mapping:
        print(f"Error: {pdf_file_name} not found in pdf_corpus_mapping. Skipping.")
        continue

    base_corpus_id = pdf_corpus_mapping[pdf_file_name]
    all_images[pdf_file_name] = images

    # Parse the evidence-page annotation; skip rows with malformed values.
    try:
        evidence_pages = ast.literal_eval(doc['evidence_pages'])
        if not isinstance(evidence_pages, list):
            raise ValueError("Invalid evidence pages format")
    except Exception as e:
        print(f"Error parsing evidence pages for {pdf_file_name}: {e}")
        continue

    if len(evidence_pages) == 0:
        continue
    queries.append({
        "query-id": qid,
        "query": doc["question"],
        "corpus_range": list(range(base_corpus_id, base_corpus_id + len(images)))
    })

    # Document-level relevance: every page of the source PDF scores 1
    # (the evidence_pages list only gates whether the query is kept).
    for img_id, _ in enumerate(images):
        qrels.append({
            'query-id': qid,
            'corpus-id': base_corpus_id + img_id,
            'score': 1
        })

    # Store encoded images in corpus if not already added
    for img_id, image in enumerate(images):
        corpus_id = base_corpus_id + img_id  # Fix corpus ID numbering
        if corpus_id not in existing_corpus_ids:
            corpus.append({
                "corpus-id": corpus_id,
                "image": encode_image(image)
            })
            existing_corpus_ids.add(corpus_id)

# Function to save data in JSONL format
def save_jsonl(filename, data):
    # Write one JSON object per line.
    with open(filename, "w", encoding="utf-8") as f:
        for entry in data:
            json.dump(entry, f)
            f.write("\n")

print('size of qrels:', len(qrels))
print('size of queries:', len(queries))
print('size of corpus:', len(corpus))

save_dir = "/fsx/sfr/data/MMEB/Visual_Doc/mmlongbench/test-doc/"
os.makedirs(save_dir, exist_ok=True)
queries_file = "queries.jsonl"
corpus_file = "corpus.jsonl"
qrels_file = "qrels.jsonl"

# Save to JSONL
save_jsonl(os.path.join(save_dir, queries_file), queries)
save_jsonl(os.path.join(save_dir, corpus_file), corpus)
save_jsonl(os.path.join(save_dir, qrels_file), qrels)
adhoc/visual_doc/mmdoclong.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build a page-level retrieval benchmark from MMLongBench-Doc.

Renders each referenced PDF to per-page images and writes BEIR-style
queries/corpus/qrels JSONL files. Only the annotated evidence pages of a
query's source document are marked relevant (score 1); compare with the
sibling mmdoclong-doc.py, which marks every page of the document.
"""
from datasets import load_dataset
import fitz  # PyMuPDF
from PIL import Image
import os
import json
import base64
from io import BytesIO
import ast

# Load dataset
dataset = load_dataset("yubo2333/MMLongBench-Doc")["train"]

# Directory containing PDFs
pdf_dir = "/fsx/sfr/data/MMEB/Visual_Doc/mmlongbench/documents"

def encode_image(image):
    # Serialize a PIL image to base64-encoded PNG bytes for JSONL storage.
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()

# Dictionary to store images
all_images = {}
processed_pdfs = {}  # pdf_file_name -> list of rendered per-page PIL images
pdf_corpus_mapping = {}  # Mapping from pdf_file_name to base corpus_id
existing_corpus_ids = set()  # Track already added corpus-ids

queries = []
corpus = []
qrels = []
corpus_counter = 0  # next free corpus-id; advanced by page count per new PDF

# Process each PDF
for qid, doc in enumerate(dataset):
    pdf_file_name = doc["doc_id"]
    pdf_path = os.path.join(pdf_dir, pdf_file_name)

    # NOTE(review): `evidence_pages` is parsed with ast.literal_eval below,
    # which implies it is a string; this `== []` comparison therefore likely
    # never matches -- confirm the column type.
    if doc['evidence_pages'] == []:
        continue

    # Ensure the file exists before processing
    if not os.path.exists(pdf_path):
        print(f"Warning: PDF file {pdf_file_name} not found. Skipping.")
        continue

    if pdf_file_name not in processed_pdfs:
        # Open the PDF
        pdf_document = fitz.open(pdf_path)
        images = []

        # Convert each page to an image
        for page_number in range(len(pdf_document)):
            page = pdf_document[page_number]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)

        processed_pdfs[pdf_file_name] = images
        pdf_corpus_mapping[pdf_file_name] = corpus_counter
        corpus_counter += len(images)  # Increment by number of images
    else:
        images = processed_pdfs[pdf_file_name]

    # Ensure pdf_file_name is in pdf_corpus_mapping before access
    if pdf_file_name not in pdf_corpus_mapping:
        print(f"Error: {pdf_file_name} not found in pdf_corpus_mapping. Skipping.")
        continue

    base_corpus_id = pdf_corpus_mapping[pdf_file_name]
    all_images[pdf_file_name] = images

    # Parse the evidence-page annotation; skip rows with malformed values.
    try:
        evidence_pages = ast.literal_eval(doc['evidence_pages'])
        if not isinstance(evidence_pages, list):
            raise ValueError("Invalid evidence pages format")
    except Exception as e:
        print(f"Error parsing evidence pages for {pdf_file_name}: {e}")
        continue

    if len(evidence_pages) == 0:
        continue
    queries.append({
        "query-id": qid,
        "query": doc["question"],
        "corpus_range": list(range(base_corpus_id, base_corpus_id + len(images)))
    })

    # Page-level relevance: only annotated evidence pages score 1.
    # NOTE(review): assumes the page numbers are 0-based offsets into the
    # rendered pages; if the dataset is 1-indexed this is off by one -- confirm.
    for page_number in evidence_pages:
        qrels.append({
            'query-id': qid,
            'corpus-id': base_corpus_id + int(page_number),
            'score': 1
        })

    # Store encoded images in corpus if not already added
    for img_id, image in enumerate(images):
        corpus_id = base_corpus_id + img_id  # Fix corpus ID numbering
        if corpus_id not in existing_corpus_ids:
            corpus.append({
                "corpus-id": corpus_id,
                "image": encode_image(image)
            })
            existing_corpus_ids.add(corpus_id)

# Function to save data in JSONL format
def save_jsonl(filename, data):
    # Write one JSON object per line.
    with open(filename, "w", encoding="utf-8") as f:
        for entry in data:
            json.dump(entry, f)
            f.write("\n")

print('size of qrels:', len(qrels))
print('size of queries:', len(queries))
print('size of corpus:', len(corpus))

save_dir = "/fsx/sfr/data/MMEB/Visual_Doc/mmlongbench/test/"
os.makedirs(save_dir, exist_ok=True)
queries_file = "queries.jsonl"
corpus_file = "corpus.jsonl"
qrels_file = "qrels.jsonl"

# Save to JSONL
save_jsonl(os.path.join(save_dir, queries_file), queries)
save_jsonl(os.path.join(save_dir, corpus_file), corpus)
save_jsonl(os.path.join(save_dir, qrels_file), qrels)
adhoc/visual_doc/vidoseek.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build a page-level retrieval benchmark from the ViDoSeek annotations.

Renders each referenced PDF to per-page images and writes BEIR-style
queries/corpus/qrels JSONL files. Scoring convention: every page of the
query's source PDF gets score 2, and the annotated reference pages
additionally get score 3.
"""
from datasets import load_dataset  # NOTE: unused here (data comes from JSON); kept for parity with sibling scripts
import fitz  # PyMuPDF
from PIL import Image
import os
import json
import base64
from io import BytesIO

# Load dataset
file_path = "/fsx/sfr/data/MMEB/Visual_Doc/ViDoSeek/vidoseek.json"
with open(file_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)


def encode_image(image):
    # Serialize a PIL image to base64-encoded PNG bytes for JSONL storage.
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode()


pdf_dir = "/fsx/sfr/data/MMEB/Visual_Doc/ViDoSeek/vidoseek_pdf_document"

all_images = {}
processed_pdfs = {}  # pdf_file_name -> list of rendered per-page PIL images
pdf_corpus_mapping = {}  # Mapping from pdf_file_name to base corpus_id
existing_corpus_ids = set()  # Track already added corpus-ids

queries = []
corpus = []
qrels = []
corpus_counter = 0  # next free corpus-id; advanced by page count per new PDF

# Process each PDF
for qid, doc in enumerate(dataset['examples']):
    pdf_file_name = doc["meta_info"]['file_name']
    pdf_path = os.path.join(pdf_dir, pdf_file_name)

    if doc['meta_info']['reference_page'] == []:
        continue

    if pdf_file_name not in processed_pdfs:
        # BUGFIX: previously a missing PDF fell through with `images`
        # undefined (NameError on the first miss, stale pages afterwards)
        # and no pdf_corpus_mapping entry (KeyError below). Skip missing
        # files up front, matching the mmlongbench scripts.
        if not os.path.exists(pdf_path):
            print(f"Warning: PDF file {pdf_file_name} not found. Skipping.")
            continue

        # Open the PDF and rasterize every page.
        pdf_document = fitz.open(pdf_path)
        images = []
        for page_number in range(len(pdf_document)):
            page = pdf_document[page_number]
            pix = page.get_pixmap()
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            images.append(img)

        processed_pdfs[pdf_file_name] = images
        pdf_corpus_mapping[pdf_file_name] = corpus_counter
        corpus_counter += len(images)
    else:
        images = processed_pdfs[pdf_file_name]

    base_corpus_id = pdf_corpus_mapping[pdf_file_name]
    all_images[pdf_file_name] = images

    queries.append({
        "query-id": qid,
        "query": doc["query"],
        "corpus_range": list(range(base_corpus_id, base_corpus_id + len(images)))
    })

    # Assign qrels for pages in the same PDF (score = 2)
    for img_id, _ in enumerate(images):
        qrels.append({
            'query-id': qid,
            'corpus-id': base_corpus_id + img_id,
            'score': 2
        })

    # Assign qrels for reference pages (score = 3)
    # NOTE(review): assumes reference_page values are 0-based page offsets --
    # confirm against the ViDoSeek annotation format.
    for page_number in doc['meta_info']['reference_page']:
        qrels.append({
            'query-id': qid,
            'corpus-id': base_corpus_id + int(page_number),
            'score': 3
        })

    # Store encoded images in corpus if not already added
    for img_id, image in enumerate(images):
        corpus_id = base_corpus_id + img_id
        if corpus_id not in existing_corpus_ids:
            corpus.append({
                "corpus-id": corpus_id,
                "image": encode_image(image)
            })
            existing_corpus_ids.add(corpus_id)


# Function to save data in JSONL format
def save_jsonl(filename, data):
    # Write one JSON object per line.
    with open(filename, "w", encoding="utf-8") as f:
        for entry in data:
            json.dump(entry, f)
            f.write("\n")


print('size of qrels', len(qrels))
print('size of queries', len(queries))
print('size of corpus', len(corpus))

save_dir = "/fsx/sfr/data/MMEB/Visual_Doc/ViDoSeek/test/"
os.makedirs(save_dir, exist_ok=True)
queries_file = "queries.jsonl"
corpus_file = "corpus.jsonl"
qrels_file = "qrels.jsonl"

# Save to JSONL (os.path.join is robust regardless of the trailing slash)
save_jsonl(os.path.join(save_dir, queries_file), queries)
save_jsonl(os.path.join(save_dir, corpus_file), corpus)
save_jsonl(os.path.join(save_dir, qrels_file), qrels)
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2_5_VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 1003520,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2_5_VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "max_pixels": 1003520,
26
+ "min_pixels": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "processor_class": "Qwen2_5_VLProcessor",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,734 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.05202913631633715,
6
+ "eval_steps": 500,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0005202913631633715,
14
+ "grad_norm": 7.347542762756348,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.428,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.001040582726326743,
21
+ "grad_norm": 8.964370727539062,
22
+ "learning_rate": 5e-06,
23
+ "loss": 1.3459,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.0015608740894901144,
28
+ "grad_norm": 10.382317543029785,
29
+ "learning_rate": 1e-05,
30
+ "loss": 1.54,
31
+ "step": 3
32
+ },
33
+ {
34
+ "epoch": 0.002081165452653486,
35
+ "grad_norm": 9.52104663848877,
36
+ "learning_rate": 1.5e-05,
37
+ "loss": 1.5728,
38
+ "step": 4
39
+ },
40
+ {
41
+ "epoch": 0.0026014568158168575,
42
+ "grad_norm": 8.74624252319336,
43
+ "learning_rate": 2e-05,
44
+ "loss": 1.5368,
45
+ "step": 5
46
+ },
47
+ {
48
+ "epoch": 0.003121748178980229,
49
+ "grad_norm": 7.444849491119385,
50
+ "learning_rate": 2.5e-05,
51
+ "loss": 1.2919,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.0036420395421436005,
56
+ "grad_norm": 8.439070701599121,
57
+ "learning_rate": 3e-05,
58
+ "loss": 1.1753,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.004162330905306972,
63
+ "grad_norm": 8.195757865905762,
64
+ "learning_rate": 3.5000000000000004e-05,
65
+ "loss": 1.2146,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 0.004682622268470343,
70
+ "grad_norm": 9.419265747070312,
71
+ "learning_rate": 4e-05,
72
+ "loss": 1.4365,
73
+ "step": 9
74
+ },
75
+ {
76
+ "epoch": 0.005202913631633715,
77
+ "grad_norm": 9.609909057617188,
78
+ "learning_rate": 4.4999999999999996e-05,
79
+ "loss": 1.3843,
80
+ "step": 10
81
+ },
82
+ {
83
+ "epoch": 0.005723204994797087,
84
+ "grad_norm": 9.44714069366455,
85
+ "learning_rate": 5e-05,
86
+ "loss": 1.2305,
87
+ "step": 11
88
+ },
89
+ {
90
+ "epoch": 0.006243496357960458,
91
+ "grad_norm": 7.349897384643555,
92
+ "learning_rate": 5.5e-05,
93
+ "loss": 1.0253,
94
+ "step": 12
95
+ },
96
+ {
97
+ "epoch": 0.006763787721123829,
98
+ "grad_norm": 8.391256332397461,
99
+ "learning_rate": 6e-05,
100
+ "loss": 1.2242,
101
+ "step": 13
102
+ },
103
+ {
104
+ "epoch": 0.007284079084287201,
105
+ "grad_norm": 8.2301025390625,
106
+ "learning_rate": 6.500000000000001e-05,
107
+ "loss": 1.3285,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.007804370447450572,
112
+ "grad_norm": 7.3472981452941895,
113
+ "learning_rate": 7.000000000000001e-05,
114
+ "loss": 1.2109,
115
+ "step": 15
116
+ },
117
+ {
118
+ "epoch": 0.008324661810613945,
119
+ "grad_norm": 6.808696746826172,
120
+ "learning_rate": 7.5e-05,
121
+ "loss": 0.8487,
122
+ "step": 16
123
+ },
124
+ {
125
+ "epoch": 0.008844953173777315,
126
+ "grad_norm": 7.667227268218994,
127
+ "learning_rate": 8e-05,
128
+ "loss": 1.1392,
129
+ "step": 17
130
+ },
131
+ {
132
+ "epoch": 0.009365244536940686,
133
+ "grad_norm": 7.13895845413208,
134
+ "learning_rate": 8.5e-05,
135
+ "loss": 1.0382,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 0.009885535900104058,
140
+ "grad_norm": 8.155549049377441,
141
+ "learning_rate": 8.999999999999999e-05,
142
+ "loss": 1.0287,
143
+ "step": 19
144
+ },
145
+ {
146
+ "epoch": 0.01040582726326743,
147
+ "grad_norm": 6.322030544281006,
148
+ "learning_rate": 9.5e-05,
149
+ "loss": 0.8726,
150
+ "step": 20
151
+ },
152
+ {
153
+ "epoch": 0.010926118626430802,
154
+ "grad_norm": 6.219326019287109,
155
+ "learning_rate": 0.0001,
156
+ "loss": 0.8133,
157
+ "step": 21
158
+ },
159
+ {
160
+ "epoch": 0.011446409989594173,
161
+ "grad_norm": 3.4698593616485596,
162
+ "learning_rate": 0.000105,
163
+ "loss": 0.7479,
164
+ "step": 22
165
+ },
166
+ {
167
+ "epoch": 0.011966701352757543,
168
+ "grad_norm": 3.6907284259796143,
169
+ "learning_rate": 0.00011,
170
+ "loss": 0.8183,
171
+ "step": 23
172
+ },
173
+ {
174
+ "epoch": 0.012486992715920915,
175
+ "grad_norm": 5.981033802032471,
176
+ "learning_rate": 0.000115,
177
+ "loss": 0.587,
178
+ "step": 24
179
+ },
180
+ {
181
+ "epoch": 0.013007284079084287,
182
+ "grad_norm": 4.62821626663208,
183
+ "learning_rate": 0.00012,
184
+ "loss": 0.6687,
185
+ "step": 25
186
+ },
187
+ {
188
+ "epoch": 0.013527575442247659,
189
+ "grad_norm": 4.285324573516846,
190
+ "learning_rate": 0.000125,
191
+ "loss": 0.6252,
192
+ "step": 26
193
+ },
194
+ {
195
+ "epoch": 0.01404786680541103,
196
+ "grad_norm": 4.518625736236572,
197
+ "learning_rate": 0.00013000000000000002,
198
+ "loss": 0.5654,
199
+ "step": 27
200
+ },
201
+ {
202
+ "epoch": 0.014568158168574402,
203
+ "grad_norm": 3.4108848571777344,
204
+ "learning_rate": 0.000135,
205
+ "loss": 0.6086,
206
+ "step": 28
207
+ },
208
+ {
209
+ "epoch": 0.015088449531737774,
210
+ "grad_norm": 2.748203754425049,
211
+ "learning_rate": 0.00014000000000000001,
212
+ "loss": 0.552,
213
+ "step": 29
214
+ },
215
+ {
216
+ "epoch": 0.015608740894901144,
217
+ "grad_norm": 2.817368507385254,
218
+ "learning_rate": 0.000145,
219
+ "loss": 0.6438,
220
+ "step": 30
221
+ },
222
+ {
223
+ "epoch": 0.016129032258064516,
224
+ "grad_norm": 2.5259974002838135,
225
+ "learning_rate": 0.00015,
226
+ "loss": 0.7379,
227
+ "step": 31
228
+ },
229
+ {
230
+ "epoch": 0.01664932362122789,
231
+ "grad_norm": 2.2101669311523438,
232
+ "learning_rate": 0.000155,
233
+ "loss": 0.4164,
234
+ "step": 32
235
+ },
236
+ {
237
+ "epoch": 0.01716961498439126,
238
+ "grad_norm": 1.9261822700500488,
239
+ "learning_rate": 0.00016,
240
+ "loss": 0.2381,
241
+ "step": 33
242
+ },
243
+ {
244
+ "epoch": 0.01768990634755463,
245
+ "grad_norm": 3.6622889041900635,
246
+ "learning_rate": 0.000165,
247
+ "loss": 0.868,
248
+ "step": 34
249
+ },
250
+ {
251
+ "epoch": 0.018210197710718003,
252
+ "grad_norm": 3.7180657386779785,
253
+ "learning_rate": 0.00017,
254
+ "loss": 0.6459,
255
+ "step": 35
256
+ },
257
+ {
258
+ "epoch": 0.018730489073881373,
259
+ "grad_norm": 1.89342200756073,
260
+ "learning_rate": 0.000175,
261
+ "loss": 0.3684,
262
+ "step": 36
263
+ },
264
+ {
265
+ "epoch": 0.019250780437044746,
266
+ "grad_norm": 2.9859375953674316,
267
+ "learning_rate": 0.00017999999999999998,
268
+ "loss": 0.6406,
269
+ "step": 37
270
+ },
271
+ {
272
+ "epoch": 0.019771071800208116,
273
+ "grad_norm": 2.1704893112182617,
274
+ "learning_rate": 0.000185,
275
+ "loss": 0.399,
276
+ "step": 38
277
+ },
278
+ {
279
+ "epoch": 0.02029136316337149,
280
+ "grad_norm": 1.5741156339645386,
281
+ "learning_rate": 0.00019,
282
+ "loss": 0.2802,
283
+ "step": 39
284
+ },
285
+ {
286
+ "epoch": 0.02081165452653486,
287
+ "grad_norm": 1.5053398609161377,
288
+ "learning_rate": 0.00019500000000000002,
289
+ "loss": 0.2899,
290
+ "step": 40
291
+ },
292
+ {
293
+ "epoch": 0.02133194588969823,
294
+ "grad_norm": 2.4964590072631836,
295
+ "learning_rate": 0.0002,
296
+ "loss": 0.4765,
297
+ "step": 41
298
+ },
299
+ {
300
+ "epoch": 0.021852237252861603,
301
+ "grad_norm": 1.7406848669052124,
302
+ "learning_rate": 0.000205,
303
+ "loss": 0.3226,
304
+ "step": 42
305
+ },
306
+ {
307
+ "epoch": 0.022372528616024973,
308
+ "grad_norm": 4.920353412628174,
309
+ "learning_rate": 0.00021,
310
+ "loss": 0.8643,
311
+ "step": 43
312
+ },
313
+ {
314
+ "epoch": 0.022892819979188347,
315
+ "grad_norm": 5.375717639923096,
316
+ "learning_rate": 0.000215,
317
+ "loss": 0.654,
318
+ "step": 44
319
+ },
320
+ {
321
+ "epoch": 0.023413111342351717,
322
+ "grad_norm": 4.912171840667725,
323
+ "learning_rate": 0.00022,
324
+ "loss": 0.5138,
325
+ "step": 45
326
+ },
327
+ {
328
+ "epoch": 0.023933402705515087,
329
+ "grad_norm": 1.8745571374893188,
330
+ "learning_rate": 0.00022500000000000002,
331
+ "loss": 0.194,
332
+ "step": 46
333
+ },
334
+ {
335
+ "epoch": 0.02445369406867846,
336
+ "grad_norm": 3.949474811553955,
337
+ "learning_rate": 0.00023,
338
+ "loss": 0.642,
339
+ "step": 47
340
+ },
341
+ {
342
+ "epoch": 0.02497398543184183,
343
+ "grad_norm": 3.1853504180908203,
344
+ "learning_rate": 0.000235,
345
+ "loss": 0.5319,
346
+ "step": 48
347
+ },
348
+ {
349
+ "epoch": 0.025494276795005204,
350
+ "grad_norm": 1.6487188339233398,
351
+ "learning_rate": 0.00024,
352
+ "loss": 0.2386,
353
+ "step": 49
354
+ },
355
+ {
356
+ "epoch": 0.026014568158168574,
357
+ "grad_norm": 2.2893128395080566,
358
+ "learning_rate": 0.000245,
359
+ "loss": 0.3759,
360
+ "step": 50
361
+ },
362
+ {
363
+ "epoch": 0.026534859521331947,
364
+ "grad_norm": 1.7786861658096313,
365
+ "learning_rate": 0.00025,
366
+ "loss": 0.4172,
367
+ "step": 51
368
+ },
369
+ {
370
+ "epoch": 0.027055150884495317,
371
+ "grad_norm": 2.229330062866211,
372
+ "learning_rate": 0.000255,
373
+ "loss": 0.48,
374
+ "step": 52
375
+ },
376
+ {
377
+ "epoch": 0.027575442247658687,
378
+ "grad_norm": 3.2765936851501465,
379
+ "learning_rate": 0.00026000000000000003,
380
+ "loss": 0.5127,
381
+ "step": 53
382
+ },
383
+ {
384
+ "epoch": 0.02809573361082206,
385
+ "grad_norm": 2.407878875732422,
386
+ "learning_rate": 0.00026500000000000004,
387
+ "loss": 0.4979,
388
+ "step": 54
389
+ },
390
+ {
391
+ "epoch": 0.02861602497398543,
392
+ "grad_norm": 2.218383312225342,
393
+ "learning_rate": 0.00027,
394
+ "loss": 0.4228,
395
+ "step": 55
396
+ },
397
+ {
398
+ "epoch": 0.029136316337148804,
399
+ "grad_norm": 1.7399003505706787,
400
+ "learning_rate": 0.000275,
401
+ "loss": 0.3607,
402
+ "step": 56
403
+ },
404
+ {
405
+ "epoch": 0.029656607700312174,
406
+ "grad_norm": 1.4118911027908325,
407
+ "learning_rate": 0.00028000000000000003,
408
+ "loss": 0.2743,
409
+ "step": 57
410
+ },
411
+ {
412
+ "epoch": 0.030176899063475548,
413
+ "grad_norm": 2.2282633781433105,
414
+ "learning_rate": 0.000285,
415
+ "loss": 0.3152,
416
+ "step": 58
417
+ },
418
+ {
419
+ "epoch": 0.030697190426638918,
420
+ "grad_norm": 1.9690927267074585,
421
+ "learning_rate": 0.00029,
422
+ "loss": 0.236,
423
+ "step": 59
424
+ },
425
+ {
426
+ "epoch": 0.031217481789802288,
427
+ "grad_norm": 1.8251880407333374,
428
+ "learning_rate": 0.000295,
429
+ "loss": 0.2945,
430
+ "step": 60
431
+ },
432
+ {
433
+ "epoch": 0.03173777315296566,
434
+ "grad_norm": 2.371242046356201,
435
+ "learning_rate": 0.0003,
436
+ "loss": 0.3196,
437
+ "step": 61
438
+ },
439
+ {
440
+ "epoch": 0.03225806451612903,
441
+ "grad_norm": 2.302980899810791,
442
+ "learning_rate": 0.000305,
443
+ "loss": 0.2548,
444
+ "step": 62
445
+ },
446
+ {
447
+ "epoch": 0.032778355879292405,
448
+ "grad_norm": 1.5861401557922363,
449
+ "learning_rate": 0.00031,
450
+ "loss": 0.3465,
451
+ "step": 63
452
+ },
453
+ {
454
+ "epoch": 0.03329864724245578,
455
+ "grad_norm": 2.5026137828826904,
456
+ "learning_rate": 0.000315,
457
+ "loss": 0.3962,
458
+ "step": 64
459
+ },
460
+ {
461
+ "epoch": 0.033818938605619145,
462
+ "grad_norm": 2.0949132442474365,
463
+ "learning_rate": 0.00032,
464
+ "loss": 0.3963,
465
+ "step": 65
466
+ },
467
+ {
468
+ "epoch": 0.03433922996878252,
469
+ "grad_norm": 1.5639926195144653,
470
+ "learning_rate": 0.00032500000000000004,
471
+ "loss": 0.1823,
472
+ "step": 66
473
+ },
474
+ {
475
+ "epoch": 0.03485952133194589,
476
+ "grad_norm": 2.0358474254608154,
477
+ "learning_rate": 0.00033,
478
+ "loss": 0.2778,
479
+ "step": 67
480
+ },
481
+ {
482
+ "epoch": 0.03537981269510926,
483
+ "grad_norm": 1.1801868677139282,
484
+ "learning_rate": 0.000335,
485
+ "loss": 0.197,
486
+ "step": 68
487
+ },
488
+ {
489
+ "epoch": 0.03590010405827263,
490
+ "grad_norm": 1.996211290359497,
491
+ "learning_rate": 0.00034,
492
+ "loss": 0.3872,
493
+ "step": 69
494
+ },
495
+ {
496
+ "epoch": 0.036420395421436005,
497
+ "grad_norm": 1.555777668952942,
498
+ "learning_rate": 0.000345,
499
+ "loss": 0.2224,
500
+ "step": 70
501
+ },
502
+ {
503
+ "epoch": 0.03694068678459938,
504
+ "grad_norm": 1.497721791267395,
505
+ "learning_rate": 0.00035,
506
+ "loss": 0.2542,
507
+ "step": 71
508
+ },
509
+ {
510
+ "epoch": 0.037460978147762745,
511
+ "grad_norm": 1.0776859521865845,
512
+ "learning_rate": 0.000355,
513
+ "loss": 0.1237,
514
+ "step": 72
515
+ },
516
+ {
517
+ "epoch": 0.03798126951092612,
518
+ "grad_norm": 1.9728138446807861,
519
+ "learning_rate": 0.00035999999999999997,
520
+ "loss": 0.2316,
521
+ "step": 73
522
+ },
523
+ {
524
+ "epoch": 0.03850156087408949,
525
+ "grad_norm": 1.0327483415603638,
526
+ "learning_rate": 0.000365,
527
+ "loss": 0.1536,
528
+ "step": 74
529
+ },
530
+ {
531
+ "epoch": 0.03902185223725286,
532
+ "grad_norm": 3.0641462802886963,
533
+ "learning_rate": 0.00037,
534
+ "loss": 0.3374,
535
+ "step": 75
536
+ },
537
+ {
538
+ "epoch": 0.03954214360041623,
539
+ "grad_norm": 1.374601125717163,
540
+ "learning_rate": 0.000375,
541
+ "loss": 0.1633,
542
+ "step": 76
543
+ },
544
+ {
545
+ "epoch": 0.040062434963579606,
546
+ "grad_norm": 3.1142971515655518,
547
+ "learning_rate": 0.00038,
548
+ "loss": 0.3083,
549
+ "step": 77
550
+ },
551
+ {
552
+ "epoch": 0.04058272632674298,
553
+ "grad_norm": 1.792457103729248,
554
+ "learning_rate": 0.00038500000000000003,
555
+ "loss": 0.2413,
556
+ "step": 78
557
+ },
558
+ {
559
+ "epoch": 0.041103017689906346,
560
+ "grad_norm": 1.4155240058898926,
561
+ "learning_rate": 0.00039000000000000005,
562
+ "loss": 0.334,
563
+ "step": 79
564
+ },
565
+ {
566
+ "epoch": 0.04162330905306972,
567
+ "grad_norm": 2.6872141361236572,
568
+ "learning_rate": 0.000395,
569
+ "loss": 0.328,
570
+ "step": 80
571
+ },
572
+ {
573
+ "epoch": 0.04214360041623309,
574
+ "grad_norm": 1.301841378211975,
575
+ "learning_rate": 0.0004,
576
+ "loss": 0.3391,
577
+ "step": 81
578
+ },
579
+ {
580
+ "epoch": 0.04266389177939646,
581
+ "grad_norm": 1.5664141178131104,
582
+ "learning_rate": 0.00040500000000000003,
583
+ "loss": 0.2262,
584
+ "step": 82
585
+ },
586
+ {
587
+ "epoch": 0.04318418314255983,
588
+ "grad_norm": 1.6563435792922974,
589
+ "learning_rate": 0.00041,
590
+ "loss": 0.3451,
591
+ "step": 83
592
+ },
593
+ {
594
+ "epoch": 0.043704474505723206,
595
+ "grad_norm": 1.2458600997924805,
596
+ "learning_rate": 0.000415,
597
+ "loss": 0.186,
598
+ "step": 84
599
+ },
600
+ {
601
+ "epoch": 0.04422476586888657,
602
+ "grad_norm": 1.3530123233795166,
603
+ "learning_rate": 0.00042,
604
+ "loss": 0.2447,
605
+ "step": 85
606
+ },
607
+ {
608
+ "epoch": 0.044745057232049947,
609
+ "grad_norm": 1.341471791267395,
610
+ "learning_rate": 0.000425,
611
+ "loss": 0.293,
612
+ "step": 86
613
+ },
614
+ {
615
+ "epoch": 0.04526534859521332,
616
+ "grad_norm": 1.2903335094451904,
617
+ "learning_rate": 0.00043,
618
+ "loss": 0.1058,
619
+ "step": 87
620
+ },
621
+ {
622
+ "epoch": 0.045785639958376693,
623
+ "grad_norm": 1.2263115644454956,
624
+ "learning_rate": 0.000435,
625
+ "loss": 0.1733,
626
+ "step": 88
627
+ },
628
+ {
629
+ "epoch": 0.04630593132154006,
630
+ "grad_norm": 2.077279806137085,
631
+ "learning_rate": 0.00044,
632
+ "loss": 0.1786,
633
+ "step": 89
634
+ },
635
+ {
636
+ "epoch": 0.046826222684703434,
637
+ "grad_norm": 1.2153059244155884,
638
+ "learning_rate": 0.00044500000000000003,
639
+ "loss": 0.202,
640
+ "step": 90
641
+ },
642
+ {
643
+ "epoch": 0.04734651404786681,
644
+ "grad_norm": 1.4943453073501587,
645
+ "learning_rate": 0.00045000000000000004,
646
+ "loss": 0.2266,
647
+ "step": 91
648
+ },
649
+ {
650
+ "epoch": 0.047866805411030174,
651
+ "grad_norm": 1.2306129932403564,
652
+ "learning_rate": 0.000455,
653
+ "loss": 0.1646,
654
+ "step": 92
655
+ },
656
+ {
657
+ "epoch": 0.04838709677419355,
658
+ "grad_norm": 0.9076014757156372,
659
+ "learning_rate": 0.00046,
660
+ "loss": 0.1725,
661
+ "step": 93
662
+ },
663
+ {
664
+ "epoch": 0.04890738813735692,
665
+ "grad_norm": 1.7097628116607666,
666
+ "learning_rate": 0.000465,
667
+ "loss": 0.2515,
668
+ "step": 94
669
+ },
670
+ {
671
+ "epoch": 0.049427679500520294,
672
+ "grad_norm": 1.1039310693740845,
673
+ "learning_rate": 0.00047,
674
+ "loss": 0.173,
675
+ "step": 95
676
+ },
677
+ {
678
+ "epoch": 0.04994797086368366,
679
+ "grad_norm": 1.1415642499923706,
680
+ "learning_rate": 0.000475,
681
+ "loss": 0.1644,
682
+ "step": 96
683
+ },
684
+ {
685
+ "epoch": 0.050468262226847034,
686
+ "grad_norm": 1.2579185962677002,
687
+ "learning_rate": 0.00048,
688
+ "loss": 0.1811,
689
+ "step": 97
690
+ },
691
+ {
692
+ "epoch": 0.05098855359001041,
693
+ "grad_norm": 1.0912175178527832,
694
+ "learning_rate": 0.00048499999999999997,
695
+ "loss": 0.1661,
696
+ "step": 98
697
+ },
698
+ {
699
+ "epoch": 0.051508844953173774,
700
+ "grad_norm": 1.124626874923706,
701
+ "learning_rate": 0.00049,
702
+ "loss": 0.204,
703
+ "step": 99
704
+ },
705
+ {
706
+ "epoch": 0.05202913631633715,
707
+ "grad_norm": 0.776817262172699,
708
+ "learning_rate": 0.000495,
709
+ "loss": 0.1299,
710
+ "step": 100
711
+ }
712
+ ],
713
+ "logging_steps": 1,
714
+ "max_steps": 1000,
715
+ "num_input_tokens_seen": 0,
716
+ "num_train_epochs": 1,
717
+ "save_steps": 100,
718
+ "stateful_callbacks": {
719
+ "TrainerControl": {
720
+ "args": {
721
+ "should_epoch_stop": false,
722
+ "should_evaluate": false,
723
+ "should_log": false,
724
+ "should_save": true,
725
+ "should_training_stop": false
726
+ },
727
+ "attributes": {}
728
+ }
729
+ },
730
+ "total_flos": 0.0,
731
+ "train_batch_size": 128,
732
+ "trial_name": null,
733
+ "trial_params": null
734
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-100/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-400/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2_5_VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 1003520,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2_5_VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "max_pixels": 1003520,
26
+ "min_pixels": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "processor_class": "Qwen2_5_VLProcessor",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/trainer_state.json ADDED
@@ -0,0 +1,3534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.2601456815816857,
6
+ "eval_steps": 500,
7
+ "global_step": 500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0005202913631633715,
14
+ "grad_norm": 7.347542762756348,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.428,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.001040582726326743,
21
+ "grad_norm": 8.964370727539062,
22
+ "learning_rate": 5e-06,
23
+ "loss": 1.3459,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.0015608740894901144,
28
+ "grad_norm": 10.382317543029785,
29
+ "learning_rate": 1e-05,
30
+ "loss": 1.54,
31
+ "step": 3
32
+ },
33
+ {
34
+ "epoch": 0.002081165452653486,
35
+ "grad_norm": 9.52104663848877,
36
+ "learning_rate": 1.5e-05,
37
+ "loss": 1.5728,
38
+ "step": 4
39
+ },
40
+ {
41
+ "epoch": 0.0026014568158168575,
42
+ "grad_norm": 8.74624252319336,
43
+ "learning_rate": 2e-05,
44
+ "loss": 1.5368,
45
+ "step": 5
46
+ },
47
+ {
48
+ "epoch": 0.003121748178980229,
49
+ "grad_norm": 7.444849491119385,
50
+ "learning_rate": 2.5e-05,
51
+ "loss": 1.2919,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.0036420395421436005,
56
+ "grad_norm": 8.439070701599121,
57
+ "learning_rate": 3e-05,
58
+ "loss": 1.1753,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.004162330905306972,
63
+ "grad_norm": 8.195757865905762,
64
+ "learning_rate": 3.5000000000000004e-05,
65
+ "loss": 1.2146,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 0.004682622268470343,
70
+ "grad_norm": 9.419265747070312,
71
+ "learning_rate": 4e-05,
72
+ "loss": 1.4365,
73
+ "step": 9
74
+ },
75
+ {
76
+ "epoch": 0.005202913631633715,
77
+ "grad_norm": 9.609909057617188,
78
+ "learning_rate": 4.4999999999999996e-05,
79
+ "loss": 1.3843,
80
+ "step": 10
81
+ },
82
+ {
83
+ "epoch": 0.005723204994797087,
84
+ "grad_norm": 9.44714069366455,
85
+ "learning_rate": 5e-05,
86
+ "loss": 1.2305,
87
+ "step": 11
88
+ },
89
+ {
90
+ "epoch": 0.006243496357960458,
91
+ "grad_norm": 7.349897384643555,
92
+ "learning_rate": 5.5e-05,
93
+ "loss": 1.0253,
94
+ "step": 12
95
+ },
96
+ {
97
+ "epoch": 0.006763787721123829,
98
+ "grad_norm": 8.391256332397461,
99
+ "learning_rate": 6e-05,
100
+ "loss": 1.2242,
101
+ "step": 13
102
+ },
103
+ {
104
+ "epoch": 0.007284079084287201,
105
+ "grad_norm": 8.2301025390625,
106
+ "learning_rate": 6.500000000000001e-05,
107
+ "loss": 1.3285,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.007804370447450572,
112
+ "grad_norm": 7.3472981452941895,
113
+ "learning_rate": 7.000000000000001e-05,
114
+ "loss": 1.2109,
115
+ "step": 15
116
+ },
117
+ {
118
+ "epoch": 0.008324661810613945,
119
+ "grad_norm": 6.808696746826172,
120
+ "learning_rate": 7.5e-05,
121
+ "loss": 0.8487,
122
+ "step": 16
123
+ },
124
+ {
125
+ "epoch": 0.008844953173777315,
126
+ "grad_norm": 7.667227268218994,
127
+ "learning_rate": 8e-05,
128
+ "loss": 1.1392,
129
+ "step": 17
130
+ },
131
+ {
132
+ "epoch": 0.009365244536940686,
133
+ "grad_norm": 7.13895845413208,
134
+ "learning_rate": 8.5e-05,
135
+ "loss": 1.0382,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 0.009885535900104058,
140
+ "grad_norm": 8.155549049377441,
141
+ "learning_rate": 8.999999999999999e-05,
142
+ "loss": 1.0287,
143
+ "step": 19
144
+ },
145
+ {
146
+ "epoch": 0.01040582726326743,
147
+ "grad_norm": 6.322030544281006,
148
+ "learning_rate": 9.5e-05,
149
+ "loss": 0.8726,
150
+ "step": 20
151
+ },
152
+ {
153
+ "epoch": 0.010926118626430802,
154
+ "grad_norm": 6.219326019287109,
155
+ "learning_rate": 0.0001,
156
+ "loss": 0.8133,
157
+ "step": 21
158
+ },
159
+ {
160
+ "epoch": 0.011446409989594173,
161
+ "grad_norm": 3.4698593616485596,
162
+ "learning_rate": 0.000105,
163
+ "loss": 0.7479,
164
+ "step": 22
165
+ },
166
+ {
167
+ "epoch": 0.011966701352757543,
168
+ "grad_norm": 3.6907284259796143,
169
+ "learning_rate": 0.00011,
170
+ "loss": 0.8183,
171
+ "step": 23
172
+ },
173
+ {
174
+ "epoch": 0.012486992715920915,
175
+ "grad_norm": 5.981033802032471,
176
+ "learning_rate": 0.000115,
177
+ "loss": 0.587,
178
+ "step": 24
179
+ },
180
+ {
181
+ "epoch": 0.013007284079084287,
182
+ "grad_norm": 4.62821626663208,
183
+ "learning_rate": 0.00012,
184
+ "loss": 0.6687,
185
+ "step": 25
186
+ },
187
+ {
188
+ "epoch": 0.013527575442247659,
189
+ "grad_norm": 4.285324573516846,
190
+ "learning_rate": 0.000125,
191
+ "loss": 0.6252,
192
+ "step": 26
193
+ },
194
+ {
195
+ "epoch": 0.01404786680541103,
196
+ "grad_norm": 4.518625736236572,
197
+ "learning_rate": 0.00013000000000000002,
198
+ "loss": 0.5654,
199
+ "step": 27
200
+ },
201
+ {
202
+ "epoch": 0.014568158168574402,
203
+ "grad_norm": 3.4108848571777344,
204
+ "learning_rate": 0.000135,
205
+ "loss": 0.6086,
206
+ "step": 28
207
+ },
208
+ {
209
+ "epoch": 0.015088449531737774,
210
+ "grad_norm": 2.748203754425049,
211
+ "learning_rate": 0.00014000000000000001,
212
+ "loss": 0.552,
213
+ "step": 29
214
+ },
215
+ {
216
+ "epoch": 0.015608740894901144,
217
+ "grad_norm": 2.817368507385254,
218
+ "learning_rate": 0.000145,
219
+ "loss": 0.6438,
220
+ "step": 30
221
+ },
222
+ {
223
+ "epoch": 0.016129032258064516,
224
+ "grad_norm": 2.5259974002838135,
225
+ "learning_rate": 0.00015,
226
+ "loss": 0.7379,
227
+ "step": 31
228
+ },
229
+ {
230
+ "epoch": 0.01664932362122789,
231
+ "grad_norm": 2.2101669311523438,
232
+ "learning_rate": 0.000155,
233
+ "loss": 0.4164,
234
+ "step": 32
235
+ },
236
+ {
237
+ "epoch": 0.01716961498439126,
238
+ "grad_norm": 1.9261822700500488,
239
+ "learning_rate": 0.00016,
240
+ "loss": 0.2381,
241
+ "step": 33
242
+ },
243
+ {
244
+ "epoch": 0.01768990634755463,
245
+ "grad_norm": 3.6622889041900635,
246
+ "learning_rate": 0.000165,
247
+ "loss": 0.868,
248
+ "step": 34
249
+ },
250
+ {
251
+ "epoch": 0.018210197710718003,
252
+ "grad_norm": 3.7180657386779785,
253
+ "learning_rate": 0.00017,
254
+ "loss": 0.6459,
255
+ "step": 35
256
+ },
257
+ {
258
+ "epoch": 0.018730489073881373,
259
+ "grad_norm": 1.89342200756073,
260
+ "learning_rate": 0.000175,
261
+ "loss": 0.3684,
262
+ "step": 36
263
+ },
264
+ {
265
+ "epoch": 0.019250780437044746,
266
+ "grad_norm": 2.9859375953674316,
267
+ "learning_rate": 0.00017999999999999998,
268
+ "loss": 0.6406,
269
+ "step": 37
270
+ },
271
+ {
272
+ "epoch": 0.019771071800208116,
273
+ "grad_norm": 2.1704893112182617,
274
+ "learning_rate": 0.000185,
275
+ "loss": 0.399,
276
+ "step": 38
277
+ },
278
+ {
279
+ "epoch": 0.02029136316337149,
280
+ "grad_norm": 1.5741156339645386,
281
+ "learning_rate": 0.00019,
282
+ "loss": 0.2802,
283
+ "step": 39
284
+ },
285
+ {
286
+ "epoch": 0.02081165452653486,
287
+ "grad_norm": 1.5053398609161377,
288
+ "learning_rate": 0.00019500000000000002,
289
+ "loss": 0.2899,
290
+ "step": 40
291
+ },
292
+ {
293
+ "epoch": 0.02133194588969823,
294
+ "grad_norm": 2.4964590072631836,
295
+ "learning_rate": 0.0002,
296
+ "loss": 0.4765,
297
+ "step": 41
298
+ },
299
+ {
300
+ "epoch": 0.021852237252861603,
301
+ "grad_norm": 1.7406848669052124,
302
+ "learning_rate": 0.000205,
303
+ "loss": 0.3226,
304
+ "step": 42
305
+ },
306
+ {
307
+ "epoch": 0.022372528616024973,
308
+ "grad_norm": 4.920353412628174,
309
+ "learning_rate": 0.00021,
310
+ "loss": 0.8643,
311
+ "step": 43
312
+ },
313
+ {
314
+ "epoch": 0.022892819979188347,
315
+ "grad_norm": 5.375717639923096,
316
+ "learning_rate": 0.000215,
317
+ "loss": 0.654,
318
+ "step": 44
319
+ },
320
+ {
321
+ "epoch": 0.023413111342351717,
322
+ "grad_norm": 4.912171840667725,
323
+ "learning_rate": 0.00022,
324
+ "loss": 0.5138,
325
+ "step": 45
326
+ },
327
+ {
328
+ "epoch": 0.023933402705515087,
329
+ "grad_norm": 1.8745571374893188,
330
+ "learning_rate": 0.00022500000000000002,
331
+ "loss": 0.194,
332
+ "step": 46
333
+ },
334
+ {
335
+ "epoch": 0.02445369406867846,
336
+ "grad_norm": 3.949474811553955,
337
+ "learning_rate": 0.00023,
338
+ "loss": 0.642,
339
+ "step": 47
340
+ },
341
+ {
342
+ "epoch": 0.02497398543184183,
343
+ "grad_norm": 3.1853504180908203,
344
+ "learning_rate": 0.000235,
345
+ "loss": 0.5319,
346
+ "step": 48
347
+ },
348
+ {
349
+ "epoch": 0.025494276795005204,
350
+ "grad_norm": 1.6487188339233398,
351
+ "learning_rate": 0.00024,
352
+ "loss": 0.2386,
353
+ "step": 49
354
+ },
355
+ {
356
+ "epoch": 0.026014568158168574,
357
+ "grad_norm": 2.2893128395080566,
358
+ "learning_rate": 0.000245,
359
+ "loss": 0.3759,
360
+ "step": 50
361
+ },
362
+ {
363
+ "epoch": 0.026534859521331947,
364
+ "grad_norm": 1.7786861658096313,
365
+ "learning_rate": 0.00025,
366
+ "loss": 0.4172,
367
+ "step": 51
368
+ },
369
+ {
370
+ "epoch": 0.027055150884495317,
371
+ "grad_norm": 2.229330062866211,
372
+ "learning_rate": 0.000255,
373
+ "loss": 0.48,
374
+ "step": 52
375
+ },
376
+ {
377
+ "epoch": 0.027575442247658687,
378
+ "grad_norm": 3.2765936851501465,
379
+ "learning_rate": 0.00026000000000000003,
380
+ "loss": 0.5127,
381
+ "step": 53
382
+ },
383
+ {
384
+ "epoch": 0.02809573361082206,
385
+ "grad_norm": 2.407878875732422,
386
+ "learning_rate": 0.00026500000000000004,
387
+ "loss": 0.4979,
388
+ "step": 54
389
+ },
390
+ {
391
+ "epoch": 0.02861602497398543,
392
+ "grad_norm": 2.218383312225342,
393
+ "learning_rate": 0.00027,
394
+ "loss": 0.4228,
395
+ "step": 55
396
+ },
397
+ {
398
+ "epoch": 0.029136316337148804,
399
+ "grad_norm": 1.7399003505706787,
400
+ "learning_rate": 0.000275,
401
+ "loss": 0.3607,
402
+ "step": 56
403
+ },
404
+ {
405
+ "epoch": 0.029656607700312174,
406
+ "grad_norm": 1.4118911027908325,
407
+ "learning_rate": 0.00028000000000000003,
408
+ "loss": 0.2743,
409
+ "step": 57
410
+ },
411
+ {
412
+ "epoch": 0.030176899063475548,
413
+ "grad_norm": 2.2282633781433105,
414
+ "learning_rate": 0.000285,
415
+ "loss": 0.3152,
416
+ "step": 58
417
+ },
418
+ {
419
+ "epoch": 0.030697190426638918,
420
+ "grad_norm": 1.9690927267074585,
421
+ "learning_rate": 0.00029,
422
+ "loss": 0.236,
423
+ "step": 59
424
+ },
425
+ {
426
+ "epoch": 0.031217481789802288,
427
+ "grad_norm": 1.8251880407333374,
428
+ "learning_rate": 0.000295,
429
+ "loss": 0.2945,
430
+ "step": 60
431
+ },
432
+ {
433
+ "epoch": 0.03173777315296566,
434
+ "grad_norm": 2.371242046356201,
435
+ "learning_rate": 0.0003,
436
+ "loss": 0.3196,
437
+ "step": 61
438
+ },
439
+ {
440
+ "epoch": 0.03225806451612903,
441
+ "grad_norm": 2.302980899810791,
442
+ "learning_rate": 0.000305,
443
+ "loss": 0.2548,
444
+ "step": 62
445
+ },
446
+ {
447
+ "epoch": 0.032778355879292405,
448
+ "grad_norm": 1.5861401557922363,
449
+ "learning_rate": 0.00031,
450
+ "loss": 0.3465,
451
+ "step": 63
452
+ },
453
+ {
454
+ "epoch": 0.03329864724245578,
455
+ "grad_norm": 2.5026137828826904,
456
+ "learning_rate": 0.000315,
457
+ "loss": 0.3962,
458
+ "step": 64
459
+ },
460
+ {
461
+ "epoch": 0.033818938605619145,
462
+ "grad_norm": 2.0949132442474365,
463
+ "learning_rate": 0.00032,
464
+ "loss": 0.3963,
465
+ "step": 65
466
+ },
467
+ {
468
+ "epoch": 0.03433922996878252,
469
+ "grad_norm": 1.5639926195144653,
470
+ "learning_rate": 0.00032500000000000004,
471
+ "loss": 0.1823,
472
+ "step": 66
473
+ },
474
+ {
475
+ "epoch": 0.03485952133194589,
476
+ "grad_norm": 2.0358474254608154,
477
+ "learning_rate": 0.00033,
478
+ "loss": 0.2778,
479
+ "step": 67
480
+ },
481
+ {
482
+ "epoch": 0.03537981269510926,
483
+ "grad_norm": 1.1801868677139282,
484
+ "learning_rate": 0.000335,
485
+ "loss": 0.197,
486
+ "step": 68
487
+ },
488
+ {
489
+ "epoch": 0.03590010405827263,
490
+ "grad_norm": 1.996211290359497,
491
+ "learning_rate": 0.00034,
492
+ "loss": 0.3872,
493
+ "step": 69
494
+ },
495
+ {
496
+ "epoch": 0.036420395421436005,
497
+ "grad_norm": 1.555777668952942,
498
+ "learning_rate": 0.000345,
499
+ "loss": 0.2224,
500
+ "step": 70
501
+ },
502
+ {
503
+ "epoch": 0.03694068678459938,
504
+ "grad_norm": 1.497721791267395,
505
+ "learning_rate": 0.00035,
506
+ "loss": 0.2542,
507
+ "step": 71
508
+ },
509
+ {
510
+ "epoch": 0.037460978147762745,
511
+ "grad_norm": 1.0776859521865845,
512
+ "learning_rate": 0.000355,
513
+ "loss": 0.1237,
514
+ "step": 72
515
+ },
516
+ {
517
+ "epoch": 0.03798126951092612,
518
+ "grad_norm": 1.9728138446807861,
519
+ "learning_rate": 0.00035999999999999997,
520
+ "loss": 0.2316,
521
+ "step": 73
522
+ },
523
+ {
524
+ "epoch": 0.03850156087408949,
525
+ "grad_norm": 1.0327483415603638,
526
+ "learning_rate": 0.000365,
527
+ "loss": 0.1536,
528
+ "step": 74
529
+ },
530
+ {
531
+ "epoch": 0.03902185223725286,
532
+ "grad_norm": 3.0641462802886963,
533
+ "learning_rate": 0.00037,
534
+ "loss": 0.3374,
535
+ "step": 75
536
+ },
537
+ {
538
+ "epoch": 0.03954214360041623,
539
+ "grad_norm": 1.374601125717163,
540
+ "learning_rate": 0.000375,
541
+ "loss": 0.1633,
542
+ "step": 76
543
+ },
544
+ {
545
+ "epoch": 0.040062434963579606,
546
+ "grad_norm": 3.1142971515655518,
547
+ "learning_rate": 0.00038,
548
+ "loss": 0.3083,
549
+ "step": 77
550
+ },
551
+ {
552
+ "epoch": 0.04058272632674298,
553
+ "grad_norm": 1.792457103729248,
554
+ "learning_rate": 0.00038500000000000003,
555
+ "loss": 0.2413,
556
+ "step": 78
557
+ },
558
+ {
559
+ "epoch": 0.041103017689906346,
560
+ "grad_norm": 1.4155240058898926,
561
+ "learning_rate": 0.00039000000000000005,
562
+ "loss": 0.334,
563
+ "step": 79
564
+ },
565
+ {
566
+ "epoch": 0.04162330905306972,
567
+ "grad_norm": 2.6872141361236572,
568
+ "learning_rate": 0.000395,
569
+ "loss": 0.328,
570
+ "step": 80
571
+ },
572
+ {
573
+ "epoch": 0.04214360041623309,
574
+ "grad_norm": 1.301841378211975,
575
+ "learning_rate": 0.0004,
576
+ "loss": 0.3391,
577
+ "step": 81
578
+ },
579
+ {
580
+ "epoch": 0.04266389177939646,
581
+ "grad_norm": 1.5664141178131104,
582
+ "learning_rate": 0.00040500000000000003,
583
+ "loss": 0.2262,
584
+ "step": 82
585
+ },
586
+ {
587
+ "epoch": 0.04318418314255983,
588
+ "grad_norm": 1.6563435792922974,
589
+ "learning_rate": 0.00041,
590
+ "loss": 0.3451,
591
+ "step": 83
592
+ },
593
+ {
594
+ "epoch": 0.043704474505723206,
595
+ "grad_norm": 1.2458600997924805,
596
+ "learning_rate": 0.000415,
597
+ "loss": 0.186,
598
+ "step": 84
599
+ },
600
+ {
601
+ "epoch": 0.04422476586888657,
602
+ "grad_norm": 1.3530123233795166,
603
+ "learning_rate": 0.00042,
604
+ "loss": 0.2447,
605
+ "step": 85
606
+ },
607
+ {
608
+ "epoch": 0.044745057232049947,
609
+ "grad_norm": 1.341471791267395,
610
+ "learning_rate": 0.000425,
611
+ "loss": 0.293,
612
+ "step": 86
613
+ },
614
+ {
615
+ "epoch": 0.04526534859521332,
616
+ "grad_norm": 1.2903335094451904,
617
+ "learning_rate": 0.00043,
618
+ "loss": 0.1058,
619
+ "step": 87
620
+ },
621
+ {
622
+ "epoch": 0.045785639958376693,
623
+ "grad_norm": 1.2263115644454956,
624
+ "learning_rate": 0.000435,
625
+ "loss": 0.1733,
626
+ "step": 88
627
+ },
628
+ {
629
+ "epoch": 0.04630593132154006,
630
+ "grad_norm": 2.077279806137085,
631
+ "learning_rate": 0.00044,
632
+ "loss": 0.1786,
633
+ "step": 89
634
+ },
635
+ {
636
+ "epoch": 0.046826222684703434,
637
+ "grad_norm": 1.2153059244155884,
638
+ "learning_rate": 0.00044500000000000003,
639
+ "loss": 0.202,
640
+ "step": 90
641
+ },
642
+ {
643
+ "epoch": 0.04734651404786681,
644
+ "grad_norm": 1.4943453073501587,
645
+ "learning_rate": 0.00045000000000000004,
646
+ "loss": 0.2266,
647
+ "step": 91
648
+ },
649
+ {
650
+ "epoch": 0.047866805411030174,
651
+ "grad_norm": 1.2306129932403564,
652
+ "learning_rate": 0.000455,
653
+ "loss": 0.1646,
654
+ "step": 92
655
+ },
656
+ {
657
+ "epoch": 0.04838709677419355,
658
+ "grad_norm": 0.9076014757156372,
659
+ "learning_rate": 0.00046,
660
+ "loss": 0.1725,
661
+ "step": 93
662
+ },
663
+ {
664
+ "epoch": 0.04890738813735692,
665
+ "grad_norm": 1.7097628116607666,
666
+ "learning_rate": 0.000465,
667
+ "loss": 0.2515,
668
+ "step": 94
669
+ },
670
+ {
671
+ "epoch": 0.049427679500520294,
672
+ "grad_norm": 1.1039310693740845,
673
+ "learning_rate": 0.00047,
674
+ "loss": 0.173,
675
+ "step": 95
676
+ },
677
+ {
678
+ "epoch": 0.04994797086368366,
679
+ "grad_norm": 1.1415642499923706,
680
+ "learning_rate": 0.000475,
681
+ "loss": 0.1644,
682
+ "step": 96
683
+ },
684
+ {
685
+ "epoch": 0.050468262226847034,
686
+ "grad_norm": 1.2579185962677002,
687
+ "learning_rate": 0.00048,
688
+ "loss": 0.1811,
689
+ "step": 97
690
+ },
691
+ {
692
+ "epoch": 0.05098855359001041,
693
+ "grad_norm": 1.0912175178527832,
694
+ "learning_rate": 0.00048499999999999997,
695
+ "loss": 0.1661,
696
+ "step": 98
697
+ },
698
+ {
699
+ "epoch": 0.051508844953173774,
700
+ "grad_norm": 1.124626874923706,
701
+ "learning_rate": 0.00049,
702
+ "loss": 0.204,
703
+ "step": 99
704
+ },
705
+ {
706
+ "epoch": 0.05202913631633715,
707
+ "grad_norm": 0.776817262172699,
708
+ "learning_rate": 0.000495,
709
+ "loss": 0.1299,
710
+ "step": 100
711
+ },
712
+ {
713
+ "epoch": 0.05254942767950052,
714
+ "grad_norm": 1.7208999395370483,
715
+ "learning_rate": 0.0005,
716
+ "loss": 0.2042,
717
+ "step": 101
718
+ },
719
+ {
720
+ "epoch": 0.053069719042663895,
721
+ "grad_norm": 0.8992323875427246,
722
+ "learning_rate": 0.0004994444444444445,
723
+ "loss": 0.133,
724
+ "step": 102
725
+ },
726
+ {
727
+ "epoch": 0.05359001040582726,
728
+ "grad_norm": 1.1753864288330078,
729
+ "learning_rate": 0.0004988888888888889,
730
+ "loss": 0.1929,
731
+ "step": 103
732
+ },
733
+ {
734
+ "epoch": 0.054110301768990635,
735
+ "grad_norm": 0.8430110812187195,
736
+ "learning_rate": 0.0004983333333333334,
737
+ "loss": 0.1562,
738
+ "step": 104
739
+ },
740
+ {
741
+ "epoch": 0.05463059313215401,
742
+ "grad_norm": 0.7993983626365662,
743
+ "learning_rate": 0.0004977777777777778,
744
+ "loss": 0.1191,
745
+ "step": 105
746
+ },
747
+ {
748
+ "epoch": 0.055150884495317375,
749
+ "grad_norm": 0.7009360194206238,
750
+ "learning_rate": 0.0004972222222222222,
751
+ "loss": 0.0774,
752
+ "step": 106
753
+ },
754
+ {
755
+ "epoch": 0.05567117585848075,
756
+ "grad_norm": 1.1701024770736694,
757
+ "learning_rate": 0.0004966666666666666,
758
+ "loss": 0.1479,
759
+ "step": 107
760
+ },
761
+ {
762
+ "epoch": 0.05619146722164412,
763
+ "grad_norm": 1.1719233989715576,
764
+ "learning_rate": 0.0004961111111111111,
765
+ "loss": 0.1862,
766
+ "step": 108
767
+ },
768
+ {
769
+ "epoch": 0.056711758584807495,
770
+ "grad_norm": 0.7088543772697449,
771
+ "learning_rate": 0.0004955555555555556,
772
+ "loss": 0.089,
773
+ "step": 109
774
+ },
775
+ {
776
+ "epoch": 0.05723204994797086,
777
+ "grad_norm": 0.7498027086257935,
778
+ "learning_rate": 0.000495,
779
+ "loss": 0.1664,
780
+ "step": 110
781
+ },
782
+ {
783
+ "epoch": 0.057752341311134235,
784
+ "grad_norm": 1.3316175937652588,
785
+ "learning_rate": 0.0004944444444444445,
786
+ "loss": 0.1582,
787
+ "step": 111
788
+ },
789
+ {
790
+ "epoch": 0.05827263267429761,
791
+ "grad_norm": 1.1741178035736084,
792
+ "learning_rate": 0.0004938888888888889,
793
+ "loss": 0.1409,
794
+ "step": 112
795
+ },
796
+ {
797
+ "epoch": 0.058792924037460975,
798
+ "grad_norm": 0.8257745504379272,
799
+ "learning_rate": 0.0004933333333333334,
800
+ "loss": 0.1443,
801
+ "step": 113
802
+ },
803
+ {
804
+ "epoch": 0.05931321540062435,
805
+ "grad_norm": 0.7418781518936157,
806
+ "learning_rate": 0.0004927777777777777,
807
+ "loss": 0.0851,
808
+ "step": 114
809
+ },
810
+ {
811
+ "epoch": 0.05983350676378772,
812
+ "grad_norm": 0.7079729437828064,
813
+ "learning_rate": 0.0004922222222222222,
814
+ "loss": 0.1046,
815
+ "step": 115
816
+ },
817
+ {
818
+ "epoch": 0.060353798126951096,
819
+ "grad_norm": 0.8635478019714355,
820
+ "learning_rate": 0.0004916666666666666,
821
+ "loss": 0.1176,
822
+ "step": 116
823
+ },
824
+ {
825
+ "epoch": 0.06087408949011446,
826
+ "grad_norm": 0.9280768632888794,
827
+ "learning_rate": 0.0004911111111111111,
828
+ "loss": 0.1064,
829
+ "step": 117
830
+ },
831
+ {
832
+ "epoch": 0.061394380853277836,
833
+ "grad_norm": 1.0225319862365723,
834
+ "learning_rate": 0.0004905555555555556,
835
+ "loss": 0.1482,
836
+ "step": 118
837
+ },
838
+ {
839
+ "epoch": 0.06191467221644121,
840
+ "grad_norm": 0.6103273630142212,
841
+ "learning_rate": 0.00049,
842
+ "loss": 0.0657,
843
+ "step": 119
844
+ },
845
+ {
846
+ "epoch": 0.062434963579604576,
847
+ "grad_norm": 0.7268538475036621,
848
+ "learning_rate": 0.0004894444444444445,
849
+ "loss": 0.147,
850
+ "step": 120
851
+ },
852
+ {
853
+ "epoch": 0.06295525494276795,
854
+ "grad_norm": 0.7652425765991211,
855
+ "learning_rate": 0.0004888888888888889,
856
+ "loss": 0.1118,
857
+ "step": 121
858
+ },
859
+ {
860
+ "epoch": 0.06347554630593132,
861
+ "grad_norm": 0.7623610496520996,
862
+ "learning_rate": 0.0004883333333333333,
863
+ "loss": 0.1252,
864
+ "step": 122
865
+ },
866
+ {
867
+ "epoch": 0.0639958376690947,
868
+ "grad_norm": 0.9734853506088257,
869
+ "learning_rate": 0.0004877777777777778,
870
+ "loss": 0.1418,
871
+ "step": 123
872
+ },
873
+ {
874
+ "epoch": 0.06451612903225806,
875
+ "grad_norm": 0.8588402271270752,
876
+ "learning_rate": 0.0004872222222222222,
877
+ "loss": 0.0848,
878
+ "step": 124
879
+ },
880
+ {
881
+ "epoch": 0.06503642039542143,
882
+ "grad_norm": 0.5615188479423523,
883
+ "learning_rate": 0.0004866666666666667,
884
+ "loss": 0.1006,
885
+ "step": 125
886
+ },
887
+ {
888
+ "epoch": 0.06555671175858481,
889
+ "grad_norm": 0.9584555625915527,
890
+ "learning_rate": 0.0004861111111111111,
891
+ "loss": 0.1728,
892
+ "step": 126
893
+ },
894
+ {
895
+ "epoch": 0.06607700312174818,
896
+ "grad_norm": 0.6202451586723328,
897
+ "learning_rate": 0.0004855555555555556,
898
+ "loss": 0.0931,
899
+ "step": 127
900
+ },
901
+ {
902
+ "epoch": 0.06659729448491156,
903
+ "grad_norm": 0.6236227750778198,
904
+ "learning_rate": 0.00048499999999999997,
905
+ "loss": 0.0674,
906
+ "step": 128
907
+ },
908
+ {
909
+ "epoch": 0.06711758584807492,
910
+ "grad_norm": 0.66746985912323,
911
+ "learning_rate": 0.00048444444444444446,
912
+ "loss": 0.1218,
913
+ "step": 129
914
+ },
915
+ {
916
+ "epoch": 0.06763787721123829,
917
+ "grad_norm": 0.5942522883415222,
918
+ "learning_rate": 0.0004838888888888889,
919
+ "loss": 0.0748,
920
+ "step": 130
921
+ },
922
+ {
923
+ "epoch": 0.06815816857440167,
924
+ "grad_norm": 0.6593474745750427,
925
+ "learning_rate": 0.00048333333333333334,
926
+ "loss": 0.1078,
927
+ "step": 131
928
+ },
929
+ {
930
+ "epoch": 0.06867845993756504,
931
+ "grad_norm": 0.9823837876319885,
932
+ "learning_rate": 0.0004827777777777778,
933
+ "loss": 0.143,
934
+ "step": 132
935
+ },
936
+ {
937
+ "epoch": 0.0691987513007284,
938
+ "grad_norm": 0.6464436054229736,
939
+ "learning_rate": 0.0004822222222222222,
940
+ "loss": 0.1289,
941
+ "step": 133
942
+ },
943
+ {
944
+ "epoch": 0.06971904266389178,
945
+ "grad_norm": 0.8930130004882812,
946
+ "learning_rate": 0.0004816666666666667,
947
+ "loss": 0.1437,
948
+ "step": 134
949
+ },
950
+ {
951
+ "epoch": 0.07023933402705515,
952
+ "grad_norm": 0.6195514798164368,
953
+ "learning_rate": 0.0004811111111111111,
954
+ "loss": 0.0743,
955
+ "step": 135
956
+ },
957
+ {
958
+ "epoch": 0.07075962539021852,
959
+ "grad_norm": 0.5456336736679077,
960
+ "learning_rate": 0.0004805555555555556,
961
+ "loss": 0.1139,
962
+ "step": 136
963
+ },
964
+ {
965
+ "epoch": 0.0712799167533819,
966
+ "grad_norm": 0.5359215140342712,
967
+ "learning_rate": 0.00048,
968
+ "loss": 0.0969,
969
+ "step": 137
970
+ },
971
+ {
972
+ "epoch": 0.07180020811654526,
973
+ "grad_norm": 0.8201822638511658,
974
+ "learning_rate": 0.00047944444444444445,
975
+ "loss": 0.0993,
976
+ "step": 138
977
+ },
978
+ {
979
+ "epoch": 0.07232049947970863,
980
+ "grad_norm": 0.6110750436782837,
981
+ "learning_rate": 0.0004788888888888889,
982
+ "loss": 0.1011,
983
+ "step": 139
984
+ },
985
+ {
986
+ "epoch": 0.07284079084287201,
987
+ "grad_norm": 0.48351359367370605,
988
+ "learning_rate": 0.0004783333333333333,
989
+ "loss": 0.0869,
990
+ "step": 140
991
+ },
992
+ {
993
+ "epoch": 0.07336108220603538,
994
+ "grad_norm": 0.6683951020240784,
995
+ "learning_rate": 0.0004777777777777778,
996
+ "loss": 0.0814,
997
+ "step": 141
998
+ },
999
+ {
1000
+ "epoch": 0.07388137356919876,
1001
+ "grad_norm": 0.742268443107605,
1002
+ "learning_rate": 0.00047722222222222225,
1003
+ "loss": 0.0894,
1004
+ "step": 142
1005
+ },
1006
+ {
1007
+ "epoch": 0.07440166493236212,
1008
+ "grad_norm": 0.6042747497558594,
1009
+ "learning_rate": 0.0004766666666666667,
1010
+ "loss": 0.0794,
1011
+ "step": 143
1012
+ },
1013
+ {
1014
+ "epoch": 0.07492195629552549,
1015
+ "grad_norm": 0.6750574111938477,
1016
+ "learning_rate": 0.0004761111111111111,
1017
+ "loss": 0.0801,
1018
+ "step": 144
1019
+ },
1020
+ {
1021
+ "epoch": 0.07544224765868887,
1022
+ "grad_norm": 0.6264745593070984,
1023
+ "learning_rate": 0.00047555555555555556,
1024
+ "loss": 0.1127,
1025
+ "step": 145
1026
+ },
1027
+ {
1028
+ "epoch": 0.07596253902185224,
1029
+ "grad_norm": 0.7027119994163513,
1030
+ "learning_rate": 0.000475,
1031
+ "loss": 0.1043,
1032
+ "step": 146
1033
+ },
1034
+ {
1035
+ "epoch": 0.0764828303850156,
1036
+ "grad_norm": 0.5967740416526794,
1037
+ "learning_rate": 0.00047444444444444444,
1038
+ "loss": 0.1012,
1039
+ "step": 147
1040
+ },
1041
+ {
1042
+ "epoch": 0.07700312174817898,
1043
+ "grad_norm": 0.6070584058761597,
1044
+ "learning_rate": 0.00047388888888888893,
1045
+ "loss": 0.1279,
1046
+ "step": 148
1047
+ },
1048
+ {
1049
+ "epoch": 0.07752341311134235,
1050
+ "grad_norm": 0.5560263991355896,
1051
+ "learning_rate": 0.00047333333333333336,
1052
+ "loss": 0.0862,
1053
+ "step": 149
1054
+ },
1055
+ {
1056
+ "epoch": 0.07804370447450572,
1057
+ "grad_norm": 0.5680839419364929,
1058
+ "learning_rate": 0.0004727777777777778,
1059
+ "loss": 0.0542,
1060
+ "step": 150
1061
+ },
1062
+ {
1063
+ "epoch": 0.0785639958376691,
1064
+ "grad_norm": 0.7852948904037476,
1065
+ "learning_rate": 0.00047222222222222224,
1066
+ "loss": 0.1563,
1067
+ "step": 151
1068
+ },
1069
+ {
1070
+ "epoch": 0.07908428720083246,
1071
+ "grad_norm": 0.6082651019096375,
1072
+ "learning_rate": 0.0004716666666666667,
1073
+ "loss": 0.0897,
1074
+ "step": 152
1075
+ },
1076
+ {
1077
+ "epoch": 0.07960457856399583,
1078
+ "grad_norm": 0.43691495060920715,
1079
+ "learning_rate": 0.0004711111111111111,
1080
+ "loss": 0.0923,
1081
+ "step": 153
1082
+ },
1083
+ {
1084
+ "epoch": 0.08012486992715921,
1085
+ "grad_norm": 0.5423274040222168,
1086
+ "learning_rate": 0.00047055555555555555,
1087
+ "loss": 0.1048,
1088
+ "step": 154
1089
+ },
1090
+ {
1091
+ "epoch": 0.08064516129032258,
1092
+ "grad_norm": 0.5422453284263611,
1093
+ "learning_rate": 0.00047,
1094
+ "loss": 0.0818,
1095
+ "step": 155
1096
+ },
1097
+ {
1098
+ "epoch": 0.08116545265348596,
1099
+ "grad_norm": 0.3782746493816376,
1100
+ "learning_rate": 0.0004694444444444445,
1101
+ "loss": 0.0763,
1102
+ "step": 156
1103
+ },
1104
+ {
1105
+ "epoch": 0.08168574401664933,
1106
+ "grad_norm": 0.735381543636322,
1107
+ "learning_rate": 0.0004688888888888889,
1108
+ "loss": 0.1249,
1109
+ "step": 157
1110
+ },
1111
+ {
1112
+ "epoch": 0.08220603537981269,
1113
+ "grad_norm": 0.43137192726135254,
1114
+ "learning_rate": 0.00046833333333333335,
1115
+ "loss": 0.0509,
1116
+ "step": 158
1117
+ },
1118
+ {
1119
+ "epoch": 0.08272632674297607,
1120
+ "grad_norm": 0.49553734064102173,
1121
+ "learning_rate": 0.0004677777777777778,
1122
+ "loss": 0.059,
1123
+ "step": 159
1124
+ },
1125
+ {
1126
+ "epoch": 0.08324661810613944,
1127
+ "grad_norm": 0.8710311651229858,
1128
+ "learning_rate": 0.0004672222222222222,
1129
+ "loss": 0.1079,
1130
+ "step": 160
1131
+ },
1132
+ {
1133
+ "epoch": 0.0837669094693028,
1134
+ "grad_norm": 0.3895374536514282,
1135
+ "learning_rate": 0.00046666666666666666,
1136
+ "loss": 0.0761,
1137
+ "step": 161
1138
+ },
1139
+ {
1140
+ "epoch": 0.08428720083246619,
1141
+ "grad_norm": 0.6220502257347107,
1142
+ "learning_rate": 0.0004661111111111111,
1143
+ "loss": 0.1077,
1144
+ "step": 162
1145
+ },
1146
+ {
1147
+ "epoch": 0.08480749219562955,
1148
+ "grad_norm": 0.43123388290405273,
1149
+ "learning_rate": 0.0004655555555555556,
1150
+ "loss": 0.0809,
1151
+ "step": 163
1152
+ },
1153
+ {
1154
+ "epoch": 0.08532778355879292,
1155
+ "grad_norm": 0.5482419729232788,
1156
+ "learning_rate": 0.000465,
1157
+ "loss": 0.0887,
1158
+ "step": 164
1159
+ },
1160
+ {
1161
+ "epoch": 0.0858480749219563,
1162
+ "grad_norm": 0.3709481358528137,
1163
+ "learning_rate": 0.00046444444444444446,
1164
+ "loss": 0.0492,
1165
+ "step": 165
1166
+ },
1167
+ {
1168
+ "epoch": 0.08636836628511967,
1169
+ "grad_norm": 0.3871099650859833,
1170
+ "learning_rate": 0.0004638888888888889,
1171
+ "loss": 0.0525,
1172
+ "step": 166
1173
+ },
1174
+ {
1175
+ "epoch": 0.08688865764828303,
1176
+ "grad_norm": 0.49930575489997864,
1177
+ "learning_rate": 0.00046333333333333334,
1178
+ "loss": 0.0456,
1179
+ "step": 167
1180
+ },
1181
+ {
1182
+ "epoch": 0.08740894901144641,
1183
+ "grad_norm": 0.35331490635871887,
1184
+ "learning_rate": 0.0004627777777777778,
1185
+ "loss": 0.0556,
1186
+ "step": 168
1187
+ },
1188
+ {
1189
+ "epoch": 0.08792924037460978,
1190
+ "grad_norm": 0.3593418300151825,
1191
+ "learning_rate": 0.0004622222222222222,
1192
+ "loss": 0.0664,
1193
+ "step": 169
1194
+ },
1195
+ {
1196
+ "epoch": 0.08844953173777315,
1197
+ "grad_norm": 0.38897961378097534,
1198
+ "learning_rate": 0.0004616666666666667,
1199
+ "loss": 0.0849,
1200
+ "step": 170
1201
+ },
1202
+ {
1203
+ "epoch": 0.08896982310093653,
1204
+ "grad_norm": 0.4496786296367645,
1205
+ "learning_rate": 0.00046111111111111114,
1206
+ "loss": 0.0768,
1207
+ "step": 171
1208
+ },
1209
+ {
1210
+ "epoch": 0.08949011446409989,
1211
+ "grad_norm": 0.43698763847351074,
1212
+ "learning_rate": 0.0004605555555555556,
1213
+ "loss": 0.062,
1214
+ "step": 172
1215
+ },
1216
+ {
1217
+ "epoch": 0.09001040582726327,
1218
+ "grad_norm": 0.3045942485332489,
1219
+ "learning_rate": 0.00046,
1220
+ "loss": 0.0466,
1221
+ "step": 173
1222
+ },
1223
+ {
1224
+ "epoch": 0.09053069719042664,
1225
+ "grad_norm": 0.3364112079143524,
1226
+ "learning_rate": 0.00045944444444444445,
1227
+ "loss": 0.051,
1228
+ "step": 174
1229
+ },
1230
+ {
1231
+ "epoch": 0.09105098855359,
1232
+ "grad_norm": 0.7610157132148743,
1233
+ "learning_rate": 0.0004588888888888889,
1234
+ "loss": 0.0752,
1235
+ "step": 175
1236
+ },
1237
+ {
1238
+ "epoch": 0.09157127991675339,
1239
+ "grad_norm": 0.4646570682525635,
1240
+ "learning_rate": 0.0004583333333333333,
1241
+ "loss": 0.1027,
1242
+ "step": 176
1243
+ },
1244
+ {
1245
+ "epoch": 0.09209157127991675,
1246
+ "grad_norm": 0.8062249422073364,
1247
+ "learning_rate": 0.0004577777777777778,
1248
+ "loss": 0.1622,
1249
+ "step": 177
1250
+ },
1251
+ {
1252
+ "epoch": 0.09261186264308012,
1253
+ "grad_norm": 0.3921089470386505,
1254
+ "learning_rate": 0.0004572222222222222,
1255
+ "loss": 0.042,
1256
+ "step": 178
1257
+ },
1258
+ {
1259
+ "epoch": 0.0931321540062435,
1260
+ "grad_norm": 0.3350071310997009,
1261
+ "learning_rate": 0.0004566666666666667,
1262
+ "loss": 0.0461,
1263
+ "step": 179
1264
+ },
1265
+ {
1266
+ "epoch": 0.09365244536940687,
1267
+ "grad_norm": 0.272399365901947,
1268
+ "learning_rate": 0.0004561111111111111,
1269
+ "loss": 0.0443,
1270
+ "step": 180
1271
+ },
1272
+ {
1273
+ "epoch": 0.09417273673257023,
1274
+ "grad_norm": 0.33471840620040894,
1275
+ "learning_rate": 0.00045555555555555556,
1276
+ "loss": 0.0504,
1277
+ "step": 181
1278
+ },
1279
+ {
1280
+ "epoch": 0.09469302809573361,
1281
+ "grad_norm": 0.3427852392196655,
1282
+ "learning_rate": 0.000455,
1283
+ "loss": 0.0475,
1284
+ "step": 182
1285
+ },
1286
+ {
1287
+ "epoch": 0.09521331945889698,
1288
+ "grad_norm": 0.40719184279441833,
1289
+ "learning_rate": 0.00045444444444444444,
1290
+ "loss": 0.0595,
1291
+ "step": 183
1292
+ },
1293
+ {
1294
+ "epoch": 0.09573361082206035,
1295
+ "grad_norm": 0.35792386531829834,
1296
+ "learning_rate": 0.00045388888888888893,
1297
+ "loss": 0.0593,
1298
+ "step": 184
1299
+ },
1300
+ {
1301
+ "epoch": 0.09625390218522373,
1302
+ "grad_norm": 0.47860586643218994,
1303
+ "learning_rate": 0.0004533333333333333,
1304
+ "loss": 0.0787,
1305
+ "step": 185
1306
+ },
1307
+ {
1308
+ "epoch": 0.0967741935483871,
1309
+ "grad_norm": 0.5289556980133057,
1310
+ "learning_rate": 0.0004527777777777778,
1311
+ "loss": 0.0756,
1312
+ "step": 186
1313
+ },
1314
+ {
1315
+ "epoch": 0.09729448491155047,
1316
+ "grad_norm": 0.4445546567440033,
1317
+ "learning_rate": 0.00045222222222222224,
1318
+ "loss": 0.0611,
1319
+ "step": 187
1320
+ },
1321
+ {
1322
+ "epoch": 0.09781477627471384,
1323
+ "grad_norm": 0.4470248222351074,
1324
+ "learning_rate": 0.0004516666666666667,
1325
+ "loss": 0.0784,
1326
+ "step": 188
1327
+ },
1328
+ {
1329
+ "epoch": 0.09833506763787721,
1330
+ "grad_norm": 0.4186774790287018,
1331
+ "learning_rate": 0.0004511111111111111,
1332
+ "loss": 0.0421,
1333
+ "step": 189
1334
+ },
1335
+ {
1336
+ "epoch": 0.09885535900104059,
1337
+ "grad_norm": 0.28850093483924866,
1338
+ "learning_rate": 0.00045055555555555555,
1339
+ "loss": 0.0414,
1340
+ "step": 190
1341
+ },
1342
+ {
1343
+ "epoch": 0.09937565036420395,
1344
+ "grad_norm": 0.3566621243953705,
1345
+ "learning_rate": 0.00045000000000000004,
1346
+ "loss": 0.0511,
1347
+ "step": 191
1348
+ },
1349
+ {
1350
+ "epoch": 0.09989594172736732,
1351
+ "grad_norm": 0.4454294741153717,
1352
+ "learning_rate": 0.0004494444444444444,
1353
+ "loss": 0.0995,
1354
+ "step": 192
1355
+ },
1356
+ {
1357
+ "epoch": 0.1004162330905307,
1358
+ "grad_norm": 0.45749202370643616,
1359
+ "learning_rate": 0.0004488888888888889,
1360
+ "loss": 0.0946,
1361
+ "step": 193
1362
+ },
1363
+ {
1364
+ "epoch": 0.10093652445369407,
1365
+ "grad_norm": 0.2874762713909149,
1366
+ "learning_rate": 0.0004483333333333333,
1367
+ "loss": 0.0546,
1368
+ "step": 194
1369
+ },
1370
+ {
1371
+ "epoch": 0.10145681581685743,
1372
+ "grad_norm": 0.26859250664711,
1373
+ "learning_rate": 0.0004477777777777778,
1374
+ "loss": 0.0329,
1375
+ "step": 195
1376
+ },
1377
+ {
1378
+ "epoch": 0.10197710718002082,
1379
+ "grad_norm": 0.3758945167064667,
1380
+ "learning_rate": 0.0004472222222222222,
1381
+ "loss": 0.0721,
1382
+ "step": 196
1383
+ },
1384
+ {
1385
+ "epoch": 0.10249739854318418,
1386
+ "grad_norm": 0.3250490725040436,
1387
+ "learning_rate": 0.00044666666666666666,
1388
+ "loss": 0.0454,
1389
+ "step": 197
1390
+ },
1391
+ {
1392
+ "epoch": 0.10301768990634755,
1393
+ "grad_norm": 0.43297529220581055,
1394
+ "learning_rate": 0.00044611111111111115,
1395
+ "loss": 0.0705,
1396
+ "step": 198
1397
+ },
1398
+ {
1399
+ "epoch": 0.10353798126951093,
1400
+ "grad_norm": 0.2871391773223877,
1401
+ "learning_rate": 0.00044555555555555554,
1402
+ "loss": 0.0389,
1403
+ "step": 199
1404
+ },
1405
+ {
1406
+ "epoch": 0.1040582726326743,
1407
+ "grad_norm": 0.3059026896953583,
1408
+ "learning_rate": 0.00044500000000000003,
1409
+ "loss": 0.0532,
1410
+ "step": 200
1411
+ },
1412
+ {
1413
+ "epoch": 0.10457856399583768,
1414
+ "grad_norm": 0.37971311807632446,
1415
+ "learning_rate": 0.0004444444444444444,
1416
+ "loss": 0.0805,
1417
+ "step": 201
1418
+ },
1419
+ {
1420
+ "epoch": 0.10509885535900104,
1421
+ "grad_norm": 0.3776862621307373,
1422
+ "learning_rate": 0.0004438888888888889,
1423
+ "loss": 0.0936,
1424
+ "step": 202
1425
+ },
1426
+ {
1427
+ "epoch": 0.10561914672216441,
1428
+ "grad_norm": 0.33026885986328125,
1429
+ "learning_rate": 0.00044333333333333334,
1430
+ "loss": 0.0453,
1431
+ "step": 203
1432
+ },
1433
+ {
1434
+ "epoch": 0.10613943808532779,
1435
+ "grad_norm": 0.36573582887649536,
1436
+ "learning_rate": 0.0004427777777777778,
1437
+ "loss": 0.0375,
1438
+ "step": 204
1439
+ },
1440
+ {
1441
+ "epoch": 0.10665972944849116,
1442
+ "grad_norm": 0.5324421525001526,
1443
+ "learning_rate": 0.00044222222222222227,
1444
+ "loss": 0.0572,
1445
+ "step": 205
1446
+ },
1447
+ {
1448
+ "epoch": 0.10718002081165452,
1449
+ "grad_norm": 0.2825300395488739,
1450
+ "learning_rate": 0.00044166666666666665,
1451
+ "loss": 0.043,
1452
+ "step": 206
1453
+ },
1454
+ {
1455
+ "epoch": 0.1077003121748179,
1456
+ "grad_norm": 0.5899777412414551,
1457
+ "learning_rate": 0.00044111111111111114,
1458
+ "loss": 0.0601,
1459
+ "step": 207
1460
+ },
1461
+ {
1462
+ "epoch": 0.10822060353798127,
1463
+ "grad_norm": 0.4580536186695099,
1464
+ "learning_rate": 0.0004405555555555555,
1465
+ "loss": 0.0962,
1466
+ "step": 208
1467
+ },
1468
+ {
1469
+ "epoch": 0.10874089490114464,
1470
+ "grad_norm": 0.28349611163139343,
1471
+ "learning_rate": 0.00044,
1472
+ "loss": 0.0499,
1473
+ "step": 209
1474
+ },
1475
+ {
1476
+ "epoch": 0.10926118626430802,
1477
+ "grad_norm": 0.35658761858940125,
1478
+ "learning_rate": 0.0004394444444444445,
1479
+ "loss": 0.063,
1480
+ "step": 210
1481
+ },
1482
+ {
1483
+ "epoch": 0.10978147762747138,
1484
+ "grad_norm": 0.28881627321243286,
1485
+ "learning_rate": 0.0004388888888888889,
1486
+ "loss": 0.0468,
1487
+ "step": 211
1488
+ },
1489
+ {
1490
+ "epoch": 0.11030176899063475,
1491
+ "grad_norm": 0.3207852840423584,
1492
+ "learning_rate": 0.0004383333333333334,
1493
+ "loss": 0.0437,
1494
+ "step": 212
1495
+ },
1496
+ {
1497
+ "epoch": 0.11082206035379813,
1498
+ "grad_norm": 0.3225831985473633,
1499
+ "learning_rate": 0.00043777777777777776,
1500
+ "loss": 0.045,
1501
+ "step": 213
1502
+ },
1503
+ {
1504
+ "epoch": 0.1113423517169615,
1505
+ "grad_norm": 0.3248918056488037,
1506
+ "learning_rate": 0.00043722222222222225,
1507
+ "loss": 0.0725,
1508
+ "step": 214
1509
+ },
1510
+ {
1511
+ "epoch": 0.11186264308012488,
1512
+ "grad_norm": 0.45690080523490906,
1513
+ "learning_rate": 0.00043666666666666664,
1514
+ "loss": 0.0485,
1515
+ "step": 215
1516
+ },
1517
+ {
1518
+ "epoch": 0.11238293444328824,
1519
+ "grad_norm": 0.41606688499450684,
1520
+ "learning_rate": 0.00043611111111111113,
1521
+ "loss": 0.0902,
1522
+ "step": 216
1523
+ },
1524
+ {
1525
+ "epoch": 0.11290322580645161,
1526
+ "grad_norm": 0.2519379258155823,
1527
+ "learning_rate": 0.0004355555555555555,
1528
+ "loss": 0.0347,
1529
+ "step": 217
1530
+ },
1531
+ {
1532
+ "epoch": 0.11342351716961499,
1533
+ "grad_norm": 0.2908113896846771,
1534
+ "learning_rate": 0.000435,
1535
+ "loss": 0.0557,
1536
+ "step": 218
1537
+ },
1538
+ {
1539
+ "epoch": 0.11394380853277836,
1540
+ "grad_norm": 0.3034886121749878,
1541
+ "learning_rate": 0.0004344444444444445,
1542
+ "loss": 0.0774,
1543
+ "step": 219
1544
+ },
1545
+ {
1546
+ "epoch": 0.11446409989594172,
1547
+ "grad_norm": 0.3579472005367279,
1548
+ "learning_rate": 0.0004338888888888889,
1549
+ "loss": 0.0717,
1550
+ "step": 220
1551
+ },
1552
+ {
1553
+ "epoch": 0.1149843912591051,
1554
+ "grad_norm": 0.33985862135887146,
1555
+ "learning_rate": 0.00043333333333333337,
1556
+ "loss": 0.0611,
1557
+ "step": 221
1558
+ },
1559
+ {
1560
+ "epoch": 0.11550468262226847,
1561
+ "grad_norm": 0.42294999957084656,
1562
+ "learning_rate": 0.00043277777777777775,
1563
+ "loss": 0.0824,
1564
+ "step": 222
1565
+ },
1566
+ {
1567
+ "epoch": 0.11602497398543184,
1568
+ "grad_norm": 0.33317992091178894,
1569
+ "learning_rate": 0.00043222222222222224,
1570
+ "loss": 0.0631,
1571
+ "step": 223
1572
+ },
1573
+ {
1574
+ "epoch": 0.11654526534859522,
1575
+ "grad_norm": 0.347391813993454,
1576
+ "learning_rate": 0.0004316666666666667,
1577
+ "loss": 0.0726,
1578
+ "step": 224
1579
+ },
1580
+ {
1581
+ "epoch": 0.11706555671175858,
1582
+ "grad_norm": 0.4332979917526245,
1583
+ "learning_rate": 0.0004311111111111111,
1584
+ "loss": 0.0493,
1585
+ "step": 225
1586
+ },
1587
+ {
1588
+ "epoch": 0.11758584807492195,
1589
+ "grad_norm": 0.2794676721096039,
1590
+ "learning_rate": 0.0004305555555555556,
1591
+ "loss": 0.0428,
1592
+ "step": 226
1593
+ },
1594
+ {
1595
+ "epoch": 0.11810613943808533,
1596
+ "grad_norm": 0.2665698826313019,
1597
+ "learning_rate": 0.00043,
1598
+ "loss": 0.0628,
1599
+ "step": 227
1600
+ },
1601
+ {
1602
+ "epoch": 0.1186264308012487,
1603
+ "grad_norm": 0.47581610083580017,
1604
+ "learning_rate": 0.0004294444444444445,
1605
+ "loss": 0.0412,
1606
+ "step": 228
1607
+ },
1608
+ {
1609
+ "epoch": 0.11914672216441206,
1610
+ "grad_norm": 0.356357216835022,
1611
+ "learning_rate": 0.00042888888888888886,
1612
+ "loss": 0.0309,
1613
+ "step": 229
1614
+ },
1615
+ {
1616
+ "epoch": 0.11966701352757544,
1617
+ "grad_norm": 0.2871776819229126,
1618
+ "learning_rate": 0.00042833333333333335,
1619
+ "loss": 0.0702,
1620
+ "step": 230
1621
+ },
1622
+ {
1623
+ "epoch": 0.12018730489073881,
1624
+ "grad_norm": 0.18419012427330017,
1625
+ "learning_rate": 0.0004277777777777778,
1626
+ "loss": 0.0279,
1627
+ "step": 231
1628
+ },
1629
+ {
1630
+ "epoch": 0.12070759625390219,
1631
+ "grad_norm": 0.33823081851005554,
1632
+ "learning_rate": 0.00042722222222222223,
1633
+ "loss": 0.0315,
1634
+ "step": 232
1635
+ },
1636
+ {
1637
+ "epoch": 0.12122788761706556,
1638
+ "grad_norm": 0.33040091395378113,
1639
+ "learning_rate": 0.0004266666666666667,
1640
+ "loss": 0.0699,
1641
+ "step": 233
1642
+ },
1643
+ {
1644
+ "epoch": 0.12174817898022892,
1645
+ "grad_norm": 0.3405701518058777,
1646
+ "learning_rate": 0.0004261111111111111,
1647
+ "loss": 0.0649,
1648
+ "step": 234
1649
+ },
1650
+ {
1651
+ "epoch": 0.1222684703433923,
1652
+ "grad_norm": 0.49750658869743347,
1653
+ "learning_rate": 0.0004255555555555556,
1654
+ "loss": 0.0546,
1655
+ "step": 235
1656
+ },
1657
+ {
1658
+ "epoch": 0.12278876170655567,
1659
+ "grad_norm": 0.4337189495563507,
1660
+ "learning_rate": 0.000425,
1661
+ "loss": 0.04,
1662
+ "step": 236
1663
+ },
1664
+ {
1665
+ "epoch": 0.12330905306971904,
1666
+ "grad_norm": 0.4933618903160095,
1667
+ "learning_rate": 0.00042444444444444447,
1668
+ "loss": 0.0754,
1669
+ "step": 237
1670
+ },
1671
+ {
1672
+ "epoch": 0.12382934443288242,
1673
+ "grad_norm": 0.2554134130477905,
1674
+ "learning_rate": 0.0004238888888888889,
1675
+ "loss": 0.0466,
1676
+ "step": 238
1677
+ },
1678
+ {
1679
+ "epoch": 0.12434963579604578,
1680
+ "grad_norm": 0.34308987855911255,
1681
+ "learning_rate": 0.00042333333333333334,
1682
+ "loss": 0.0621,
1683
+ "step": 239
1684
+ },
1685
+ {
1686
+ "epoch": 0.12486992715920915,
1687
+ "grad_norm": 0.3505181670188904,
1688
+ "learning_rate": 0.0004227777777777778,
1689
+ "loss": 0.0646,
1690
+ "step": 240
1691
+ },
1692
+ {
1693
+ "epoch": 0.12539021852237253,
1694
+ "grad_norm": 0.36588212847709656,
1695
+ "learning_rate": 0.0004222222222222222,
1696
+ "loss": 0.0528,
1697
+ "step": 241
1698
+ },
1699
+ {
1700
+ "epoch": 0.1259105098855359,
1701
+ "grad_norm": 0.4307107627391815,
1702
+ "learning_rate": 0.0004216666666666667,
1703
+ "loss": 0.0573,
1704
+ "step": 242
1705
+ },
1706
+ {
1707
+ "epoch": 0.12643080124869926,
1708
+ "grad_norm": 0.37300869822502136,
1709
+ "learning_rate": 0.0004211111111111111,
1710
+ "loss": 0.0486,
1711
+ "step": 243
1712
+ },
1713
+ {
1714
+ "epoch": 0.12695109261186263,
1715
+ "grad_norm": 0.30476123094558716,
1716
+ "learning_rate": 0.0004205555555555556,
1717
+ "loss": 0.0508,
1718
+ "step": 244
1719
+ },
1720
+ {
1721
+ "epoch": 0.12747138397502603,
1722
+ "grad_norm": 0.3410852551460266,
1723
+ "learning_rate": 0.00042,
1724
+ "loss": 0.0678,
1725
+ "step": 245
1726
+ },
1727
+ {
1728
+ "epoch": 0.1279916753381894,
1729
+ "grad_norm": 0.2417326420545578,
1730
+ "learning_rate": 0.00041944444444444445,
1731
+ "loss": 0.0359,
1732
+ "step": 246
1733
+ },
1734
+ {
1735
+ "epoch": 0.12851196670135276,
1736
+ "grad_norm": 0.27660486102104187,
1737
+ "learning_rate": 0.0004188888888888889,
1738
+ "loss": 0.0445,
1739
+ "step": 247
1740
+ },
1741
+ {
1742
+ "epoch": 0.12903225806451613,
1743
+ "grad_norm": 0.2319687157869339,
1744
+ "learning_rate": 0.00041833333333333333,
1745
+ "loss": 0.0486,
1746
+ "step": 248
1747
+ },
1748
+ {
1749
+ "epoch": 0.1295525494276795,
1750
+ "grad_norm": 0.3000936210155487,
1751
+ "learning_rate": 0.0004177777777777778,
1752
+ "loss": 0.05,
1753
+ "step": 249
1754
+ },
1755
+ {
1756
+ "epoch": 0.13007284079084286,
1757
+ "grad_norm": 0.31069836020469666,
1758
+ "learning_rate": 0.0004172222222222222,
1759
+ "loss": 0.0412,
1760
+ "step": 250
1761
+ },
1762
+ {
1763
+ "epoch": 0.13059313215400625,
1764
+ "grad_norm": 0.24123403429985046,
1765
+ "learning_rate": 0.0004166666666666667,
1766
+ "loss": 0.0573,
1767
+ "step": 251
1768
+ },
1769
+ {
1770
+ "epoch": 0.13111342351716962,
1771
+ "grad_norm": 0.2845012843608856,
1772
+ "learning_rate": 0.00041611111111111113,
1773
+ "loss": 0.0322,
1774
+ "step": 252
1775
+ },
1776
+ {
1777
+ "epoch": 0.13163371488033299,
1778
+ "grad_norm": 0.3060798943042755,
1779
+ "learning_rate": 0.00041555555555555557,
1780
+ "loss": 0.0328,
1781
+ "step": 253
1782
+ },
1783
+ {
1784
+ "epoch": 0.13215400624349635,
1785
+ "grad_norm": 0.3751870393753052,
1786
+ "learning_rate": 0.000415,
1787
+ "loss": 0.0481,
1788
+ "step": 254
1789
+ },
1790
+ {
1791
+ "epoch": 0.13267429760665972,
1792
+ "grad_norm": 0.27469107508659363,
1793
+ "learning_rate": 0.00041444444444444444,
1794
+ "loss": 0.0415,
1795
+ "step": 255
1796
+ },
1797
+ {
1798
+ "epoch": 0.1331945889698231,
1799
+ "grad_norm": 0.32122480869293213,
1800
+ "learning_rate": 0.0004138888888888889,
1801
+ "loss": 0.0641,
1802
+ "step": 256
1803
+ },
1804
+ {
1805
+ "epoch": 0.13371488033298648,
1806
+ "grad_norm": 0.34307950735092163,
1807
+ "learning_rate": 0.0004133333333333333,
1808
+ "loss": 0.0623,
1809
+ "step": 257
1810
+ },
1811
+ {
1812
+ "epoch": 0.13423517169614985,
1813
+ "grad_norm": 0.25482696294784546,
1814
+ "learning_rate": 0.0004127777777777778,
1815
+ "loss": 0.044,
1816
+ "step": 258
1817
+ },
1818
+ {
1819
+ "epoch": 0.1347554630593132,
1820
+ "grad_norm": 0.4288344383239746,
1821
+ "learning_rate": 0.00041222222222222224,
1822
+ "loss": 0.0757,
1823
+ "step": 259
1824
+ },
1825
+ {
1826
+ "epoch": 0.13527575442247658,
1827
+ "grad_norm": 0.24957087635993958,
1828
+ "learning_rate": 0.0004116666666666667,
1829
+ "loss": 0.0328,
1830
+ "step": 260
1831
+ },
1832
+ {
1833
+ "epoch": 0.13579604578563995,
1834
+ "grad_norm": 0.14633908867835999,
1835
+ "learning_rate": 0.0004111111111111111,
1836
+ "loss": 0.0279,
1837
+ "step": 261
1838
+ },
1839
+ {
1840
+ "epoch": 0.13631633714880334,
1841
+ "grad_norm": 0.2976965606212616,
1842
+ "learning_rate": 0.00041055555555555555,
1843
+ "loss": 0.0452,
1844
+ "step": 262
1845
+ },
1846
+ {
1847
+ "epoch": 0.1368366285119667,
1848
+ "grad_norm": 0.3640998601913452,
1849
+ "learning_rate": 0.00041,
1850
+ "loss": 0.0493,
1851
+ "step": 263
1852
+ },
1853
+ {
1854
+ "epoch": 0.13735691987513007,
1855
+ "grad_norm": 0.5083469748497009,
1856
+ "learning_rate": 0.00040944444444444443,
1857
+ "loss": 0.0788,
1858
+ "step": 264
1859
+ },
1860
+ {
1861
+ "epoch": 0.13787721123829344,
1862
+ "grad_norm": 0.24888603389263153,
1863
+ "learning_rate": 0.0004088888888888889,
1864
+ "loss": 0.0641,
1865
+ "step": 265
1866
+ },
1867
+ {
1868
+ "epoch": 0.1383975026014568,
1869
+ "grad_norm": 0.2294796109199524,
1870
+ "learning_rate": 0.00040833333333333336,
1871
+ "loss": 0.0532,
1872
+ "step": 266
1873
+ },
1874
+ {
1875
+ "epoch": 0.1389177939646202,
1876
+ "grad_norm": 0.2386179268360138,
1877
+ "learning_rate": 0.0004077777777777778,
1878
+ "loss": 0.0292,
1879
+ "step": 267
1880
+ },
1881
+ {
1882
+ "epoch": 0.13943808532778357,
1883
+ "grad_norm": 0.23145556449890137,
1884
+ "learning_rate": 0.00040722222222222223,
1885
+ "loss": 0.0499,
1886
+ "step": 268
1887
+ },
1888
+ {
1889
+ "epoch": 0.13995837669094693,
1890
+ "grad_norm": 0.23750337958335876,
1891
+ "learning_rate": 0.00040666666666666667,
1892
+ "loss": 0.0393,
1893
+ "step": 269
1894
+ },
1895
+ {
1896
+ "epoch": 0.1404786680541103,
1897
+ "grad_norm": 0.2392527312040329,
1898
+ "learning_rate": 0.0004061111111111111,
1899
+ "loss": 0.0486,
1900
+ "step": 270
1901
+ },
1902
+ {
1903
+ "epoch": 0.14099895941727367,
1904
+ "grad_norm": 0.26626136898994446,
1905
+ "learning_rate": 0.00040555555555555554,
1906
+ "loss": 0.0385,
1907
+ "step": 271
1908
+ },
1909
+ {
1910
+ "epoch": 0.14151925078043703,
1911
+ "grad_norm": 0.2984270751476288,
1912
+ "learning_rate": 0.00040500000000000003,
1913
+ "loss": 0.049,
1914
+ "step": 272
1915
+ },
1916
+ {
1917
+ "epoch": 0.14203954214360043,
1918
+ "grad_norm": 0.2629552483558655,
1919
+ "learning_rate": 0.00040444444444444447,
1920
+ "loss": 0.0639,
1921
+ "step": 273
1922
+ },
1923
+ {
1924
+ "epoch": 0.1425598335067638,
1925
+ "grad_norm": 0.2580653429031372,
1926
+ "learning_rate": 0.0004038888888888889,
1927
+ "loss": 0.0397,
1928
+ "step": 274
1929
+ },
1930
+ {
1931
+ "epoch": 0.14308012486992716,
1932
+ "grad_norm": 0.2550220787525177,
1933
+ "learning_rate": 0.00040333333333333334,
1934
+ "loss": 0.058,
1935
+ "step": 275
1936
+ },
1937
+ {
1938
+ "epoch": 0.14360041623309053,
1939
+ "grad_norm": 0.15206913650035858,
1940
+ "learning_rate": 0.0004027777777777778,
1941
+ "loss": 0.0293,
1942
+ "step": 276
1943
+ },
1944
+ {
1945
+ "epoch": 0.1441207075962539,
1946
+ "grad_norm": 0.26907533407211304,
1947
+ "learning_rate": 0.0004022222222222222,
1948
+ "loss": 0.0474,
1949
+ "step": 277
1950
+ },
1951
+ {
1952
+ "epoch": 0.14464099895941726,
1953
+ "grad_norm": 0.32724031805992126,
1954
+ "learning_rate": 0.00040166666666666665,
1955
+ "loss": 0.0489,
1956
+ "step": 278
1957
+ },
1958
+ {
1959
+ "epoch": 0.14516129032258066,
1960
+ "grad_norm": 0.298304945230484,
1961
+ "learning_rate": 0.0004011111111111111,
1962
+ "loss": 0.0502,
1963
+ "step": 279
1964
+ },
1965
+ {
1966
+ "epoch": 0.14568158168574402,
1967
+ "grad_norm": 0.24465495347976685,
1968
+ "learning_rate": 0.0004005555555555556,
1969
+ "loss": 0.0375,
1970
+ "step": 280
1971
+ },
1972
+ {
1973
+ "epoch": 0.1462018730489074,
1974
+ "grad_norm": 0.3101779818534851,
1975
+ "learning_rate": 0.0004,
1976
+ "loss": 0.0486,
1977
+ "step": 281
1978
+ },
1979
+ {
1980
+ "epoch": 0.14672216441207075,
1981
+ "grad_norm": 0.3175954520702362,
1982
+ "learning_rate": 0.00039944444444444446,
1983
+ "loss": 0.0456,
1984
+ "step": 282
1985
+ },
1986
+ {
1987
+ "epoch": 0.14724245577523412,
1988
+ "grad_norm": 0.27732956409454346,
1989
+ "learning_rate": 0.0003988888888888889,
1990
+ "loss": 0.0372,
1991
+ "step": 283
1992
+ },
1993
+ {
1994
+ "epoch": 0.14776274713839752,
1995
+ "grad_norm": 0.5436331033706665,
1996
+ "learning_rate": 0.00039833333333333333,
1997
+ "loss": 0.0913,
1998
+ "step": 284
1999
+ },
2000
+ {
2001
+ "epoch": 0.14828303850156088,
2002
+ "grad_norm": 0.38064879179000854,
2003
+ "learning_rate": 0.00039777777777777777,
2004
+ "loss": 0.0879,
2005
+ "step": 285
2006
+ },
2007
+ {
2008
+ "epoch": 0.14880332986472425,
2009
+ "grad_norm": 0.20538319647312164,
2010
+ "learning_rate": 0.0003972222222222222,
2011
+ "loss": 0.0379,
2012
+ "step": 286
2013
+ },
2014
+ {
2015
+ "epoch": 0.14932362122788762,
2016
+ "grad_norm": 0.3068322241306305,
2017
+ "learning_rate": 0.0003966666666666667,
2018
+ "loss": 0.0614,
2019
+ "step": 287
2020
+ },
2021
+ {
2022
+ "epoch": 0.14984391259105098,
2023
+ "grad_norm": 0.2988760769367218,
2024
+ "learning_rate": 0.00039611111111111113,
2025
+ "loss": 0.0347,
2026
+ "step": 288
2027
+ },
2028
+ {
2029
+ "epoch": 0.15036420395421435,
2030
+ "grad_norm": 0.24667970836162567,
2031
+ "learning_rate": 0.00039555555555555557,
2032
+ "loss": 0.0481,
2033
+ "step": 289
2034
+ },
2035
+ {
2036
+ "epoch": 0.15088449531737774,
2037
+ "grad_norm": 0.3291466236114502,
2038
+ "learning_rate": 0.000395,
2039
+ "loss": 0.0585,
2040
+ "step": 290
2041
+ },
2042
+ {
2043
+ "epoch": 0.1514047866805411,
2044
+ "grad_norm": 0.2097170203924179,
2045
+ "learning_rate": 0.00039444444444444444,
2046
+ "loss": 0.0267,
2047
+ "step": 291
2048
+ },
2049
+ {
2050
+ "epoch": 0.15192507804370448,
2051
+ "grad_norm": 0.33159908652305603,
2052
+ "learning_rate": 0.00039388888888888893,
2053
+ "loss": 0.0328,
2054
+ "step": 292
2055
+ },
2056
+ {
2057
+ "epoch": 0.15244536940686784,
2058
+ "grad_norm": 0.2823585867881775,
2059
+ "learning_rate": 0.0003933333333333333,
2060
+ "loss": 0.0368,
2061
+ "step": 293
2062
+ },
2063
+ {
2064
+ "epoch": 0.1529656607700312,
2065
+ "grad_norm": 0.1939367949962616,
2066
+ "learning_rate": 0.0003927777777777778,
2067
+ "loss": 0.0338,
2068
+ "step": 294
2069
+ },
2070
+ {
2071
+ "epoch": 0.15348595213319458,
2072
+ "grad_norm": 0.23737141489982605,
2073
+ "learning_rate": 0.00039222222222222225,
2074
+ "loss": 0.0522,
2075
+ "step": 295
2076
+ },
2077
+ {
2078
+ "epoch": 0.15400624349635797,
2079
+ "grad_norm": 0.29729461669921875,
2080
+ "learning_rate": 0.0003916666666666667,
2081
+ "loss": 0.0409,
2082
+ "step": 296
2083
+ },
2084
+ {
2085
+ "epoch": 0.15452653485952134,
2086
+ "grad_norm": 0.186125710606575,
2087
+ "learning_rate": 0.0003911111111111111,
2088
+ "loss": 0.035,
2089
+ "step": 297
2090
+ },
2091
+ {
2092
+ "epoch": 0.1550468262226847,
2093
+ "grad_norm": 0.23367059230804443,
2094
+ "learning_rate": 0.00039055555555555556,
2095
+ "loss": 0.0471,
2096
+ "step": 298
2097
+ },
2098
+ {
2099
+ "epoch": 0.15556711758584807,
2100
+ "grad_norm": 0.2210577130317688,
2101
+ "learning_rate": 0.00039000000000000005,
2102
+ "loss": 0.0349,
2103
+ "step": 299
2104
+ },
2105
+ {
2106
+ "epoch": 0.15608740894901144,
2107
+ "grad_norm": 0.5184774994850159,
2108
+ "learning_rate": 0.00038944444444444443,
2109
+ "loss": 0.091,
2110
+ "step": 300
2111
+ },
2112
+ {
2113
+ "epoch": 0.15660770031217483,
2114
+ "grad_norm": 0.2522483170032501,
2115
+ "learning_rate": 0.0003888888888888889,
2116
+ "loss": 0.0491,
2117
+ "step": 301
2118
+ },
2119
+ {
2120
+ "epoch": 0.1571279916753382,
2121
+ "grad_norm": 0.21878671646118164,
2122
+ "learning_rate": 0.0003883333333333333,
2123
+ "loss": 0.0393,
2124
+ "step": 302
2125
+ },
2126
+ {
2127
+ "epoch": 0.15764828303850156,
2128
+ "grad_norm": 0.23364581167697906,
2129
+ "learning_rate": 0.0003877777777777778,
2130
+ "loss": 0.044,
2131
+ "step": 303
2132
+ },
2133
+ {
2134
+ "epoch": 0.15816857440166493,
2135
+ "grad_norm": 0.2022484838962555,
2136
+ "learning_rate": 0.00038722222222222223,
2137
+ "loss": 0.0479,
2138
+ "step": 304
2139
+ },
2140
+ {
2141
+ "epoch": 0.1586888657648283,
2142
+ "grad_norm": 0.2595768868923187,
2143
+ "learning_rate": 0.00038666666666666667,
2144
+ "loss": 0.0393,
2145
+ "step": 305
2146
+ },
2147
+ {
2148
+ "epoch": 0.15920915712799166,
2149
+ "grad_norm": 0.44060519337654114,
2150
+ "learning_rate": 0.00038611111111111116,
2151
+ "loss": 0.0543,
2152
+ "step": 306
2153
+ },
2154
+ {
2155
+ "epoch": 0.15972944849115506,
2156
+ "grad_norm": 0.2032414823770523,
2157
+ "learning_rate": 0.00038555555555555554,
2158
+ "loss": 0.0346,
2159
+ "step": 307
2160
+ },
2161
+ {
2162
+ "epoch": 0.16024973985431842,
2163
+ "grad_norm": 0.3000059723854065,
2164
+ "learning_rate": 0.00038500000000000003,
2165
+ "loss": 0.0427,
2166
+ "step": 308
2167
+ },
2168
+ {
2169
+ "epoch": 0.1607700312174818,
2170
+ "grad_norm": 0.18585479259490967,
2171
+ "learning_rate": 0.0003844444444444444,
2172
+ "loss": 0.0272,
2173
+ "step": 309
2174
+ },
2175
+ {
2176
+ "epoch": 0.16129032258064516,
2177
+ "grad_norm": 0.22494898736476898,
2178
+ "learning_rate": 0.0003838888888888889,
2179
+ "loss": 0.0232,
2180
+ "step": 310
2181
+ },
2182
+ {
2183
+ "epoch": 0.16181061394380852,
2184
+ "grad_norm": 0.19582317769527435,
2185
+ "learning_rate": 0.00038333333333333334,
2186
+ "loss": 0.04,
2187
+ "step": 311
2188
+ },
2189
+ {
2190
+ "epoch": 0.16233090530697192,
2191
+ "grad_norm": 0.4270728826522827,
2192
+ "learning_rate": 0.0003827777777777778,
2193
+ "loss": 0.0619,
2194
+ "step": 312
2195
+ },
2196
+ {
2197
+ "epoch": 0.16285119667013528,
2198
+ "grad_norm": 0.20349858701229095,
2199
+ "learning_rate": 0.0003822222222222223,
2200
+ "loss": 0.0421,
2201
+ "step": 313
2202
+ },
2203
+ {
2204
+ "epoch": 0.16337148803329865,
2205
+ "grad_norm": 0.3676919639110565,
2206
+ "learning_rate": 0.00038166666666666666,
2207
+ "loss": 0.0472,
2208
+ "step": 314
2209
+ },
2210
+ {
2211
+ "epoch": 0.16389177939646202,
2212
+ "grad_norm": 0.32425612211227417,
2213
+ "learning_rate": 0.00038111111111111115,
2214
+ "loss": 0.0663,
2215
+ "step": 315
2216
+ },
2217
+ {
2218
+ "epoch": 0.16441207075962538,
2219
+ "grad_norm": 0.26070570945739746,
2220
+ "learning_rate": 0.00038055555555555553,
2221
+ "loss": 0.0551,
2222
+ "step": 316
2223
+ },
2224
+ {
2225
+ "epoch": 0.16493236212278875,
2226
+ "grad_norm": 0.2385774403810501,
2227
+ "learning_rate": 0.00038,
2228
+ "loss": 0.0508,
2229
+ "step": 317
2230
+ },
2231
+ {
2232
+ "epoch": 0.16545265348595214,
2233
+ "grad_norm": 0.2036454826593399,
2234
+ "learning_rate": 0.0003794444444444444,
2235
+ "loss": 0.0393,
2236
+ "step": 318
2237
+ },
2238
+ {
2239
+ "epoch": 0.1659729448491155,
2240
+ "grad_norm": 0.21891064941883087,
2241
+ "learning_rate": 0.0003788888888888889,
2242
+ "loss": 0.0347,
2243
+ "step": 319
2244
+ },
2245
+ {
2246
+ "epoch": 0.16649323621227888,
2247
+ "grad_norm": 0.18101496994495392,
2248
+ "learning_rate": 0.0003783333333333334,
2249
+ "loss": 0.0295,
2250
+ "step": 320
2251
+ },
2252
+ {
2253
+ "epoch": 0.16701352757544224,
2254
+ "grad_norm": 0.19484540820121765,
2255
+ "learning_rate": 0.00037777777777777777,
2256
+ "loss": 0.0313,
2257
+ "step": 321
2258
+ },
2259
+ {
2260
+ "epoch": 0.1675338189386056,
2261
+ "grad_norm": 0.22532738745212555,
2262
+ "learning_rate": 0.00037722222222222226,
2263
+ "loss": 0.0382,
2264
+ "step": 322
2265
+ },
2266
+ {
2267
+ "epoch": 0.16805411030176898,
2268
+ "grad_norm": 0.2155781388282776,
2269
+ "learning_rate": 0.00037666666666666664,
2270
+ "loss": 0.0352,
2271
+ "step": 323
2272
+ },
2273
+ {
2274
+ "epoch": 0.16857440166493237,
2275
+ "grad_norm": 0.22214792668819427,
2276
+ "learning_rate": 0.00037611111111111113,
2277
+ "loss": 0.0425,
2278
+ "step": 324
2279
+ },
2280
+ {
2281
+ "epoch": 0.16909469302809574,
2282
+ "grad_norm": 0.2648473083972931,
2283
+ "learning_rate": 0.0003755555555555555,
2284
+ "loss": 0.0422,
2285
+ "step": 325
2286
+ },
2287
+ {
2288
+ "epoch": 0.1696149843912591,
2289
+ "grad_norm": 0.22539383172988892,
2290
+ "learning_rate": 0.000375,
2291
+ "loss": 0.0366,
2292
+ "step": 326
2293
+ },
2294
+ {
2295
+ "epoch": 0.17013527575442247,
2296
+ "grad_norm": 0.19195836782455444,
2297
+ "learning_rate": 0.0003744444444444445,
2298
+ "loss": 0.0279,
2299
+ "step": 327
2300
+ },
2301
+ {
2302
+ "epoch": 0.17065556711758584,
2303
+ "grad_norm": 0.2254018783569336,
2304
+ "learning_rate": 0.0003738888888888889,
2305
+ "loss": 0.0455,
2306
+ "step": 328
2307
+ },
2308
+ {
2309
+ "epoch": 0.17117585848074923,
2310
+ "grad_norm": 0.2259969264268875,
2311
+ "learning_rate": 0.0003733333333333334,
2312
+ "loss": 0.0301,
2313
+ "step": 329
2314
+ },
2315
+ {
2316
+ "epoch": 0.1716961498439126,
2317
+ "grad_norm": 0.20350702106952667,
2318
+ "learning_rate": 0.00037277777777777776,
2319
+ "loss": 0.0459,
2320
+ "step": 330
2321
+ },
2322
+ {
2323
+ "epoch": 0.17221644120707597,
2324
+ "grad_norm": 0.17894725501537323,
2325
+ "learning_rate": 0.00037222222222222225,
2326
+ "loss": 0.0325,
2327
+ "step": 331
2328
+ },
2329
+ {
2330
+ "epoch": 0.17273673257023933,
2331
+ "grad_norm": 0.22505900263786316,
2332
+ "learning_rate": 0.00037166666666666663,
2333
+ "loss": 0.0404,
2334
+ "step": 332
2335
+ },
2336
+ {
2337
+ "epoch": 0.1732570239334027,
2338
+ "grad_norm": 0.10483799874782562,
2339
+ "learning_rate": 0.0003711111111111111,
2340
+ "loss": 0.0153,
2341
+ "step": 333
2342
+ },
2343
+ {
2344
+ "epoch": 0.17377731529656607,
2345
+ "grad_norm": 0.1504441499710083,
2346
+ "learning_rate": 0.0003705555555555556,
2347
+ "loss": 0.0281,
2348
+ "step": 334
2349
+ },
2350
+ {
2351
+ "epoch": 0.17429760665972946,
2352
+ "grad_norm": 0.22857385873794556,
2353
+ "learning_rate": 0.00037,
2354
+ "loss": 0.0257,
2355
+ "step": 335
2356
+ },
2357
+ {
2358
+ "epoch": 0.17481789802289283,
2359
+ "grad_norm": 0.19890117645263672,
2360
+ "learning_rate": 0.0003694444444444445,
2361
+ "loss": 0.0275,
2362
+ "step": 336
2363
+ },
2364
+ {
2365
+ "epoch": 0.1753381893860562,
2366
+ "grad_norm": 0.17106270790100098,
2367
+ "learning_rate": 0.00036888888888888887,
2368
+ "loss": 0.0371,
2369
+ "step": 337
2370
+ },
2371
+ {
2372
+ "epoch": 0.17585848074921956,
2373
+ "grad_norm": 0.3300045430660248,
2374
+ "learning_rate": 0.00036833333333333336,
2375
+ "loss": 0.0481,
2376
+ "step": 338
2377
+ },
2378
+ {
2379
+ "epoch": 0.17637877211238293,
2380
+ "grad_norm": 0.26582735776901245,
2381
+ "learning_rate": 0.00036777777777777774,
2382
+ "loss": 0.0398,
2383
+ "step": 339
2384
+ },
2385
+ {
2386
+ "epoch": 0.1768990634755463,
2387
+ "grad_norm": 0.20054687559604645,
2388
+ "learning_rate": 0.00036722222222222223,
2389
+ "loss": 0.0288,
2390
+ "step": 340
2391
+ },
2392
+ {
2393
+ "epoch": 0.1774193548387097,
2394
+ "grad_norm": 0.15207038819789886,
2395
+ "learning_rate": 0.00036666666666666667,
2396
+ "loss": 0.0352,
2397
+ "step": 341
2398
+ },
2399
+ {
2400
+ "epoch": 0.17793964620187305,
2401
+ "grad_norm": 0.13785234093666077,
2402
+ "learning_rate": 0.0003661111111111111,
2403
+ "loss": 0.027,
2404
+ "step": 342
2405
+ },
2406
+ {
2407
+ "epoch": 0.17845993756503642,
2408
+ "grad_norm": 0.16440491378307343,
2409
+ "learning_rate": 0.0003655555555555556,
2410
+ "loss": 0.0327,
2411
+ "step": 343
2412
+ },
2413
+ {
2414
+ "epoch": 0.17898022892819979,
2415
+ "grad_norm": 0.15854951739311218,
2416
+ "learning_rate": 0.000365,
2417
+ "loss": 0.0254,
2418
+ "step": 344
2419
+ },
2420
+ {
2421
+ "epoch": 0.17950052029136315,
2422
+ "grad_norm": 0.1805776059627533,
2423
+ "learning_rate": 0.00036444444444444447,
2424
+ "loss": 0.0425,
2425
+ "step": 345
2426
+ },
2427
+ {
2428
+ "epoch": 0.18002081165452655,
2429
+ "grad_norm": 0.4681404232978821,
2430
+ "learning_rate": 0.00036388888888888886,
2431
+ "loss": 0.0588,
2432
+ "step": 346
2433
+ },
2434
+ {
2435
+ "epoch": 0.1805411030176899,
2436
+ "grad_norm": 0.16028301417827606,
2437
+ "learning_rate": 0.00036333333333333335,
2438
+ "loss": 0.0215,
2439
+ "step": 347
2440
+ },
2441
+ {
2442
+ "epoch": 0.18106139438085328,
2443
+ "grad_norm": 0.16450455784797668,
2444
+ "learning_rate": 0.0003627777777777778,
2445
+ "loss": 0.0245,
2446
+ "step": 348
2447
+ },
2448
+ {
2449
+ "epoch": 0.18158168574401665,
2450
+ "grad_norm": 0.2902337312698364,
2451
+ "learning_rate": 0.0003622222222222222,
2452
+ "loss": 0.0475,
2453
+ "step": 349
2454
+ },
2455
+ {
2456
+ "epoch": 0.18210197710718,
2457
+ "grad_norm": 0.27946949005126953,
2458
+ "learning_rate": 0.0003616666666666667,
2459
+ "loss": 0.0449,
2460
+ "step": 350
2461
+ },
2462
+ {
2463
+ "epoch": 0.18262226847034338,
2464
+ "grad_norm": 0.17264722287654877,
2465
+ "learning_rate": 0.0003611111111111111,
2466
+ "loss": 0.0331,
2467
+ "step": 351
2468
+ },
2469
+ {
2470
+ "epoch": 0.18314255983350677,
2471
+ "grad_norm": 0.24759423732757568,
2472
+ "learning_rate": 0.0003605555555555556,
2473
+ "loss": 0.0385,
2474
+ "step": 352
2475
+ },
2476
+ {
2477
+ "epoch": 0.18366285119667014,
2478
+ "grad_norm": 0.14519743621349335,
2479
+ "learning_rate": 0.00035999999999999997,
2480
+ "loss": 0.0209,
2481
+ "step": 353
2482
+ },
2483
+ {
2484
+ "epoch": 0.1841831425598335,
2485
+ "grad_norm": 0.27116432785987854,
2486
+ "learning_rate": 0.00035944444444444446,
2487
+ "loss": 0.0419,
2488
+ "step": 354
2489
+ },
2490
+ {
2491
+ "epoch": 0.18470343392299687,
2492
+ "grad_norm": 0.1809036135673523,
2493
+ "learning_rate": 0.0003588888888888889,
2494
+ "loss": 0.0423,
2495
+ "step": 355
2496
+ },
2497
+ {
2498
+ "epoch": 0.18522372528616024,
2499
+ "grad_norm": 0.23334546387195587,
2500
+ "learning_rate": 0.00035833333333333333,
2501
+ "loss": 0.0483,
2502
+ "step": 356
2503
+ },
2504
+ {
2505
+ "epoch": 0.18574401664932363,
2506
+ "grad_norm": 0.12199573218822479,
2507
+ "learning_rate": 0.00035777777777777777,
2508
+ "loss": 0.0294,
2509
+ "step": 357
2510
+ },
2511
+ {
2512
+ "epoch": 0.186264308012487,
2513
+ "grad_norm": 0.24833369255065918,
2514
+ "learning_rate": 0.0003572222222222222,
2515
+ "loss": 0.0328,
2516
+ "step": 358
2517
+ },
2518
+ {
2519
+ "epoch": 0.18678459937565037,
2520
+ "grad_norm": 0.46454137563705444,
2521
+ "learning_rate": 0.0003566666666666667,
2522
+ "loss": 0.0598,
2523
+ "step": 359
2524
+ },
2525
+ {
2526
+ "epoch": 0.18730489073881373,
2527
+ "grad_norm": 0.19704070687294006,
2528
+ "learning_rate": 0.0003561111111111111,
2529
+ "loss": 0.0278,
2530
+ "step": 360
2531
+ },
2532
+ {
2533
+ "epoch": 0.1878251821019771,
2534
+ "grad_norm": 0.18981118500232697,
2535
+ "learning_rate": 0.00035555555555555557,
2536
+ "loss": 0.0404,
2537
+ "step": 361
2538
+ },
2539
+ {
2540
+ "epoch": 0.18834547346514047,
2541
+ "grad_norm": 0.20858381688594818,
2542
+ "learning_rate": 0.000355,
2543
+ "loss": 0.0343,
2544
+ "step": 362
2545
+ },
2546
+ {
2547
+ "epoch": 0.18886576482830386,
2548
+ "grad_norm": 0.14601057767868042,
2549
+ "learning_rate": 0.00035444444444444445,
2550
+ "loss": 0.0293,
2551
+ "step": 363
2552
+ },
2553
+ {
2554
+ "epoch": 0.18938605619146723,
2555
+ "grad_norm": 0.1576007604598999,
2556
+ "learning_rate": 0.0003538888888888889,
2557
+ "loss": 0.0328,
2558
+ "step": 364
2559
+ },
2560
+ {
2561
+ "epoch": 0.1899063475546306,
2562
+ "grad_norm": 0.18504372239112854,
2563
+ "learning_rate": 0.0003533333333333333,
2564
+ "loss": 0.0315,
2565
+ "step": 365
2566
+ },
2567
+ {
2568
+ "epoch": 0.19042663891779396,
2569
+ "grad_norm": 0.16126742959022522,
2570
+ "learning_rate": 0.0003527777777777778,
2571
+ "loss": 0.0277,
2572
+ "step": 366
2573
+ },
2574
+ {
2575
+ "epoch": 0.19094693028095733,
2576
+ "grad_norm": 0.22791104018688202,
2577
+ "learning_rate": 0.00035222222222222225,
2578
+ "loss": 0.0452,
2579
+ "step": 367
2580
+ },
2581
+ {
2582
+ "epoch": 0.1914672216441207,
2583
+ "grad_norm": 0.2693690359592438,
2584
+ "learning_rate": 0.0003516666666666667,
2585
+ "loss": 0.0427,
2586
+ "step": 368
2587
+ },
2588
+ {
2589
+ "epoch": 0.1919875130072841,
2590
+ "grad_norm": 0.1650257259607315,
2591
+ "learning_rate": 0.0003511111111111111,
2592
+ "loss": 0.0241,
2593
+ "step": 369
2594
+ },
2595
+ {
2596
+ "epoch": 0.19250780437044746,
2597
+ "grad_norm": 0.22772428393363953,
2598
+ "learning_rate": 0.00035055555555555556,
2599
+ "loss": 0.0483,
2600
+ "step": 370
2601
+ },
2602
+ {
2603
+ "epoch": 0.19302809573361082,
2604
+ "grad_norm": 0.24612616002559662,
2605
+ "learning_rate": 0.00035,
2606
+ "loss": 0.042,
2607
+ "step": 371
2608
+ },
2609
+ {
2610
+ "epoch": 0.1935483870967742,
2611
+ "grad_norm": 0.22736461460590363,
2612
+ "learning_rate": 0.00034944444444444443,
2613
+ "loss": 0.0487,
2614
+ "step": 372
2615
+ },
2616
+ {
2617
+ "epoch": 0.19406867845993755,
2618
+ "grad_norm": 0.23257088661193848,
2619
+ "learning_rate": 0.0003488888888888889,
2620
+ "loss": 0.0515,
2621
+ "step": 373
2622
+ },
2623
+ {
2624
+ "epoch": 0.19458896982310095,
2625
+ "grad_norm": 0.2097531259059906,
2626
+ "learning_rate": 0.00034833333333333336,
2627
+ "loss": 0.0352,
2628
+ "step": 374
2629
+ },
2630
+ {
2631
+ "epoch": 0.19510926118626432,
2632
+ "grad_norm": 0.28301218152046204,
2633
+ "learning_rate": 0.0003477777777777778,
2634
+ "loss": 0.042,
2635
+ "step": 375
2636
+ },
2637
+ {
2638
+ "epoch": 0.19562955254942768,
2639
+ "grad_norm": 0.18338818848133087,
2640
+ "learning_rate": 0.00034722222222222224,
2641
+ "loss": 0.0285,
2642
+ "step": 376
2643
+ },
2644
+ {
2645
+ "epoch": 0.19614984391259105,
2646
+ "grad_norm": 0.21453578770160675,
2647
+ "learning_rate": 0.00034666666666666667,
2648
+ "loss": 0.0557,
2649
+ "step": 377
2650
+ },
2651
+ {
2652
+ "epoch": 0.19667013527575442,
2653
+ "grad_norm": 0.16289933025836945,
2654
+ "learning_rate": 0.0003461111111111111,
2655
+ "loss": 0.0337,
2656
+ "step": 378
2657
+ },
2658
+ {
2659
+ "epoch": 0.19719042663891778,
2660
+ "grad_norm": 0.19443009793758392,
2661
+ "learning_rate": 0.00034555555555555555,
2662
+ "loss": 0.0314,
2663
+ "step": 379
2664
+ },
2665
+ {
2666
+ "epoch": 0.19771071800208118,
2667
+ "grad_norm": 0.24147702753543854,
2668
+ "learning_rate": 0.000345,
2669
+ "loss": 0.051,
2670
+ "step": 380
2671
+ },
2672
+ {
2673
+ "epoch": 0.19823100936524454,
2674
+ "grad_norm": 0.19166404008865356,
2675
+ "learning_rate": 0.0003444444444444445,
2676
+ "loss": 0.041,
2677
+ "step": 381
2678
+ },
2679
+ {
2680
+ "epoch": 0.1987513007284079,
2681
+ "grad_norm": 0.26511725783348083,
2682
+ "learning_rate": 0.0003438888888888889,
2683
+ "loss": 0.043,
2684
+ "step": 382
2685
+ },
2686
+ {
2687
+ "epoch": 0.19927159209157128,
2688
+ "grad_norm": 0.19884304702281952,
2689
+ "learning_rate": 0.00034333333333333335,
2690
+ "loss": 0.0388,
2691
+ "step": 383
2692
+ },
2693
+ {
2694
+ "epoch": 0.19979188345473464,
2695
+ "grad_norm": 0.1897716373205185,
2696
+ "learning_rate": 0.0003427777777777778,
2697
+ "loss": 0.0288,
2698
+ "step": 384
2699
+ },
2700
+ {
2701
+ "epoch": 0.20031217481789804,
2702
+ "grad_norm": 0.228108212351799,
2703
+ "learning_rate": 0.0003422222222222222,
2704
+ "loss": 0.0208,
2705
+ "step": 385
2706
+ },
2707
+ {
2708
+ "epoch": 0.2008324661810614,
2709
+ "grad_norm": 0.22205443680286407,
2710
+ "learning_rate": 0.00034166666666666666,
2711
+ "loss": 0.0449,
2712
+ "step": 386
2713
+ },
2714
+ {
2715
+ "epoch": 0.20135275754422477,
2716
+ "grad_norm": 0.2547477185726166,
2717
+ "learning_rate": 0.0003411111111111111,
2718
+ "loss": 0.0419,
2719
+ "step": 387
2720
+ },
2721
+ {
2722
+ "epoch": 0.20187304890738814,
2723
+ "grad_norm": 0.26517170667648315,
2724
+ "learning_rate": 0.0003405555555555556,
2725
+ "loss": 0.0419,
2726
+ "step": 388
2727
+ },
2728
+ {
2729
+ "epoch": 0.2023933402705515,
2730
+ "grad_norm": 0.37391191720962524,
2731
+ "learning_rate": 0.00034,
2732
+ "loss": 0.0593,
2733
+ "step": 389
2734
+ },
2735
+ {
2736
+ "epoch": 0.20291363163371487,
2737
+ "grad_norm": 0.18347249925136566,
2738
+ "learning_rate": 0.00033944444444444446,
2739
+ "loss": 0.0283,
2740
+ "step": 390
2741
+ },
2742
+ {
2743
+ "epoch": 0.20343392299687826,
2744
+ "grad_norm": 0.20623968541622162,
2745
+ "learning_rate": 0.0003388888888888889,
2746
+ "loss": 0.0308,
2747
+ "step": 391
2748
+ },
2749
+ {
2750
+ "epoch": 0.20395421436004163,
2751
+ "grad_norm": 0.25673940777778625,
2752
+ "learning_rate": 0.00033833333333333334,
2753
+ "loss": 0.0571,
2754
+ "step": 392
2755
+ },
2756
+ {
2757
+ "epoch": 0.204474505723205,
2758
+ "grad_norm": 0.12756018340587616,
2759
+ "learning_rate": 0.00033777777777777777,
2760
+ "loss": 0.0291,
2761
+ "step": 393
2762
+ },
2763
+ {
2764
+ "epoch": 0.20499479708636836,
2765
+ "grad_norm": 0.18630138039588928,
2766
+ "learning_rate": 0.0003372222222222222,
2767
+ "loss": 0.0353,
2768
+ "step": 394
2769
+ },
2770
+ {
2771
+ "epoch": 0.20551508844953173,
2772
+ "grad_norm": 0.1463075578212738,
2773
+ "learning_rate": 0.0003366666666666667,
2774
+ "loss": 0.0274,
2775
+ "step": 395
2776
+ },
2777
+ {
2778
+ "epoch": 0.2060353798126951,
2779
+ "grad_norm": 0.21908532083034515,
2780
+ "learning_rate": 0.00033611111111111114,
2781
+ "loss": 0.0366,
2782
+ "step": 396
2783
+ },
2784
+ {
2785
+ "epoch": 0.2065556711758585,
2786
+ "grad_norm": 0.23834265768527985,
2787
+ "learning_rate": 0.0003355555555555556,
2788
+ "loss": 0.0228,
2789
+ "step": 397
2790
+ },
2791
+ {
2792
+ "epoch": 0.20707596253902186,
2793
+ "grad_norm": 0.264460951089859,
2794
+ "learning_rate": 0.000335,
2795
+ "loss": 0.0412,
2796
+ "step": 398
2797
+ },
2798
+ {
2799
+ "epoch": 0.20759625390218522,
2800
+ "grad_norm": 0.29063087701797485,
2801
+ "learning_rate": 0.00033444444444444445,
2802
+ "loss": 0.0493,
2803
+ "step": 399
2804
+ },
2805
+ {
2806
+ "epoch": 0.2081165452653486,
2807
+ "grad_norm": 0.1892634779214859,
2808
+ "learning_rate": 0.0003338888888888889,
2809
+ "loss": 0.0336,
2810
+ "step": 400
2811
+ },
2812
+ {
2813
+ "epoch": 0.20863683662851196,
2814
+ "grad_norm": 0.2367408573627472,
2815
+ "learning_rate": 0.0003333333333333333,
2816
+ "loss": 0.0332,
2817
+ "step": 401
2818
+ },
2819
+ {
2820
+ "epoch": 0.20915712799167535,
2821
+ "grad_norm": 0.1723126769065857,
2822
+ "learning_rate": 0.0003327777777777778,
2823
+ "loss": 0.0319,
2824
+ "step": 402
2825
+ },
2826
+ {
2827
+ "epoch": 0.20967741935483872,
2828
+ "grad_norm": 0.14627283811569214,
2829
+ "learning_rate": 0.0003322222222222222,
2830
+ "loss": 0.0266,
2831
+ "step": 403
2832
+ },
2833
+ {
2834
+ "epoch": 0.21019771071800208,
2835
+ "grad_norm": 0.1599954068660736,
2836
+ "learning_rate": 0.0003316666666666667,
2837
+ "loss": 0.0388,
2838
+ "step": 404
2839
+ },
2840
+ {
2841
+ "epoch": 0.21071800208116545,
2842
+ "grad_norm": 0.37434253096580505,
2843
+ "learning_rate": 0.0003311111111111111,
2844
+ "loss": 0.0653,
2845
+ "step": 405
2846
+ },
2847
+ {
2848
+ "epoch": 0.21123829344432882,
2849
+ "grad_norm": 0.1519968956708908,
2850
+ "learning_rate": 0.00033055555555555556,
2851
+ "loss": 0.0289,
2852
+ "step": 406
2853
+ },
2854
+ {
2855
+ "epoch": 0.21175858480749218,
2856
+ "grad_norm": 0.14485976099967957,
2857
+ "learning_rate": 0.00033,
2858
+ "loss": 0.0161,
2859
+ "step": 407
2860
+ },
2861
+ {
2862
+ "epoch": 0.21227887617065558,
2863
+ "grad_norm": 0.3291303217411041,
2864
+ "learning_rate": 0.00032944444444444444,
2865
+ "loss": 0.0527,
2866
+ "step": 408
2867
+ },
2868
+ {
2869
+ "epoch": 0.21279916753381894,
2870
+ "grad_norm": 0.19733606278896332,
2871
+ "learning_rate": 0.0003288888888888889,
2872
+ "loss": 0.0405,
2873
+ "step": 409
2874
+ },
2875
+ {
2876
+ "epoch": 0.2133194588969823,
2877
+ "grad_norm": 0.2552485764026642,
2878
+ "learning_rate": 0.0003283333333333333,
2879
+ "loss": 0.0339,
2880
+ "step": 410
2881
+ },
2882
+ {
2883
+ "epoch": 0.21383975026014568,
2884
+ "grad_norm": 0.14234775304794312,
2885
+ "learning_rate": 0.0003277777777777778,
2886
+ "loss": 0.0294,
2887
+ "step": 411
2888
+ },
2889
+ {
2890
+ "epoch": 0.21436004162330904,
2891
+ "grad_norm": 0.2233223021030426,
2892
+ "learning_rate": 0.00032722222222222224,
2893
+ "loss": 0.0492,
2894
+ "step": 412
2895
+ },
2896
+ {
2897
+ "epoch": 0.2148803329864724,
2898
+ "grad_norm": 0.11738775670528412,
2899
+ "learning_rate": 0.0003266666666666667,
2900
+ "loss": 0.0247,
2901
+ "step": 413
2902
+ },
2903
+ {
2904
+ "epoch": 0.2154006243496358,
2905
+ "grad_norm": 0.1777840107679367,
2906
+ "learning_rate": 0.0003261111111111111,
2907
+ "loss": 0.0241,
2908
+ "step": 414
2909
+ },
2910
+ {
2911
+ "epoch": 0.21592091571279917,
2912
+ "grad_norm": 0.20584549009799957,
2913
+ "learning_rate": 0.00032555555555555555,
2914
+ "loss": 0.0251,
2915
+ "step": 415
2916
+ },
2917
+ {
2918
+ "epoch": 0.21644120707596254,
2919
+ "grad_norm": 0.16335804760456085,
2920
+ "learning_rate": 0.00032500000000000004,
2921
+ "loss": 0.0232,
2922
+ "step": 416
2923
+ },
2924
+ {
2925
+ "epoch": 0.2169614984391259,
2926
+ "grad_norm": 0.1476750373840332,
2927
+ "learning_rate": 0.0003244444444444444,
2928
+ "loss": 0.0259,
2929
+ "step": 417
2930
+ },
2931
+ {
2932
+ "epoch": 0.21748178980228927,
2933
+ "grad_norm": 0.25620049238204956,
2934
+ "learning_rate": 0.0003238888888888889,
2935
+ "loss": 0.056,
2936
+ "step": 418
2937
+ },
2938
+ {
2939
+ "epoch": 0.21800208116545267,
2940
+ "grad_norm": 0.2029629945755005,
2941
+ "learning_rate": 0.0003233333333333333,
2942
+ "loss": 0.0318,
2943
+ "step": 419
2944
+ },
2945
+ {
2946
+ "epoch": 0.21852237252861603,
2947
+ "grad_norm": 0.3485390245914459,
2948
+ "learning_rate": 0.0003227777777777778,
2949
+ "loss": 0.0371,
2950
+ "step": 420
2951
+ },
2952
+ {
2953
+ "epoch": 0.2190426638917794,
2954
+ "grad_norm": 0.11944156140089035,
2955
+ "learning_rate": 0.0003222222222222222,
2956
+ "loss": 0.0225,
2957
+ "step": 421
2958
+ },
2959
+ {
2960
+ "epoch": 0.21956295525494277,
2961
+ "grad_norm": 0.1591196358203888,
2962
+ "learning_rate": 0.00032166666666666666,
2963
+ "loss": 0.0312,
2964
+ "step": 422
2965
+ },
2966
+ {
2967
+ "epoch": 0.22008324661810613,
2968
+ "grad_norm": 0.1827545315027237,
2969
+ "learning_rate": 0.00032111111111111115,
2970
+ "loss": 0.0343,
2971
+ "step": 423
2972
+ },
2973
+ {
2974
+ "epoch": 0.2206035379812695,
2975
+ "grad_norm": 0.21761400997638702,
2976
+ "learning_rate": 0.00032055555555555554,
2977
+ "loss": 0.0407,
2978
+ "step": 424
2979
+ },
2980
+ {
2981
+ "epoch": 0.2211238293444329,
2982
+ "grad_norm": 0.18732213973999023,
2983
+ "learning_rate": 0.00032,
2984
+ "loss": 0.0395,
2985
+ "step": 425
2986
+ },
2987
+ {
2988
+ "epoch": 0.22164412070759626,
2989
+ "grad_norm": 0.12878796458244324,
2990
+ "learning_rate": 0.0003194444444444444,
2991
+ "loss": 0.0234,
2992
+ "step": 426
2993
+ },
2994
+ {
2995
+ "epoch": 0.22216441207075963,
2996
+ "grad_norm": 0.29317036271095276,
2997
+ "learning_rate": 0.0003188888888888889,
2998
+ "loss": 0.0436,
2999
+ "step": 427
3000
+ },
3001
+ {
3002
+ "epoch": 0.222684703433923,
3003
+ "grad_norm": 0.27346885204315186,
3004
+ "learning_rate": 0.00031833333333333334,
3005
+ "loss": 0.0359,
3006
+ "step": 428
3007
+ },
3008
+ {
3009
+ "epoch": 0.22320499479708636,
3010
+ "grad_norm": 0.12804454565048218,
3011
+ "learning_rate": 0.0003177777777777778,
3012
+ "loss": 0.0269,
3013
+ "step": 429
3014
+ },
3015
+ {
3016
+ "epoch": 0.22372528616024975,
3017
+ "grad_norm": 0.2954390347003937,
3018
+ "learning_rate": 0.00031722222222222227,
3019
+ "loss": 0.0429,
3020
+ "step": 430
3021
+ },
3022
+ {
3023
+ "epoch": 0.22424557752341312,
3024
+ "grad_norm": 0.12796026468276978,
3025
+ "learning_rate": 0.00031666666666666665,
3026
+ "loss": 0.0346,
3027
+ "step": 431
3028
+ },
3029
+ {
3030
+ "epoch": 0.2247658688865765,
3031
+ "grad_norm": 0.24340416491031647,
3032
+ "learning_rate": 0.00031611111111111114,
3033
+ "loss": 0.0322,
3034
+ "step": 432
3035
+ },
3036
+ {
3037
+ "epoch": 0.22528616024973985,
3038
+ "grad_norm": 0.13648621737957,
3039
+ "learning_rate": 0.0003155555555555555,
3040
+ "loss": 0.0238,
3041
+ "step": 433
3042
+ },
3043
+ {
3044
+ "epoch": 0.22580645161290322,
3045
+ "grad_norm": 0.17342010140419006,
3046
+ "learning_rate": 0.000315,
3047
+ "loss": 0.0222,
3048
+ "step": 434
3049
+ },
3050
+ {
3051
+ "epoch": 0.22632674297606659,
3052
+ "grad_norm": 0.21049854159355164,
3053
+ "learning_rate": 0.0003144444444444445,
3054
+ "loss": 0.022,
3055
+ "step": 435
3056
+ },
3057
+ {
3058
+ "epoch": 0.22684703433922998,
3059
+ "grad_norm": 0.24159543216228485,
3060
+ "learning_rate": 0.0003138888888888889,
3061
+ "loss": 0.0359,
3062
+ "step": 436
3063
+ },
3064
+ {
3065
+ "epoch": 0.22736732570239335,
3066
+ "grad_norm": 0.18714144825935364,
3067
+ "learning_rate": 0.0003133333333333334,
3068
+ "loss": 0.0386,
3069
+ "step": 437
3070
+ },
3071
+ {
3072
+ "epoch": 0.2278876170655567,
3073
+ "grad_norm": 0.24189646542072296,
3074
+ "learning_rate": 0.00031277777777777776,
3075
+ "loss": 0.0382,
3076
+ "step": 438
3077
+ },
3078
+ {
3079
+ "epoch": 0.22840790842872008,
3080
+ "grad_norm": 0.16704939305782318,
3081
+ "learning_rate": 0.00031222222222222225,
3082
+ "loss": 0.0443,
3083
+ "step": 439
3084
+ },
3085
+ {
3086
+ "epoch": 0.22892819979188345,
3087
+ "grad_norm": 0.20545163750648499,
3088
+ "learning_rate": 0.00031166666666666663,
3089
+ "loss": 0.041,
3090
+ "step": 440
3091
+ },
3092
+ {
3093
+ "epoch": 0.2294484911550468,
3094
+ "grad_norm": 0.16772353649139404,
3095
+ "learning_rate": 0.0003111111111111111,
3096
+ "loss": 0.0275,
3097
+ "step": 441
3098
+ },
3099
+ {
3100
+ "epoch": 0.2299687825182102,
3101
+ "grad_norm": 0.22355173528194427,
3102
+ "learning_rate": 0.0003105555555555555,
3103
+ "loss": 0.0352,
3104
+ "step": 442
3105
+ },
3106
+ {
3107
+ "epoch": 0.23048907388137357,
3108
+ "grad_norm": 0.24697473645210266,
3109
+ "learning_rate": 0.00031,
3110
+ "loss": 0.0351,
3111
+ "step": 443
3112
+ },
3113
+ {
3114
+ "epoch": 0.23100936524453694,
3115
+ "grad_norm": 0.17634686827659607,
3116
+ "learning_rate": 0.0003094444444444445,
3117
+ "loss": 0.0274,
3118
+ "step": 444
3119
+ },
3120
+ {
3121
+ "epoch": 0.2315296566077003,
3122
+ "grad_norm": 0.24014054238796234,
3123
+ "learning_rate": 0.0003088888888888889,
3124
+ "loss": 0.0268,
3125
+ "step": 445
3126
+ },
3127
+ {
3128
+ "epoch": 0.23204994797086367,
3129
+ "grad_norm": 0.12158364802598953,
3130
+ "learning_rate": 0.00030833333333333337,
3131
+ "loss": 0.0238,
3132
+ "step": 446
3133
+ },
3134
+ {
3135
+ "epoch": 0.23257023933402707,
3136
+ "grad_norm": 0.24085555970668793,
3137
+ "learning_rate": 0.00030777777777777775,
3138
+ "loss": 0.0332,
3139
+ "step": 447
3140
+ },
3141
+ {
3142
+ "epoch": 0.23309053069719043,
3143
+ "grad_norm": 0.17350415885448456,
3144
+ "learning_rate": 0.00030722222222222224,
3145
+ "loss": 0.0418,
3146
+ "step": 448
3147
+ },
3148
+ {
3149
+ "epoch": 0.2336108220603538,
3150
+ "grad_norm": 0.10665347427129745,
3151
+ "learning_rate": 0.0003066666666666667,
3152
+ "loss": 0.0217,
3153
+ "step": 449
3154
+ },
3155
+ {
3156
+ "epoch": 0.23413111342351717,
3157
+ "grad_norm": 0.23732754588127136,
3158
+ "learning_rate": 0.0003061111111111111,
3159
+ "loss": 0.0406,
3160
+ "step": 450
3161
+ },
3162
+ {
3163
+ "epoch": 0.23465140478668053,
3164
+ "grad_norm": 0.09794217348098755,
3165
+ "learning_rate": 0.0003055555555555556,
3166
+ "loss": 0.0207,
3167
+ "step": 451
3168
+ },
3169
+ {
3170
+ "epoch": 0.2351716961498439,
3171
+ "grad_norm": 0.20581063628196716,
3172
+ "learning_rate": 0.000305,
3173
+ "loss": 0.0339,
3174
+ "step": 452
3175
+ },
3176
+ {
3177
+ "epoch": 0.2356919875130073,
3178
+ "grad_norm": 0.17121030390262604,
3179
+ "learning_rate": 0.0003044444444444445,
3180
+ "loss": 0.0325,
3181
+ "step": 453
3182
+ },
3183
+ {
3184
+ "epoch": 0.23621227887617066,
3185
+ "grad_norm": 0.16894112527370453,
3186
+ "learning_rate": 0.00030388888888888886,
3187
+ "loss": 0.033,
3188
+ "step": 454
3189
+ },
3190
+ {
3191
+ "epoch": 0.23673257023933403,
3192
+ "grad_norm": 0.09503252059221268,
3193
+ "learning_rate": 0.00030333333333333335,
3194
+ "loss": 0.0156,
3195
+ "step": 455
3196
+ },
3197
+ {
3198
+ "epoch": 0.2372528616024974,
3199
+ "grad_norm": 0.2337169647216797,
3200
+ "learning_rate": 0.0003027777777777778,
3201
+ "loss": 0.0208,
3202
+ "step": 456
3203
+ },
3204
+ {
3205
+ "epoch": 0.23777315296566076,
3206
+ "grad_norm": 0.20605909824371338,
3207
+ "learning_rate": 0.0003022222222222222,
3208
+ "loss": 0.034,
3209
+ "step": 457
3210
+ },
3211
+ {
3212
+ "epoch": 0.23829344432882413,
3213
+ "grad_norm": 0.15843386948108673,
3214
+ "learning_rate": 0.0003016666666666667,
3215
+ "loss": 0.0298,
3216
+ "step": 458
3217
+ },
3218
+ {
3219
+ "epoch": 0.23881373569198752,
3220
+ "grad_norm": 0.1802842915058136,
3221
+ "learning_rate": 0.0003011111111111111,
3222
+ "loss": 0.0216,
3223
+ "step": 459
3224
+ },
3225
+ {
3226
+ "epoch": 0.2393340270551509,
3227
+ "grad_norm": 0.13717086613178253,
3228
+ "learning_rate": 0.0003005555555555556,
3229
+ "loss": 0.0249,
3230
+ "step": 460
3231
+ },
3232
+ {
3233
+ "epoch": 0.23985431841831426,
3234
+ "grad_norm": 0.19162088632583618,
3235
+ "learning_rate": 0.0003,
3236
+ "loss": 0.0475,
3237
+ "step": 461
3238
+ },
3239
+ {
3240
+ "epoch": 0.24037460978147762,
3241
+ "grad_norm": 0.23011524975299835,
3242
+ "learning_rate": 0.00029944444444444446,
3243
+ "loss": 0.0313,
3244
+ "step": 462
3245
+ },
3246
+ {
3247
+ "epoch": 0.240894901144641,
3248
+ "grad_norm": 0.18215711414813995,
3249
+ "learning_rate": 0.0002988888888888889,
3250
+ "loss": 0.0378,
3251
+ "step": 463
3252
+ },
3253
+ {
3254
+ "epoch": 0.24141519250780438,
3255
+ "grad_norm": 0.3314879834651947,
3256
+ "learning_rate": 0.00029833333333333334,
3257
+ "loss": 0.0386,
3258
+ "step": 464
3259
+ },
3260
+ {
3261
+ "epoch": 0.24193548387096775,
3262
+ "grad_norm": 0.18399035930633545,
3263
+ "learning_rate": 0.0002977777777777778,
3264
+ "loss": 0.037,
3265
+ "step": 465
3266
+ },
3267
+ {
3268
+ "epoch": 0.24245577523413112,
3269
+ "grad_norm": 0.3071196377277374,
3270
+ "learning_rate": 0.0002972222222222222,
3271
+ "loss": 0.0362,
3272
+ "step": 466
3273
+ },
3274
+ {
3275
+ "epoch": 0.24297606659729448,
3276
+ "grad_norm": 0.13809853792190552,
3277
+ "learning_rate": 0.0002966666666666667,
3278
+ "loss": 0.0276,
3279
+ "step": 467
3280
+ },
3281
+ {
3282
+ "epoch": 0.24349635796045785,
3283
+ "grad_norm": 0.24184127151966095,
3284
+ "learning_rate": 0.0002961111111111111,
3285
+ "loss": 0.0419,
3286
+ "step": 468
3287
+ },
3288
+ {
3289
+ "epoch": 0.24401664932362122,
3290
+ "grad_norm": 0.1667579561471939,
3291
+ "learning_rate": 0.0002955555555555556,
3292
+ "loss": 0.0318,
3293
+ "step": 469
3294
+ },
3295
+ {
3296
+ "epoch": 0.2445369406867846,
3297
+ "grad_norm": 0.1575225442647934,
3298
+ "learning_rate": 0.000295,
3299
+ "loss": 0.0305,
3300
+ "step": 470
3301
+ },
3302
+ {
3303
+ "epoch": 0.24505723204994798,
3304
+ "grad_norm": 0.17671610414981842,
3305
+ "learning_rate": 0.00029444444444444445,
3306
+ "loss": 0.034,
3307
+ "step": 471
3308
+ },
3309
+ {
3310
+ "epoch": 0.24557752341311134,
3311
+ "grad_norm": 0.138526052236557,
3312
+ "learning_rate": 0.0002938888888888889,
3313
+ "loss": 0.0157,
3314
+ "step": 472
3315
+ },
3316
+ {
3317
+ "epoch": 0.2460978147762747,
3318
+ "grad_norm": 0.27597323060035706,
3319
+ "learning_rate": 0.0002933333333333333,
3320
+ "loss": 0.0381,
3321
+ "step": 473
3322
+ },
3323
+ {
3324
+ "epoch": 0.24661810613943808,
3325
+ "grad_norm": 0.15420523285865784,
3326
+ "learning_rate": 0.0002927777777777778,
3327
+ "loss": 0.0226,
3328
+ "step": 474
3329
+ },
3330
+ {
3331
+ "epoch": 0.24713839750260147,
3332
+ "grad_norm": 0.20491866767406464,
3333
+ "learning_rate": 0.0002922222222222222,
3334
+ "loss": 0.0161,
3335
+ "step": 475
3336
+ },
3337
+ {
3338
+ "epoch": 0.24765868886576484,
3339
+ "grad_norm": 0.14067193865776062,
3340
+ "learning_rate": 0.0002916666666666667,
3341
+ "loss": 0.0335,
3342
+ "step": 476
3343
+ },
3344
+ {
3345
+ "epoch": 0.2481789802289282,
3346
+ "grad_norm": 0.19436928629875183,
3347
+ "learning_rate": 0.00029111111111111113,
3348
+ "loss": 0.0435,
3349
+ "step": 477
3350
+ },
3351
+ {
3352
+ "epoch": 0.24869927159209157,
3353
+ "grad_norm": 0.19192419946193695,
3354
+ "learning_rate": 0.00029055555555555556,
3355
+ "loss": 0.031,
3356
+ "step": 478
3357
+ },
3358
+ {
3359
+ "epoch": 0.24921956295525494,
3360
+ "grad_norm": 0.1773335337638855,
3361
+ "learning_rate": 0.00029,
3362
+ "loss": 0.03,
3363
+ "step": 479
3364
+ },
3365
+ {
3366
+ "epoch": 0.2497398543184183,
3367
+ "grad_norm": 0.19989162683486938,
3368
+ "learning_rate": 0.00028944444444444444,
3369
+ "loss": 0.0232,
3370
+ "step": 480
3371
+ },
3372
+ {
3373
+ "epoch": 0.25026014568158167,
3374
+ "grad_norm": 0.1678122878074646,
3375
+ "learning_rate": 0.0002888888888888889,
3376
+ "loss": 0.0324,
3377
+ "step": 481
3378
+ },
3379
+ {
3380
+ "epoch": 0.25078043704474506,
3381
+ "grad_norm": 0.15694859623908997,
3382
+ "learning_rate": 0.0002883333333333333,
3383
+ "loss": 0.0186,
3384
+ "step": 482
3385
+ },
3386
+ {
3387
+ "epoch": 0.2513007284079084,
3388
+ "grad_norm": 0.1930493712425232,
3389
+ "learning_rate": 0.0002877777777777778,
3390
+ "loss": 0.0354,
3391
+ "step": 483
3392
+ },
3393
+ {
3394
+ "epoch": 0.2518210197710718,
3395
+ "grad_norm": 0.29496946930885315,
3396
+ "learning_rate": 0.00028722222222222224,
3397
+ "loss": 0.0499,
3398
+ "step": 484
3399
+ },
3400
+ {
3401
+ "epoch": 0.2523413111342352,
3402
+ "grad_norm": 0.1735425591468811,
3403
+ "learning_rate": 0.0002866666666666667,
3404
+ "loss": 0.0307,
3405
+ "step": 485
3406
+ },
3407
+ {
3408
+ "epoch": 0.25286160249739853,
3409
+ "grad_norm": 0.2830154299736023,
3410
+ "learning_rate": 0.0002861111111111111,
3411
+ "loss": 0.0375,
3412
+ "step": 486
3413
+ },
3414
+ {
3415
+ "epoch": 0.2533818938605619,
3416
+ "grad_norm": 0.13438007235527039,
3417
+ "learning_rate": 0.00028555555555555555,
3418
+ "loss": 0.0327,
3419
+ "step": 487
3420
+ },
3421
+ {
3422
+ "epoch": 0.25390218522372526,
3423
+ "grad_norm": 0.2650485336780548,
3424
+ "learning_rate": 0.000285,
3425
+ "loss": 0.0493,
3426
+ "step": 488
3427
+ },
3428
+ {
3429
+ "epoch": 0.25442247658688866,
3430
+ "grad_norm": 0.17854094505310059,
3431
+ "learning_rate": 0.0002844444444444444,
3432
+ "loss": 0.0325,
3433
+ "step": 489
3434
+ },
3435
+ {
3436
+ "epoch": 0.25494276795005205,
3437
+ "grad_norm": 0.14844731986522675,
3438
+ "learning_rate": 0.0002838888888888889,
3439
+ "loss": 0.0362,
3440
+ "step": 490
3441
+ },
3442
+ {
3443
+ "epoch": 0.2554630593132154,
3444
+ "grad_norm": 0.14285333454608917,
3445
+ "learning_rate": 0.00028333333333333335,
3446
+ "loss": 0.0312,
3447
+ "step": 491
3448
+ },
3449
+ {
3450
+ "epoch": 0.2559833506763788,
3451
+ "grad_norm": 0.3904401361942291,
3452
+ "learning_rate": 0.0002827777777777778,
3453
+ "loss": 0.0506,
3454
+ "step": 492
3455
+ },
3456
+ {
3457
+ "epoch": 0.2565036420395421,
3458
+ "grad_norm": 0.24693432450294495,
3459
+ "learning_rate": 0.00028222222222222223,
3460
+ "loss": 0.0356,
3461
+ "step": 493
3462
+ },
3463
+ {
3464
+ "epoch": 0.2570239334027055,
3465
+ "grad_norm": 0.1814284324645996,
3466
+ "learning_rate": 0.00028166666666666666,
3467
+ "loss": 0.0379,
3468
+ "step": 494
3469
+ },
3470
+ {
3471
+ "epoch": 0.2575442247658689,
3472
+ "grad_norm": 0.1869429349899292,
3473
+ "learning_rate": 0.0002811111111111111,
3474
+ "loss": 0.0384,
3475
+ "step": 495
3476
+ },
3477
+ {
3478
+ "epoch": 0.25806451612903225,
3479
+ "grad_norm": 0.13896095752716064,
3480
+ "learning_rate": 0.00028055555555555554,
3481
+ "loss": 0.0334,
3482
+ "step": 496
3483
+ },
3484
+ {
3485
+ "epoch": 0.25858480749219565,
3486
+ "grad_norm": 0.13905422389507294,
3487
+ "learning_rate": 0.00028000000000000003,
3488
+ "loss": 0.0276,
3489
+ "step": 497
3490
+ },
3491
+ {
3492
+ "epoch": 0.259105098855359,
3493
+ "grad_norm": 0.16455614566802979,
3494
+ "learning_rate": 0.00027944444444444447,
3495
+ "loss": 0.0338,
3496
+ "step": 498
3497
+ },
3498
+ {
3499
+ "epoch": 0.2596253902185224,
3500
+ "grad_norm": 0.22037294507026672,
3501
+ "learning_rate": 0.0002788888888888889,
3502
+ "loss": 0.0492,
3503
+ "step": 499
3504
+ },
3505
+ {
3506
+ "epoch": 0.2601456815816857,
3507
+ "grad_norm": 0.12147378921508789,
3508
+ "learning_rate": 0.00027833333333333334,
3509
+ "loss": 0.033,
3510
+ "step": 500
3511
+ }
3512
+ ],
3513
+ "logging_steps": 1,
3514
+ "max_steps": 1000,
3515
+ "num_input_tokens_seen": 0,
3516
+ "num_train_epochs": 1,
3517
+ "save_steps": 100,
3518
+ "stateful_callbacks": {
3519
+ "TrainerControl": {
3520
+ "args": {
3521
+ "should_epoch_stop": false,
3522
+ "should_evaluate": false,
3523
+ "should_log": false,
3524
+ "should_save": true,
3525
+ "should_training_stop": false
3526
+ },
3527
+ "attributes": {}
3528
+ }
3529
+ },
3530
+ "total_flos": 0.0,
3531
+ "train_batch_size": 128,
3532
+ "trial_name": null,
3533
+ "trial_params": null
3534
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-500/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2_5_VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 1003520,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2_5_VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "max_pixels": 1003520,
26
+ "min_pixels": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "processor_class": "Qwen2_5_VLProcessor",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-600/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "Qwen2_5_VLImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "max_pixels": 1003520,
18
+ "merge_size": 2,
19
+ "min_pixels": 3136,
20
+ "patch_size": 14,
21
+ "processor_class": "Qwen2_5_VLProcessor",
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "max_pixels": 1003520,
26
+ "min_pixels": 3136
27
+ },
28
+ "temporal_patch_size": 2
29
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "processor_class": "Qwen2_5_VLProcessor",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-700/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
experiments/Qwen2_5vl_3B_add_distill_0.2_0.6_DISTILL_FLOOR_0_12_3_h100_3_Classifier_Layer12_V5_i_ret/checkpoint-800/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }