diff --git a/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/README.md b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/adapter_config.json b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67a9e1a88b1544572644691d5f9002ed3e0be5b1 --- /dev/null +++ b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/adapter_config.json @@ -0,0 +1,18 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "/vast/temp/dreilly1/videollama3-image_7b_local", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 16, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "revision": null, + "target_modules": ".*model\\.layers\\..*\\.(v_proj|o_proj|q_proj|down_proj|k_proj|up_proj|gate_proj)$", + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/adapter_model.bin b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..58ca7817d0002fbd9b514202886b587804e5fdb0 --- /dev/null +++ b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ea0d3ac9694cbc47276699b9f351e6bccc34e7ff4761e5229d4fc4d9c7b3bb0 +size 323097578 diff --git a/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/config.json b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec389b20e91372b320f475bc1431bc9731996355 --- /dev/null +++ b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/config.json @@ -0,0 +1,140 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "/vast/temp/dreilly1/videollama3-image_7b_local", + "architectures": [ + "Videollama3Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--configuration_videollama3.Videollama3Qwen2Config", + "AutoModelForCausalLM": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--modeling_videollama3.Videollama3Qwen2ForCausalLM" + }, + "bos_token_id": 151643, + "domain_names": [ + "egocentric", + "depth", + "exocentric" + ], + "enable_probe_diversity_loss": true, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 3584, + "image_aspect_ratio": "square", + "image_size": -1, + "image_token_index": 151665, + "image_token_length": 1, + "include_general_domain": true, + "include_visual_probes": true, + "include_visual_tokens": true, + "initializer_range": 0.02, + "interaction_module": "cross_attention", + "interaction_module_layers": null, + "intermediate_size": 18944, + "is_alignment": false, + "llm_lr": 1e-05, + "max_frames": 180, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "mm_hidden_size": 1152, + "mm_projector_lr": 1e-05, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT", + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -1, + "model_type": "viscop_qwen2", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "num_visual_probes": 8, + "probe_token_index": 151668, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": null, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 16384, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.46.3", + "use_cache": true, + "use_mm_proj": true, + "use_sliding_window": false, + "use_token_compression": false, + "viscop_type": "multi-viscop", + "vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT", + "vision_encoder_config": { + "_attn_implementation_autoset": false, + "_name_or_path": "", + "add_cross_attention": false, + "architectures": null, + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu_pytorch_tanh", + "hidden_size": 1152, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "intermediate_size": 4304, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "videollama3_vision_encoder", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 27, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": null, + "torchscript": false, + "typical_p": 1.0, + "use_bfloat16": false + }, + "vision_encoder_lr": null, + "vocab_size": 152064 +} diff --git a/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/non_lora_trainables.bin b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..3381075bec7c2e9991dd982e752a12538e6c124e --- /dev/null +++ b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ee9d69eb43f474d609bb2ab579bba1882801a4422d418252f322f16114a9de4 +size 1215957382 diff --git a/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/train_viscop_multi-viscop.sh b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/train_viscop_multi-viscop.sh new file mode 100644 index 0000000000000000000000000000000000000000..b3e68bce8d42a799e90e02a6258e42fcf81e2c06 --- /dev/null +++ b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/train_viscop_multi-viscop.sh @@ -0,0 +1,163 @@ +#!/bin/bash +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$SCRIPT_DIR" +while [ "$REPO_ROOT" != "/" ] && [ ! -d "$REPO_ROOT/.git" ] && [ ! -d "$REPO_ROOT/training_jsons" ]; do + REPO_ROOT="$(cd "$REPO_ROOT/.." && pwd)" +done + +CONFIG_FILE="${CONFIG_FILE:-$REPO_ROOT/local.env}" +if [ -f "$CONFIG_FILE" ]; then + set -a + . "$CONFIG_FILE" + set +a +else + echo "Missing config: $CONFIG_FILE" + echo "Create $REPO_ROOT/local.env based on values for this server." + exit 1 +fi + +# Environment Variables +ARG_WORLD_SIZE=${1:-1} +ARG_NPROC_PER_NODE=${2:-8} + +if [[ -v MASTER_ADDR_PASSED ]]; then + ARG_MASTER_ADDR=$MASTER_ADDR_PASSED # passed via slurm submission script +else + ARG_MASTER_ADDR=127.0.0.1 # for dev environments +fi +ARG_MASTER_PORT=12356 +# ARG_RANK=$SLURM_NODEID +ARG_RANK=0 + +# Multiple conditions +if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then + WORLD_SIZE=$ARG_WORLD_SIZE + NPROC_PER_NODE=$ARG_NPROC_PER_NODE +fi + +if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then + MASTER_ADDR=$ARG_MASTER_ADDR + MASTER_PORT=$ARG_MASTER_PORT + RANK=$ARG_RANK +fi + +echo "MASTER_ADDR: $MASTER_ADDR. MASTER_PORT: $MASTER_PORT. RANK: $RANK" +echo "WORLD_SIZE: $WORLD_SIZE" +echo "NPROC_PER_NODE: $NPROC_PER_NODE" + +# Training Arguments +GLOBAL_BATCH_SIZE=128 # aka effective batch size +LOCAL_BATCH_SIZE=8 # batch size per GPU +GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)] +echo $GRADIENT_ACCUMULATION_STEPS + +INIT_MODEL=$VISCOP_INIT_MODEL # path to base VLM (for ViSCoP we use VideoLLaMA3 as the base VLM) + +NUM_DATA_WORKERS=8 +# NUM_TRAIN_EPOCHS=3 # ! +NUM_TRAIN_EPOCHS=1 +LORA_TRAINING=True + +# ViSCoP Arguments +NUM_VISUAL_PROBES=8 # > usually 16 but changed for domain experiment +INTERACTION_MODULE_NAME=cross_attention +INTERACTION_MODULE_POS=all +PASS_PROBES_TO_LLM=True +PASS_VIS_FEATURES_TO_LLM=True + +# Logging Arguments +export WANDB_PROJECT=sony26_mm_viscop +REPORT_TO=wandb +OUTP_DIR=work_dirs/egoexo +RUN_NAME=viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight + +# Data Arguments +DATA_DIR=$VISCOP_DATA_DIR +SHUFFLE_DATA=True # > +DOMAIN_NAMES=("egocentric" "depth" "exocentric") # ("egocentric" "depth" "robotics" "exocentric") # > +INCLUDE_GENERAL_DOMAIN=True # > + +DATA_JSONS=( + "$REPO_ROOT/training_jsons/train-instr_viscop_egoview.json" + "$REPO_ROOT/training_jsons/train-instr_viscop_depthmodality.json" + "$REPO_ROOT/training_jsons/train_instr_viscop_exoviews.json" + # "$REPO_ROOT/training_jsons/D-inBC-text-multi-train-8k-front.json" +) + +TRAINING_JSON="" +for json_file in "${DATA_JSONS[@]}"; do + TRAINING_JSON+="${json_file} " +done + +# ! Debug: remove setting of ego max frames to 40 +# if [[ $TRAINING_JSON == *"egoview"* ]]; then +# MAX_FRAMES=40 # use 40 frames for training on ego +# else +# MAX_FRAMES=180 +# fi +MAX_FRAMES=180 + +# Optional Arguments. Set TESTING to 1 to quickly test the training script without logging or data workers, useful for debugging +TESTING=0 +if [ $TESTING -eq 1 ]; then + NUM_DATA_WORKERS=0 + REPORT_TO=none + RUN_NAME=TESTING +fi + +mkdir -p "${OUTP_DIR}/${RUN_NAME}/" +cp "$0" "${OUTP_DIR}/${RUN_NAME}/" + +torchrun --nnodes $WORLD_SIZE \ + --nproc_per_node $NPROC_PER_NODE \ + --master_addr=$MASTER_ADDR \ + --master_port=$MASTER_PORT \ + --node_rank $RANK \ + viscop/train_multiviscop.py \ + --interaction_module_layers $INTERACTION_MODULE_POS \ + --interaction_module_name $INTERACTION_MODULE_NAME \ + --viscop_type multi-viscop \ + --enable_probe_diversity_loss True \ + --lora_enable $LORA_TRAINING \ + --num_train_epochs $NUM_TRAIN_EPOCHS \ + --deepspeed scripts/zero2.json \ + --model_type viscop_qwen2 \ + --model_path $INIT_MODEL \ + --vision_encoder DAMO-NLP-SG/SigLIP-NaViT \ + --mm_projector_type mlp2x_gelu \ + --data_path $TRAINING_JSON \ + --shuffle_data $SHUFFLE_DATA \ + --domain_names ${DOMAIN_NAMES[@]} \ + --include_general_domain $INCLUDE_GENERAL_DOMAIN \ + --data_folder $DATA_DIR \ + --image_merge_size 2 \ + --video_merge_size 2 \ + --fps 1 \ + --max_frames $MAX_FRAMES \ + --model_max_length 16384 \ + --mm_max_length 10240 \ + --bf16 True \ + --tf32 True \ + --fp16 False \ + --output_dir ${OUTP_DIR}/${RUN_NAME} \ + --per_device_train_batch_size $LOCAL_BATCH_SIZE \ + --per_device_eval_batch_size 2 \ + --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \ + --evaluation_strategy "no" \ + --save_strategy "no" \ + --save_steps 5000 \ + --save_total_limit 1 \ + --mm_projector_lr 1e-5 \ + --llm_lr 1e-5 \ + --weight_decay 0. \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --gradient_checkpointing True \ + --dataloader_num_workers $NUM_DATA_WORKERS \ + --report_to $REPORT_TO \ + --run_name $RUN_NAME \ + --dataset_cache_dir $VISCOP_DATASET_CACHE_DIR \ + --include_visual_tokens $PASS_VIS_FEATURES_TO_LLM \ + --include_visual_probes $PASS_PROBES_TO_LLM \ + --num_visual_probes $NUM_VISUAL_PROBES diff --git a/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/trainer_state.json b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..47c7db449cfbacecc4ab1925dbfb08f0e44d7dff --- /dev/null +++ b/viscop_qwen2.5_7b_MultiVisCoP_DomainProbes-withGeneral-8probeeach-EGO-EXO-DEPTHexpert_train-ViSCoP-projector-LLM_LoRA_ProbeDiversityLoss-v1-10xWeight/trainer_state.json @@ -0,0 +1,6642 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9994321408290744, + "eval_steps": 500, + "global_step": 660, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "autoregressive_loss": 1.2188, + "epoch": 0.0015142911224682945, + "grad_norm": 2.6142666339874268, + "learning_rate": 5.000000000000001e-07, + "loss": 82.0777, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.9047, + "step": 1 + }, + { + "autoregressive_loss": 1.2031, + "epoch": 0.003028582244936589, + "grad_norm": 3.3796701431274414, + "learning_rate": 1.0000000000000002e-06, + "loss": 81.9868, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.9047, + "step": 2 + }, + { + "autoregressive_loss": 1.1953, + "epoch": 0.004542873367404884, + "grad_norm": 3.343059539794922, + "learning_rate": 1.5e-06, + "loss": 81.9496, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.9047, + "step": 3 + }, + { + "autoregressive_loss": 1.2188, + "epoch": 0.006057164489873178, + "grad_norm": 2.2775630950927734, + "learning_rate": 2.0000000000000003e-06, + "loss": 82.1073, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.9047, + "step": 4 + }, + { + "autoregressive_loss": 1.2031, + "epoch": 0.007571455612341473, + "grad_norm": 2.4767560958862305, + "learning_rate": 2.5e-06, + "loss": 82.0266, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.9047, + "step": 5 + }, + { + "autoregressive_loss": 1.2109, + "epoch": 0.009085746734809767, + "grad_norm": 2.3617897033691406, + "learning_rate": 3e-06, + "loss": 82.0437, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.9047, + "step": 6 + }, + { + "autoregressive_loss": 1.1875, + "epoch": 0.010600037857278062, + "grad_norm": 3.5791139602661133, + "learning_rate": 3.5e-06, + "loss": 81.9126, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.9047, + "step": 7 + }, + { + "autoregressive_loss": 1.2031, + "epoch": 0.012114328979746356, + "grad_norm": 2.6964592933654785, + "learning_rate": 4.000000000000001e-06, + "loss": 81.9474, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.9045, + "step": 8 + }, + { + "autoregressive_loss": 1.1875, + "epoch": 0.013628620102214651, + "grad_norm": 3.696932315826416, + "learning_rate": 4.5e-06, + "loss": 81.891, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.9043, + "step": 9 + }, + { + "autoregressive_loss": 1.1875, + "epoch": 0.015142911224682946, + "grad_norm": 5.3065361976623535, + "learning_rate": 5e-06, + "loss": 81.6685, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.9028, + "step": 10 + }, + { + "autoregressive_loss": 1.1719, + "epoch": 0.01665720234715124, + "grad_norm": 4.820298671722412, + "learning_rate": 5.500000000000001e-06, + "loss": 81.3987, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.9008, + "step": 11 + }, + { + "autoregressive_loss": 1.1562, + "epoch": 0.018171493469619535, + "grad_norm": 5.017973899841309, + "learning_rate": 6e-06, + "loss": 81.2102, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.8994, + "step": 12 + }, + { + "autoregressive_loss": 1.1094, + "epoch": 0.019685784592087828, + "grad_norm": 5.868105888366699, + "learning_rate": 6.5000000000000004e-06, + "loss": 80.5655, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.8961, + "step": 13 + }, + { + "autoregressive_loss": 1.0, + "epoch": 0.021200075714556125, + "grad_norm": 6.6121602058410645, + "learning_rate": 7e-06, + "loss": 78.7147, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.8839, + "step": 14 + }, + { + "autoregressive_loss": 1.0547, + "epoch": 0.022714366837024418, + "grad_norm": 5.075581073760986, + "learning_rate": 7.500000000000001e-06, + "loss": 78.8835, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.8806, + "step": 15 + }, + { + "autoregressive_loss": 1.0547, + "epoch": 0.02422865795949271, + "grad_norm": 5.654189109802246, + "learning_rate": 8.000000000000001e-06, + "loss": 77.9934, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.8696, + "step": 16 + }, + { + "autoregressive_loss": 0.9648, + "epoch": 0.02574294908196101, + "grad_norm": 6.3510518074035645, + "learning_rate": 8.5e-06, + "loss": 76.6772, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.8622, + "step": 17 + }, + { + "autoregressive_loss": 0.8438, + "epoch": 0.027257240204429302, + "grad_norm": 5.991873741149902, + "learning_rate": 9e-06, + "loss": 74.7364, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.8498, + "step": 18 + }, + { + "autoregressive_loss": 0.7734, + "epoch": 0.028771531326897595, + "grad_norm": 6.033049583435059, + "learning_rate": 9.5e-06, + "loss": 72.1953, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.8253, + "step": 19 + }, + { + "autoregressive_loss": 0.7383, + "epoch": 0.030285822449365892, + "grad_norm": 6.989581108093262, + "learning_rate": 1e-05, + "loss": 70.1821, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.8034, + "step": 20 + }, + { + "autoregressive_loss": 0.5703, + "epoch": 0.03180011357183418, + "grad_norm": 7.940646171569824, + "learning_rate": 9.999939760836287e-06, + "loss": 67.7485, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.7897, + "step": 21 + }, + { + "autoregressive_loss": 0.5234, + "epoch": 0.03331440469430248, + "grad_norm": 6.354173183441162, + "learning_rate": 9.99975904479664e-06, + "loss": 65.3092, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.7637, + "step": 22 + }, + { + "autoregressive_loss": 0.4277, + "epoch": 0.034828695816770776, + "grad_norm": 8.316685676574707, + "learning_rate": 9.999457856235542e-06, + "loss": 63.2692, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.7482, + "step": 23 + }, + { + "autoregressive_loss": 0.2637, + "epoch": 0.03634298693923907, + "grad_norm": 5.653663158416748, + "learning_rate": 9.999036202410324e-06, + "loss": 60.6835, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.732, + "step": 24 + }, + { + "autoregressive_loss": 0.2266, + "epoch": 0.03785727806170736, + "grad_norm": 4.401732921600342, + "learning_rate": 9.998494093481022e-06, + "loss": 58.4716, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.7083, + "step": 25 + }, + { + "autoregressive_loss": 0.1904, + "epoch": 0.039371569184175656, + "grad_norm": 4.296875, + "learning_rate": 9.997831542510107e-06, + "loss": 56.4626, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.6867, + "step": 26 + }, + { + "autoregressive_loss": 0.1582, + "epoch": 0.04088586030664395, + "grad_norm": 3.489771842956543, + "learning_rate": 9.997048565462188e-06, + "loss": 54.5955, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.6666, + "step": 27 + }, + { + "autoregressive_loss": 0.1523, + "epoch": 0.04240015142911225, + "grad_norm": 3.0337727069854736, + "learning_rate": 9.996145181203616e-06, + "loss": 52.5291, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.6413, + "step": 28 + }, + { + "autoregressive_loss": 0.1367, + "epoch": 0.04391444255158054, + "grad_norm": 2.6953823566436768, + "learning_rate": 9.995121411502037e-06, + "loss": 50.6282, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.6191, + "step": 29 + }, + { + "autoregressive_loss": 0.1309, + "epoch": 0.045428733674048836, + "grad_norm": 3.0111963748931885, + "learning_rate": 9.993977281025862e-06, + "loss": 49.502, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.6057, + "step": 30 + }, + { + "autoregressive_loss": 0.1201, + "epoch": 0.04694302479651713, + "grad_norm": 2.763465642929077, + "learning_rate": 9.99271281734368e-06, + "loss": 47.4393, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.581, + "step": 31 + }, + { + "autoregressive_loss": 0.1216, + "epoch": 0.04845731591898542, + "grad_norm": 3.030705213546753, + "learning_rate": 9.99132805092358e-06, + "loss": 46.1608, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.5649, + "step": 32 + }, + { + "autoregressive_loss": 0.1279, + "epoch": 0.04997160704145372, + "grad_norm": 2.555126905441284, + "learning_rate": 9.989823015132433e-06, + "loss": 44.9071, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.5486, + "step": 33 + }, + { + "autoregressive_loss": 0.1123, + "epoch": 0.05148589816392202, + "grad_norm": 2.971357583999634, + "learning_rate": 9.98819774623508e-06, + "loss": 44.2517, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.5419, + "step": 34 + }, + { + "autoregressive_loss": 0.1206, + "epoch": 0.05300018928639031, + "grad_norm": 3.2798564434051514, + "learning_rate": 9.986452283393452e-06, + "loss": 43.4465, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.531, + "step": 35 + }, + { + "autoregressive_loss": 0.1191, + "epoch": 0.054514480408858604, + "grad_norm": 2.985137462615967, + "learning_rate": 9.984586668665641e-06, + "loss": 42.8869, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.5242, + "step": 36 + }, + { + "autoregressive_loss": 0.1094, + "epoch": 0.0560287715313269, + "grad_norm": 2.3863425254821777, + "learning_rate": 9.982600947004875e-06, + "loss": 42.2216, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.5168, + "step": 37 + }, + { + "autoregressive_loss": 0.1211, + "epoch": 0.05754306265379519, + "grad_norm": 2.226412773132324, + "learning_rate": 9.980495166258437e-06, + "loss": 41.8485, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.511, + "step": 38 + }, + { + "autoregressive_loss": 0.125, + "epoch": 0.059057353776263484, + "grad_norm": 2.429041862487793, + "learning_rate": 9.978269377166517e-06, + "loss": 41.5305, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.5066, + "step": 39 + }, + { + "autoregressive_loss": 0.1167, + "epoch": 0.060571644898731784, + "grad_norm": 2.3527092933654785, + "learning_rate": 9.975923633360985e-06, + "loss": 41.1945, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.5032, + "step": 40 + }, + { + "autoregressive_loss": 0.1167, + "epoch": 0.06208593602120008, + "grad_norm": 2.7156426906585693, + "learning_rate": 9.973457991364098e-06, + "loss": 40.9483, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.5002, + "step": 41 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.06360022714366836, + "grad_norm": 2.3831348419189453, + "learning_rate": 9.970872510587142e-06, + "loss": 40.4648, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4963, + "step": 42 + }, + { + "autoregressive_loss": 0.1162, + "epoch": 0.06511451826613666, + "grad_norm": 2.2227847576141357, + "learning_rate": 9.968167253328995e-06, + "loss": 40.2749, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4918, + "step": 43 + }, + { + "autoregressive_loss": 0.1172, + "epoch": 0.06662880938860496, + "grad_norm": 1.8962568044662476, + "learning_rate": 9.965342284774633e-06, + "loss": 39.8748, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4867, + "step": 44 + }, + { + "autoregressive_loss": 0.1152, + "epoch": 0.06814310051107325, + "grad_norm": 1.9804664850234985, + "learning_rate": 9.962397672993552e-06, + "loss": 39.6829, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4845, + "step": 45 + }, + { + "autoregressive_loss": 0.1104, + "epoch": 0.06965739163354155, + "grad_norm": 1.6518906354904175, + "learning_rate": 9.95933348893813e-06, + "loss": 39.4172, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4817, + "step": 46 + }, + { + "autoregressive_loss": 0.1113, + "epoch": 0.07117168275600984, + "grad_norm": 1.9959464073181152, + "learning_rate": 9.956149806441927e-06, + "loss": 39.1357, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4781, + "step": 47 + }, + { + "autoregressive_loss": 0.1162, + "epoch": 0.07268597387847814, + "grad_norm": 2.205864667892456, + "learning_rate": 9.952846702217886e-06, + "loss": 39.1088, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4773, + "step": 48 + }, + { + "autoregressive_loss": 0.1133, + "epoch": 0.07420026500094644, + "grad_norm": 2.1969475746154785, + "learning_rate": 9.949424255856506e-06, + "loss": 38.9149, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4751, + "step": 49 + }, + { + "autoregressive_loss": 0.1123, + "epoch": 0.07571455612341473, + "grad_norm": 2.4351046085357666, + "learning_rate": 9.945882549823906e-06, + "loss": 38.6665, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4721, + "step": 50 + }, + { + "autoregressive_loss": 0.1104, + "epoch": 0.07722884724588303, + "grad_norm": 1.790277361869812, + "learning_rate": 9.94222166945985e-06, + "loss": 38.622, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4718, + "step": 51 + }, + { + "autoregressive_loss": 0.1123, + "epoch": 0.07874313836835131, + "grad_norm": 2.04057240486145, + "learning_rate": 9.938441702975689e-06, + "loss": 38.5512, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4707, + "step": 52 + }, + { + "autoregressive_loss": 0.1133, + "epoch": 0.08025742949081961, + "grad_norm": 1.6350489854812622, + "learning_rate": 9.93454274145223e-06, + "loss": 38.4603, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4694, + "step": 53 + }, + { + "autoregressive_loss": 0.1143, + "epoch": 0.0817717206132879, + "grad_norm": 1.5349832773208618, + "learning_rate": 9.930524878837544e-06, + "loss": 38.253, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4668, + "step": 54 + }, + { + "autoregressive_loss": 0.1113, + "epoch": 0.0832860117357562, + "grad_norm": 1.4876055717468262, + "learning_rate": 9.926388211944707e-06, + "loss": 38.1555, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4658, + "step": 55 + }, + { + "autoregressive_loss": 0.1182, + "epoch": 0.0848003028582245, + "grad_norm": 1.7751145362854004, + "learning_rate": 9.922132840449459e-06, + "loss": 38.0769, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4642, + "step": 56 + }, + { + "autoregressive_loss": 0.1162, + "epoch": 0.08631459398069279, + "grad_norm": 1.4472836256027222, + "learning_rate": 9.917758866887808e-06, + "loss": 38.0179, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4637, + "step": 57 + }, + { + "autoregressive_loss": 0.1123, + "epoch": 0.08782888510316109, + "grad_norm": 1.4114046096801758, + "learning_rate": 9.91326639665356e-06, + "loss": 37.8867, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4623, + "step": 58 + }, + { + "autoregressive_loss": 0.1089, + "epoch": 0.08934317622562937, + "grad_norm": 1.0022757053375244, + "learning_rate": 9.908655537995772e-06, + "loss": 37.7349, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4608, + "step": 59 + }, + { + "autoregressive_loss": 0.1143, + "epoch": 0.09085746734809767, + "grad_norm": 1.3286306858062744, + "learning_rate": 9.903926402016153e-06, + "loss": 37.6643, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4594, + "step": 60 + }, + { + "autoregressive_loss": 0.1108, + "epoch": 0.09237175847056596, + "grad_norm": 1.3575870990753174, + "learning_rate": 9.899079102666382e-06, + "loss": 37.43, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4568, + "step": 61 + }, + { + "autoregressive_loss": 0.1167, + "epoch": 0.09388604959303426, + "grad_norm": 1.37393057346344, + "learning_rate": 9.894113756745362e-06, + "loss": 37.5437, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4577, + "step": 62 + }, + { + "autoregressive_loss": 0.1172, + "epoch": 0.09540034071550256, + "grad_norm": 1.0836256742477417, + "learning_rate": 9.88903048389641e-06, + "loss": 37.5011, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.457, + "step": 63 + }, + { + "autoregressive_loss": 0.1162, + "epoch": 0.09691463183797085, + "grad_norm": 1.3667747974395752, + "learning_rate": 9.883829406604363e-06, + "loss": 37.429, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4562, + "step": 64 + }, + { + "autoregressive_loss": 0.1104, + "epoch": 0.09842892296043915, + "grad_norm": 1.3270810842514038, + "learning_rate": 9.878510650192644e-06, + "loss": 37.2986, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4552, + "step": 65 + }, + { + "autoregressive_loss": 0.1147, + "epoch": 0.09994321408290743, + "grad_norm": 1.5425379276275635, + "learning_rate": 9.873074342820225e-06, + "loss": 37.4071, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4561, + "step": 66 + }, + { + "autoregressive_loss": 0.1191, + "epoch": 0.10145750520537573, + "grad_norm": 1.288554072380066, + "learning_rate": 9.867520615478554e-06, + "loss": 37.2861, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4542, + "step": 67 + }, + { + "autoregressive_loss": 0.104, + "epoch": 0.10297179632784403, + "grad_norm": 1.3148400783538818, + "learning_rate": 9.861849601988384e-06, + "loss": 37.1677, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4542, + "step": 68 + }, + { + "autoregressive_loss": 0.1094, + "epoch": 0.10448608745031232, + "grad_norm": 1.189699411392212, + "learning_rate": 9.85606143899656e-06, + "loss": 37.0789, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4525, + "step": 69 + }, + { + "autoregressive_loss": 0.1113, + "epoch": 0.10600037857278062, + "grad_norm": 1.1804251670837402, + "learning_rate": 9.850156265972722e-06, + "loss": 36.9441, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4507, + "step": 70 + }, + { + "autoregressive_loss": 0.1147, + "epoch": 0.10751466969524891, + "grad_norm": 1.0557808876037598, + "learning_rate": 9.844134225205941e-06, + "loss": 36.9879, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4509, + "step": 71 + }, + { + "autoregressive_loss": 0.1006, + "epoch": 0.10902896081771721, + "grad_norm": 1.1114401817321777, + "learning_rate": 9.8379954618013e-06, + "loss": 36.7703, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4495, + "step": 72 + }, + { + "autoregressive_loss": 0.1172, + "epoch": 0.1105432519401855, + "grad_norm": 0.9423372745513916, + "learning_rate": 9.831740123676387e-06, + "loss": 36.8552, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.449, + "step": 73 + }, + { + "autoregressive_loss": 0.1104, + "epoch": 0.1120575430626538, + "grad_norm": 1.0795594453811646, + "learning_rate": 9.825368361557738e-06, + "loss": 36.7683, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4486, + "step": 74 + }, + { + "autoregressive_loss": 0.1084, + "epoch": 0.1135718341851221, + "grad_norm": 1.041020154953003, + "learning_rate": 9.8188803289772e-06, + "loss": 36.7949, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4491, + "step": 75 + }, + { + "autoregressive_loss": 0.1123, + "epoch": 0.11508612530759038, + "grad_norm": 1.1202330589294434, + "learning_rate": 9.812276182268236e-06, + "loss": 36.6491, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4469, + "step": 76 + }, + { + "autoregressive_loss": 0.1113, + "epoch": 0.11660041643005868, + "grad_norm": 1.0114858150482178, + "learning_rate": 9.80555608056216e-06, + "loss": 36.672, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4472, + "step": 77 + }, + { + "autoregressive_loss": 0.0996, + "epoch": 0.11811470755252697, + "grad_norm": 0.9755472540855408, + "learning_rate": 9.798720185784288e-06, + "loss": 36.5815, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4473, + "step": 78 + }, + { + "autoregressive_loss": 0.1084, + "epoch": 0.11962899867499527, + "grad_norm": 0.8341207504272461, + "learning_rate": 9.791768662650059e-06, + "loss": 36.6515, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4473, + "step": 79 + }, + { + "autoregressive_loss": 0.1084, + "epoch": 0.12114328979746357, + "grad_norm": 0.9151251316070557, + "learning_rate": 9.784701678661045e-06, + "loss": 36.5847, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4465, + "step": 80 + }, + { + "autoregressive_loss": 0.1069, + "epoch": 0.12265758091993186, + "grad_norm": 0.883343517780304, + "learning_rate": 9.777519404100933e-06, + "loss": 36.5339, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.446, + "step": 81 + }, + { + "autoregressive_loss": 0.0986, + "epoch": 0.12417187204240016, + "grad_norm": 0.8017266392707825, + "learning_rate": 9.770222012031404e-06, + "loss": 36.3655, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4447, + "step": 82 + }, + { + "autoregressive_loss": 0.1099, + "epoch": 0.12568616316486844, + "grad_norm": 0.8046688437461853, + "learning_rate": 9.762809678287977e-06, + "loss": 36.4789, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.445, + "step": 83 + }, + { + "autoregressive_loss": 0.1113, + "epoch": 0.12720045428733673, + "grad_norm": 0.7140364050865173, + "learning_rate": 9.755282581475769e-06, + "loss": 36.3452, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4432, + "step": 84 + }, + { + "autoregressive_loss": 0.1079, + "epoch": 0.12871474540980504, + "grad_norm": 0.7693091630935669, + "learning_rate": 9.747640902965185e-06, + "loss": 36.3388, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4435, + "step": 85 + }, + { + "autoregressive_loss": 0.1104, + "epoch": 0.13022903653227333, + "grad_norm": 0.7321847081184387, + "learning_rate": 9.739884826887554e-06, + "loss": 36.321, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.443, + "step": 86 + }, + { + "autoregressive_loss": 0.1123, + "epoch": 0.13174332765474162, + "grad_norm": 0.7911235690116882, + "learning_rate": 9.73201454013069e-06, + "loss": 36.375, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4434, + "step": 87 + }, + { + "autoregressive_loss": 0.1152, + "epoch": 0.13325761877720993, + "grad_norm": 0.7472660541534424, + "learning_rate": 9.72403023233439e-06, + "loss": 36.3372, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4427, + "step": 88 + }, + { + "autoregressive_loss": 0.1128, + "epoch": 0.13477190989967822, + "grad_norm": 0.8787516355514526, + "learning_rate": 9.715932095885867e-06, + "loss": 36.2593, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.442, + "step": 89 + }, + { + "autoregressive_loss": 0.1143, + "epoch": 0.1362862010221465, + "grad_norm": 0.8006541728973389, + "learning_rate": 9.707720325915105e-06, + "loss": 36.2735, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.442, + "step": 90 + }, + { + "autoregressive_loss": 0.106, + "epoch": 0.1378004921446148, + "grad_norm": 0.7240521311759949, + "learning_rate": 9.699395120290166e-06, + "loss": 36.2262, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4422, + "step": 91 + }, + { + "autoregressive_loss": 0.106, + "epoch": 0.1393147832670831, + "grad_norm": 0.7540751099586487, + "learning_rate": 9.690956679612422e-06, + "loss": 36.1711, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4415, + "step": 92 + }, + { + "autoregressive_loss": 0.1177, + "epoch": 0.1408290743895514, + "grad_norm": 0.6583181619644165, + "learning_rate": 9.682405207211714e-06, + "loss": 36.1711, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4404, + "step": 93 + }, + { + "autoregressive_loss": 0.1016, + "epoch": 0.14234336551201968, + "grad_norm": 0.7352666258811951, + "learning_rate": 9.673740909141463e-06, + "loss": 36.0911, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.441, + "step": 94 + }, + { + "autoregressive_loss": 0.1089, + "epoch": 0.143857656634488, + "grad_norm": 0.6482987403869629, + "learning_rate": 9.664963994173695e-06, + "loss": 36.0657, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4399, + "step": 95 + }, + { + "autoregressive_loss": 0.1045, + "epoch": 0.14537194775695628, + "grad_norm": 0.7324978709220886, + "learning_rate": 9.656074673794018e-06, + "loss": 36.0148, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4397, + "step": 96 + }, + { + "autoregressive_loss": 0.1128, + "epoch": 0.14688623887942456, + "grad_norm": 0.7501246929168701, + "learning_rate": 9.647073162196524e-06, + "loss": 35.992, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4386, + "step": 97 + }, + { + "autoregressive_loss": 0.1113, + "epoch": 0.14840053000189288, + "grad_norm": 0.718576967716217, + "learning_rate": 9.637959676278621e-06, + "loss": 36.0178, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4391, + "step": 98 + }, + { + "autoregressive_loss": 0.1108, + "epoch": 0.14991482112436116, + "grad_norm": 0.5948686003684998, + "learning_rate": 9.62873443563582e-06, + "loss": 35.9593, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4384, + "step": 99 + }, + { + "autoregressive_loss": 0.0991, + "epoch": 0.15142911224682945, + "grad_norm": 0.6520156860351562, + "learning_rate": 9.619397662556434e-06, + "loss": 35.8888, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4387, + "step": 100 + }, + { + "autoregressive_loss": 0.1084, + "epoch": 0.15294340336929774, + "grad_norm": 0.7031528353691101, + "learning_rate": 9.609949582016223e-06, + "loss": 35.9066, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.438, + "step": 101 + }, + { + "autoregressive_loss": 0.1064, + "epoch": 0.15445769449176605, + "grad_norm": 0.7441109418869019, + "learning_rate": 9.600390421672976e-06, + "loss": 35.8985, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4381, + "step": 102 + }, + { + "autoregressive_loss": 0.1089, + "epoch": 0.15597198561423434, + "grad_norm": 0.6931670308113098, + "learning_rate": 9.590720411861022e-06, + "loss": 35.8733, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4375, + "step": 103 + }, + { + "autoregressive_loss": 0.1113, + "epoch": 0.15748627673670262, + "grad_norm": 0.6845383644104004, + "learning_rate": 9.58093978558568e-06, + "loss": 35.8856, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4374, + "step": 104 + }, + { + "autoregressive_loss": 0.0981, + "epoch": 0.15900056785917094, + "grad_norm": 0.6183035373687744, + "learning_rate": 9.571048778517655e-06, + "loss": 35.7819, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4374, + "step": 105 + }, + { + "autoregressive_loss": 0.1123, + "epoch": 0.16051485898163922, + "grad_norm": 0.6418727040290833, + "learning_rate": 9.561047628987338e-06, + "loss": 35.8681, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4372, + "step": 106 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.1620291501041075, + "grad_norm": 0.6316526532173157, + "learning_rate": 9.55093657797909e-06, + "loss": 35.7336, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4373, + "step": 107 + }, + { + "autoregressive_loss": 0.1104, + "epoch": 0.1635434412265758, + "grad_norm": 0.7596312165260315, + "learning_rate": 9.540715869125407e-06, + "loss": 35.7699, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4361, + "step": 108 + }, + { + "autoregressive_loss": 0.1104, + "epoch": 0.1650577323490441, + "grad_norm": 0.6458888053894043, + "learning_rate": 9.530385748701074e-06, + "loss": 35.769, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4361, + "step": 109 + }, + { + "autoregressive_loss": 0.0977, + "epoch": 0.1665720234715124, + "grad_norm": 0.6201041340827942, + "learning_rate": 9.519946465617217e-06, + "loss": 35.6191, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4354, + "step": 110 + }, + { + "autoregressive_loss": 0.1064, + "epoch": 0.16808631459398068, + "grad_norm": 0.5310369729995728, + "learning_rate": 9.509398271415308e-06, + "loss": 35.7267, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.436, + "step": 111 + }, + { + "autoregressive_loss": 0.1069, + "epoch": 0.169600605716449, + "grad_norm": 0.5491090416908264, + "learning_rate": 9.498741420261109e-06, + "loss": 35.6911, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4355, + "step": 112 + }, + { + "autoregressive_loss": 0.1055, + "epoch": 0.17111489683891729, + "grad_norm": 0.5765312314033508, + "learning_rate": 9.487976168938535e-06, + "loss": 35.6763, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4354, + "step": 113 + }, + { + "autoregressive_loss": 0.1104, + "epoch": 0.17262918796138557, + "grad_norm": 0.5928649306297302, + "learning_rate": 9.477102776843486e-06, + "loss": 35.7244, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4355, + "step": 114 + }, + { + "autoregressive_loss": 0.1074, + "epoch": 0.17414347908385386, + "grad_norm": 0.5479605793952942, + "learning_rate": 9.466121505977577e-06, + "loss": 35.6283, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4346, + "step": 115 + }, + { + "autoregressive_loss": 0.1045, + "epoch": 0.17565777020632217, + "grad_norm": 0.5274514555931091, + "learning_rate": 9.45503262094184e-06, + "loss": 35.6199, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4348, + "step": 116 + }, + { + "autoregressive_loss": 0.1069, + "epoch": 0.17717206132879046, + "grad_norm": 0.5964930057525635, + "learning_rate": 9.443836388930339e-06, + "loss": 35.6511, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4349, + "step": 117 + }, + { + "autoregressive_loss": 0.1104, + "epoch": 0.17868635245125875, + "grad_norm": 0.5694099068641663, + "learning_rate": 9.432533079723734e-06, + "loss": 35.624, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4343, + "step": 118 + }, + { + "autoregressive_loss": 0.0996, + "epoch": 0.18020064357372706, + "grad_norm": 0.5774425864219666, + "learning_rate": 9.421122965682782e-06, + "loss": 35.5655, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4346, + "step": 119 + }, + { + "autoregressive_loss": 0.105, + "epoch": 0.18171493469619535, + "grad_norm": 0.6029176115989685, + "learning_rate": 9.409606321741776e-06, + "loss": 35.573, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4342, + "step": 120 + }, + { + "autoregressive_loss": 0.1035, + "epoch": 0.18322922581866363, + "grad_norm": 0.5015498995780945, + "learning_rate": 9.397983425401915e-06, + "loss": 35.5853, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4345, + "step": 121 + }, + { + "autoregressive_loss": 0.1152, + "epoch": 0.18474351694113192, + "grad_norm": 0.5049082040786743, + "learning_rate": 9.386254556724622e-06, + "loss": 35.5679, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4331, + "step": 122 + }, + { + "autoregressive_loss": 0.1006, + "epoch": 0.18625780806360023, + "grad_norm": 0.563130795955658, + "learning_rate": 9.374419998324792e-06, + "loss": 35.5257, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.434, + "step": 123 + }, + { + "autoregressive_loss": 0.1055, + "epoch": 0.18777209918606852, + "grad_norm": 0.6377906799316406, + "learning_rate": 9.362480035363987e-06, + "loss": 35.5856, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4342, + "step": 124 + }, + { + "autoregressive_loss": 0.1064, + "epoch": 0.1892863903085368, + "grad_norm": 0.5468383431434631, + "learning_rate": 9.350434955543557e-06, + "loss": 35.6072, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4344, + "step": 125 + }, + { + "autoregressive_loss": 0.1025, + "epoch": 0.19080068143100512, + "grad_norm": 0.5611220598220825, + "learning_rate": 9.338285049097722e-06, + "loss": 35.5671, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4343, + "step": 126 + }, + { + "autoregressive_loss": 0.1113, + "epoch": 0.1923149725534734, + "grad_norm": 0.6079346537590027, + "learning_rate": 9.326030608786558e-06, + "loss": 35.5347, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4331, + "step": 127 + }, + { + "autoregressive_loss": 0.1104, + "epoch": 0.1938292636759417, + "grad_norm": 0.5119832158088684, + "learning_rate": 9.31367192988896e-06, + "loss": 35.5969, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4339, + "step": 128 + }, + { + "autoregressive_loss": 0.105, + "epoch": 0.19534355479841, + "grad_norm": 0.5591228604316711, + "learning_rate": 9.301209310195523e-06, + "loss": 35.5142, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4335, + "step": 129 + }, + { + "autoregressive_loss": 0.1006, + "epoch": 0.1968578459208783, + "grad_norm": 0.5370998382568359, + "learning_rate": 9.288643050001362e-06, + "loss": 35.484, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4335, + "step": 130 + }, + { + "autoregressive_loss": 0.1084, + "epoch": 0.19837213704334658, + "grad_norm": 0.4227616488933563, + "learning_rate": 9.275973452098877e-06, + "loss": 35.4774, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4326, + "step": 131 + }, + { + "autoregressive_loss": 0.1045, + "epoch": 0.19988642816581487, + "grad_norm": 0.5789006948471069, + "learning_rate": 9.263200821770462e-06, + "loss": 35.4813, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4331, + "step": 132 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.20140071928828318, + "grad_norm": 0.5915460586547852, + "learning_rate": 9.250325466781145e-06, + "loss": 35.4384, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4333, + "step": 133 + }, + { + "autoregressive_loss": 0.1079, + "epoch": 0.20291501041075147, + "grad_norm": 0.51969975233078, + "learning_rate": 9.237347697371173e-06, + "loss": 35.524, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4332, + "step": 134 + }, + { + "autoregressive_loss": 0.0903, + "epoch": 0.20442930153321975, + "grad_norm": 0.5110373497009277, + "learning_rate": 9.224267826248536e-06, + "loss": 35.36, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4329, + "step": 135 + }, + { + "autoregressive_loss": 0.1045, + "epoch": 0.20594359265568807, + "grad_norm": 0.4828433096408844, + "learning_rate": 9.211086168581433e-06, + "loss": 35.4467, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4326, + "step": 136 + }, + { + "autoregressive_loss": 0.105, + "epoch": 0.20745788377815635, + "grad_norm": 0.522764265537262, + "learning_rate": 9.19780304199068e-06, + "loss": 35.4617, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4328, + "step": 137 + }, + { + "autoregressive_loss": 0.1069, + "epoch": 0.20897217490062464, + "grad_norm": 0.5540645718574524, + "learning_rate": 9.184418766542046e-06, + "loss": 35.505, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4331, + "step": 138 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.21048646602309293, + "grad_norm": 0.5058237314224243, + "learning_rate": 9.170933664738563e-06, + "loss": 35.3252, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4323, + "step": 139 + }, + { + "autoregressive_loss": 0.1064, + "epoch": 0.21200075714556124, + "grad_norm": 0.5019365549087524, + "learning_rate": 9.157348061512728e-06, + "loss": 35.4104, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.432, + "step": 140 + }, + { + "autoregressive_loss": 0.1113, + "epoch": 0.21351504826802953, + "grad_norm": 0.5416918992996216, + "learning_rate": 9.143662284218691e-06, + "loss": 35.4198, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4316, + "step": 141 + }, + { + "autoregressive_loss": 0.1055, + "epoch": 0.21502933939049781, + "grad_norm": 0.5010437369346619, + "learning_rate": 9.129876662624366e-06, + "loss": 35.4423, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4325, + "step": 142 + }, + { + "autoregressive_loss": 0.1074, + "epoch": 0.21654363051296613, + "grad_norm": 0.5381550788879395, + "learning_rate": 9.11599152890348e-06, + "loss": 35.406, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4318, + "step": 143 + }, + { + "autoregressive_loss": 0.1104, + "epoch": 0.21805792163543442, + "grad_norm": 0.5745255351066589, + "learning_rate": 9.102007217627568e-06, + "loss": 35.4106, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4316, + "step": 144 + }, + { + "autoregressive_loss": 0.1035, + "epoch": 0.2195722127579027, + "grad_norm": 0.5166819095611572, + "learning_rate": 9.08792406575792e-06, + "loss": 35.3394, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4314, + "step": 145 + }, + { + "autoregressive_loss": 0.1055, + "epoch": 0.221086503880371, + "grad_norm": 0.5259589552879333, + "learning_rate": 9.073742412637448e-06, + "loss": 35.3527, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4313, + "step": 146 + }, + { + "autoregressive_loss": 0.1016, + "epoch": 0.2226007950028393, + "grad_norm": 0.5345125794410706, + "learning_rate": 9.059462599982525e-06, + "loss": 35.3379, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4315, + "step": 147 + }, + { + "autoregressive_loss": 0.106, + "epoch": 0.2241150861253076, + "grad_norm": 0.5773440599441528, + "learning_rate": 9.045084971874738e-06, + "loss": 35.4159, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4321, + "step": 148 + }, + { + "autoregressive_loss": 0.1035, + "epoch": 0.22562937724777588, + "grad_norm": 0.575214684009552, + "learning_rate": 9.030609874752604e-06, + "loss": 35.3701, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4318, + "step": 149 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.2271436683702442, + "grad_norm": 0.5951133966445923, + "learning_rate": 9.016037657403225e-06, + "loss": 35.3123, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4318, + "step": 150 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.22865795949271248, + "grad_norm": 0.47159436345100403, + "learning_rate": 9.001368670953872e-06, + "loss": 35.3594, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4324, + "step": 151 + }, + { + "autoregressive_loss": 0.1074, + "epoch": 0.23017225061518076, + "grad_norm": 0.5594759583473206, + "learning_rate": 8.986603268863536e-06, + "loss": 35.3302, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4309, + "step": 152 + }, + { + "autoregressive_loss": 0.1074, + "epoch": 0.23168654173764908, + "grad_norm": 0.5136753916740417, + "learning_rate": 8.971741806914409e-06, + "loss": 35.3729, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4315, + "step": 153 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.23320083286011736, + "grad_norm": 0.5430551171302795, + "learning_rate": 8.956784643203303e-06, + "loss": 35.2849, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4314, + "step": 154 + }, + { + "autoregressive_loss": 0.1035, + "epoch": 0.23471512398258565, + "grad_norm": 0.5030525922775269, + "learning_rate": 8.941732138133032e-06, + "loss": 35.3583, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4316, + "step": 155 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.23622941510505394, + "grad_norm": 0.47862210869789124, + "learning_rate": 8.926584654403725e-06, + "loss": 35.272, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4313, + "step": 156 + }, + { + "autoregressive_loss": 0.1074, + "epoch": 0.23774370622752225, + "grad_norm": 0.47320693731307983, + "learning_rate": 8.911342557004084e-06, + "loss": 35.3485, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4311, + "step": 157 + }, + { + "autoregressive_loss": 0.0977, + "epoch": 0.23925799734999054, + "grad_norm": 0.5546271204948425, + "learning_rate": 8.896006213202584e-06, + "loss": 35.2906, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4314, + "step": 158 + }, + { + "autoregressive_loss": 0.1016, + "epoch": 0.24077228847245882, + "grad_norm": 0.5470238327980042, + "learning_rate": 8.88057599253864e-06, + "loss": 35.2809, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4309, + "step": 159 + }, + { + "autoregressive_loss": 0.1016, + "epoch": 0.24228657959492714, + "grad_norm": 0.5319735407829285, + "learning_rate": 8.865052266813686e-06, + "loss": 35.327, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4314, + "step": 160 + }, + { + "autoregressive_loss": 0.1035, + "epoch": 0.24380087071739542, + "grad_norm": 0.5861176252365112, + "learning_rate": 8.849435410082224e-06, + "loss": 35.281, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4307, + "step": 161 + }, + { + "autoregressive_loss": 0.0996, + "epoch": 0.2453151618398637, + "grad_norm": 0.5896406769752502, + "learning_rate": 8.833725798642809e-06, + "loss": 35.3104, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4314, + "step": 162 + }, + { + "autoregressive_loss": 0.1035, + "epoch": 0.246829452962332, + "grad_norm": 0.4997479021549225, + "learning_rate": 8.817923811028984e-06, + "loss": 35.3535, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4315, + "step": 163 + }, + { + "autoregressive_loss": 0.1006, + "epoch": 0.2483437440848003, + "grad_norm": 0.5883369445800781, + "learning_rate": 8.802029828000157e-06, + "loss": 35.2739, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4308, + "step": 164 + }, + { + "autoregressive_loss": 0.0986, + "epoch": 0.2498580352072686, + "grad_norm": 0.4916251599788666, + "learning_rate": 8.786044232532423e-06, + "loss": 35.2362, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4306, + "step": 165 + }, + { + "autoregressive_loss": 0.1016, + "epoch": 0.2513723263297369, + "grad_norm": 0.513014018535614, + "learning_rate": 8.769967409809348e-06, + "loss": 35.2363, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4303, + "step": 166 + }, + { + "autoregressive_loss": 0.1016, + "epoch": 0.2528866174522052, + "grad_norm": 0.5235852003097534, + "learning_rate": 8.753799747212672e-06, + "loss": 35.2498, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4304, + "step": 167 + }, + { + "autoregressive_loss": 0.0991, + "epoch": 0.25440090857467346, + "grad_norm": 0.674767017364502, + "learning_rate": 8.737541634312985e-06, + "loss": 35.2492, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4307, + "step": 168 + }, + { + "autoregressive_loss": 0.1055, + "epoch": 0.25591519969714177, + "grad_norm": 0.44855162501335144, + "learning_rate": 8.721193462860335e-06, + "loss": 35.2826, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4304, + "step": 169 + }, + { + "autoregressive_loss": 0.0996, + "epoch": 0.2574294908196101, + "grad_norm": 0.4613415598869324, + "learning_rate": 8.704755626774796e-06, + "loss": 35.2017, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4301, + "step": 170 + }, + { + "autoregressive_loss": 0.0991, + "epoch": 0.25894378194207834, + "grad_norm": 0.6335465908050537, + "learning_rate": 8.688228522136966e-06, + "loss": 35.2126, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4302, + "step": 171 + }, + { + "autoregressive_loss": 0.1064, + "epoch": 0.26045807306454666, + "grad_norm": 0.4550322890281677, + "learning_rate": 8.671612547178428e-06, + "loss": 35.3107, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4308, + "step": 172 + }, + { + "autoregressive_loss": 0.0986, + "epoch": 0.26197236418701497, + "grad_norm": 0.5079850554466248, + "learning_rate": 8.65490810227216e-06, + "loss": 35.1868, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.43, + "step": 173 + }, + { + "autoregressive_loss": 0.1055, + "epoch": 0.26348665530948323, + "grad_norm": 0.45010823011398315, + "learning_rate": 8.638115589922875e-06, + "loss": 35.3012, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4307, + "step": 174 + }, + { + "autoregressive_loss": 0.1055, + "epoch": 0.26500094643195155, + "grad_norm": 0.4433356523513794, + "learning_rate": 8.621235414757337e-06, + "loss": 35.2885, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4306, + "step": 175 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.26651523755441986, + "grad_norm": 0.48780950903892517, + "learning_rate": 8.604267983514595e-06, + "loss": 35.1429, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4302, + "step": 176 + }, + { + "autoregressive_loss": 0.1064, + "epoch": 0.2680295286768881, + "grad_norm": 0.5431328415870667, + "learning_rate": 8.587213705036202e-06, + "loss": 35.2705, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4302, + "step": 177 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.26954381979935643, + "grad_norm": 0.46060287952423096, + "learning_rate": 8.570072990256342e-06, + "loss": 35.2242, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4307, + "step": 178 + }, + { + "autoregressive_loss": 0.1001, + "epoch": 0.27105811092182475, + "grad_norm": 0.4758576452732086, + "learning_rate": 8.552846252191949e-06, + "loss": 35.187, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4298, + "step": 179 + }, + { + "autoregressive_loss": 0.0923, + "epoch": 0.272572402044293, + "grad_norm": 0.45338496565818787, + "learning_rate": 8.535533905932739e-06, + "loss": 35.1486, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4301, + "step": 180 + }, + { + "autoregressive_loss": 0.1035, + "epoch": 0.2740866931667613, + "grad_norm": 0.4543963670730591, + "learning_rate": 8.518136368631216e-06, + "loss": 35.231, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.43, + "step": 181 + }, + { + "autoregressive_loss": 0.0977, + "epoch": 0.2756009842892296, + "grad_norm": 0.532296895980835, + "learning_rate": 8.500654059492618e-06, + "loss": 35.1973, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4302, + "step": 182 + }, + { + "autoregressive_loss": 0.0981, + "epoch": 0.2771152754116979, + "grad_norm": 0.5008907914161682, + "learning_rate": 8.48308739976482e-06, + "loss": 35.191, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4301, + "step": 183 + }, + { + "autoregressive_loss": 0.1006, + "epoch": 0.2786295665341662, + "grad_norm": 0.46187299489974976, + "learning_rate": 8.465436812728181e-06, + "loss": 35.2014, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4299, + "step": 184 + }, + { + "autoregressive_loss": 0.1074, + "epoch": 0.28014385765663447, + "grad_norm": 0.4999636113643646, + "learning_rate": 8.447702723685335e-06, + "loss": 35.277, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4302, + "step": 185 + }, + { + "autoregressive_loss": 0.1035, + "epoch": 0.2816581487791028, + "grad_norm": 0.4796339273452759, + "learning_rate": 8.429885559950965e-06, + "loss": 35.2204, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4299, + "step": 186 + }, + { + "autoregressive_loss": 0.1006, + "epoch": 0.2831724399015711, + "grad_norm": 0.45920100808143616, + "learning_rate": 8.411985750841484e-06, + "loss": 35.1793, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4297, + "step": 187 + }, + { + "autoregressive_loss": 0.0981, + "epoch": 0.28468673102403935, + "grad_norm": 0.5071916580200195, + "learning_rate": 8.39400372766471e-06, + "loss": 35.1404, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4294, + "step": 188 + }, + { + "autoregressive_loss": 0.0952, + "epoch": 0.28620102214650767, + "grad_norm": 0.4852062463760376, + "learning_rate": 8.375939923709453e-06, + "loss": 35.1361, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4297, + "step": 189 + }, + { + "autoregressive_loss": 0.1069, + "epoch": 0.287715313268976, + "grad_norm": 0.49625808000564575, + "learning_rate": 8.357794774235094e-06, + "loss": 35.1805, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4291, + "step": 190 + }, + { + "autoregressive_loss": 0.104, + "epoch": 0.28922960439144424, + "grad_norm": 0.5065264105796814, + "learning_rate": 8.339568716461082e-06, + "loss": 35.2092, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4297, + "step": 191 + }, + { + "autoregressive_loss": 0.1006, + "epoch": 0.29074389551391255, + "grad_norm": 0.5253987908363342, + "learning_rate": 8.32126218955641e-06, + "loss": 35.1608, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4294, + "step": 192 + }, + { + "autoregressive_loss": 0.0986, + "epoch": 0.29225818663638087, + "grad_norm": 0.49405911564826965, + "learning_rate": 8.302875634629027e-06, + "loss": 35.1451, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4294, + "step": 193 + }, + { + "autoregressive_loss": 0.0977, + "epoch": 0.2937724777588491, + "grad_norm": 0.4636495113372803, + "learning_rate": 8.284409494715208e-06, + "loss": 35.2112, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4304, + "step": 194 + }, + { + "autoregressive_loss": 0.1035, + "epoch": 0.29528676888131744, + "grad_norm": 0.4696371257305145, + "learning_rate": 8.265864214768883e-06, + "loss": 35.1782, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4294, + "step": 195 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.29680106000378575, + "grad_norm": 0.6433189511299133, + "learning_rate": 8.247240241650918e-06, + "loss": 35.1168, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4294, + "step": 196 + }, + { + "autoregressive_loss": 0.0986, + "epoch": 0.298315351126254, + "grad_norm": 0.5026069283485413, + "learning_rate": 8.228538024118338e-06, + "loss": 35.164, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4297, + "step": 197 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.29982964224872233, + "grad_norm": 0.5516616702079773, + "learning_rate": 8.209758012813515e-06, + "loss": 35.0933, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4295, + "step": 198 + }, + { + "autoregressive_loss": 0.0952, + "epoch": 0.3013439333711906, + "grad_norm": 0.4735584259033203, + "learning_rate": 8.190900660253327e-06, + "loss": 35.1021, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4293, + "step": 199 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.3028582244936589, + "grad_norm": 0.48158344626426697, + "learning_rate": 8.171966420818227e-06, + "loss": 35.1414, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4296, + "step": 200 + }, + { + "autoregressive_loss": 0.1006, + "epoch": 0.3043725156161272, + "grad_norm": 0.4967866539955139, + "learning_rate": 8.15295575074132e-06, + "loss": 35.1648, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4295, + "step": 201 + }, + { + "autoregressive_loss": 0.0996, + "epoch": 0.3058868067385955, + "grad_norm": 0.467647910118103, + "learning_rate": 8.133869108097349e-06, + "loss": 35.128, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4291, + "step": 202 + }, + { + "autoregressive_loss": 0.0977, + "epoch": 0.3074010978610638, + "grad_norm": 0.5850085616111755, + "learning_rate": 8.11470695279167e-06, + "loss": 35.1355, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4295, + "step": 203 + }, + { + "autoregressive_loss": 0.1016, + "epoch": 0.3089153889835321, + "grad_norm": 0.468377023935318, + "learning_rate": 8.095469746549172e-06, + "loss": 35.1184, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4288, + "step": 204 + }, + { + "autoregressive_loss": 0.1006, + "epoch": 0.31042968010600036, + "grad_norm": 0.4914652109146118, + "learning_rate": 8.076157952903134e-06, + "loss": 35.1641, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4295, + "step": 205 + }, + { + "autoregressive_loss": 0.1011, + "epoch": 0.3119439712284687, + "grad_norm": 0.48183414340019226, + "learning_rate": 8.056772037184083e-06, + "loss": 35.1201, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4289, + "step": 206 + }, + { + "autoregressive_loss": 0.0864, + "epoch": 0.313458262350937, + "grad_norm": 0.462963730096817, + "learning_rate": 8.037312466508555e-06, + "loss": 35.03, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4292, + "step": 207 + }, + { + "autoregressive_loss": 0.0962, + "epoch": 0.31497255347340525, + "grad_norm": 0.5688491463661194, + "learning_rate": 8.017779709767857e-06, + "loss": 35.1257, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4295, + "step": 208 + }, + { + "autoregressive_loss": 0.1045, + "epoch": 0.31648684459587356, + "grad_norm": 0.4644634425640106, + "learning_rate": 7.998174237616763e-06, + "loss": 35.1296, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4287, + "step": 209 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.3180011357183419, + "grad_norm": 0.4339991509914398, + "learning_rate": 7.978496522462167e-06, + "loss": 35.1158, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4294, + "step": 210 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.31951542684081014, + "grad_norm": 0.47882065176963806, + "learning_rate": 7.958747038451715e-06, + "loss": 35.1106, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4297, + "step": 211 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.32102971796327845, + "grad_norm": 0.614219069480896, + "learning_rate": 7.938926261462366e-06, + "loss": 35.074, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4288, + "step": 212 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.3225440090857467, + "grad_norm": 0.48970454931259155, + "learning_rate": 7.919034669088933e-06, + "loss": 35.0617, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4288, + "step": 213 + }, + { + "autoregressive_loss": 0.0981, + "epoch": 0.324058300208215, + "grad_norm": 0.48159271478652954, + "learning_rate": 7.89907274063257e-06, + "loss": 35.1061, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.429, + "step": 214 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.32557259133068334, + "grad_norm": 0.61226487159729, + "learning_rate": 7.879040957089229e-06, + "loss": 35.1133, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4293, + "step": 215 + }, + { + "autoregressive_loss": 0.0977, + "epoch": 0.3270868824531516, + "grad_norm": 0.4215747117996216, + "learning_rate": 7.858939801138061e-06, + "loss": 35.1063, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4291, + "step": 216 + }, + { + "autoregressive_loss": 0.0933, + "epoch": 0.3286011735756199, + "grad_norm": 0.48939043283462524, + "learning_rate": 7.838769757129804e-06, + "loss": 35.0699, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.429, + "step": 217 + }, + { + "autoregressive_loss": 0.0996, + "epoch": 0.3301154646980882, + "grad_norm": 0.5238614082336426, + "learning_rate": 7.818531311075084e-06, + "loss": 35.0644, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4284, + "step": 218 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.3316297558205565, + "grad_norm": 0.5280364155769348, + "learning_rate": 7.79822495063273e-06, + "loss": 35.0569, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4287, + "step": 219 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.3331440469430248, + "grad_norm": 0.5425875782966614, + "learning_rate": 7.777851165098012e-06, + "loss": 35.0369, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4287, + "step": 220 + }, + { + "autoregressive_loss": 0.0986, + "epoch": 0.3346583380654931, + "grad_norm": 0.5411438941955566, + "learning_rate": 7.757410445390847e-06, + "loss": 35.1003, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4289, + "step": 221 + }, + { + "autoregressive_loss": 0.1045, + "epoch": 0.33617262918796137, + "grad_norm": 0.5106423497200012, + "learning_rate": 7.736903284043985e-06, + "loss": 35.1353, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4287, + "step": 222 + }, + { + "autoregressive_loss": 0.0986, + "epoch": 0.3376869203104297, + "grad_norm": 0.4744527339935303, + "learning_rate": 7.716330175191118e-06, + "loss": 35.0903, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4287, + "step": 223 + }, + { + "autoregressive_loss": 0.1055, + "epoch": 0.339201211432898, + "grad_norm": 0.5454025268554688, + "learning_rate": 7.695691614555002e-06, + "loss": 35.1311, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4286, + "step": 224 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.34071550255536626, + "grad_norm": 0.4324324429035187, + "learning_rate": 7.674988099435487e-06, + "loss": 35.0964, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4291, + "step": 225 + }, + { + "autoregressive_loss": 0.1016, + "epoch": 0.34222979367783457, + "grad_norm": 0.48215311765670776, + "learning_rate": 7.654220128697547e-06, + "loss": 35.0915, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4285, + "step": 226 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.3437440848003029, + "grad_norm": 0.4873673617839813, + "learning_rate": 7.633388202759262e-06, + "loss": 35.0536, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4286, + "step": 227 + }, + { + "autoregressive_loss": 0.0996, + "epoch": 0.34525837592277114, + "grad_norm": 0.5847741365432739, + "learning_rate": 7.612492823579744e-06, + "loss": 35.0838, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4286, + "step": 228 + }, + { + "autoregressive_loss": 0.1045, + "epoch": 0.34677266704523946, + "grad_norm": 0.5521331429481506, + "learning_rate": 7.591534494647066e-06, + "loss": 35.0905, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4282, + "step": 229 + }, + { + "autoregressive_loss": 0.0986, + "epoch": 0.3482869581677077, + "grad_norm": 0.5114601254463196, + "learning_rate": 7.570513720966108e-06, + "loss": 35.0741, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4286, + "step": 230 + }, + { + "autoregressive_loss": 0.1025, + "epoch": 0.34980124929017603, + "grad_norm": 0.5362839698791504, + "learning_rate": 7.549431009046404e-06, + "loss": 35.1057, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4286, + "step": 231 + }, + { + "autoregressive_loss": 0.0986, + "epoch": 0.35131554041264434, + "grad_norm": 0.5299453735351562, + "learning_rate": 7.528286866889924e-06, + "loss": 35.0719, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4285, + "step": 232 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.3528298315351126, + "grad_norm": 0.5524781942367554, + "learning_rate": 7.5070818039788455e-06, + "loss": 35.0282, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4285, + "step": 233 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.3543441226575809, + "grad_norm": 0.4737882614135742, + "learning_rate": 7.485816331263273e-06, + "loss": 35.0566, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4288, + "step": 234 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.35585841378004923, + "grad_norm": 0.5651814341545105, + "learning_rate": 7.464490961148921e-06, + "loss": 34.976, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4283, + "step": 235 + }, + { + "autoregressive_loss": 0.0996, + "epoch": 0.3573727049025175, + "grad_norm": 0.4817633032798767, + "learning_rate": 7.443106207484776e-06, + "loss": 35.0843, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4286, + "step": 236 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.3588869960249858, + "grad_norm": 0.480886310338974, + "learning_rate": 7.421662585550707e-06, + "loss": 35.0375, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4283, + "step": 237 + }, + { + "autoregressive_loss": 0.0923, + "epoch": 0.3604012871474541, + "grad_norm": 0.49071013927459717, + "learning_rate": 7.400160612045057e-06, + "loss": 35.0144, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4284, + "step": 238 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.3619155782699224, + "grad_norm": 0.4387322962284088, + "learning_rate": 7.378600805072186e-06, + "loss": 35.0234, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4282, + "step": 239 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.3634298693923907, + "grad_norm": 0.5867125391960144, + "learning_rate": 7.3569836841299905e-06, + "loss": 35.0411, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4287, + "step": 240 + }, + { + "autoregressive_loss": 0.1001, + "epoch": 0.364944160514859, + "grad_norm": 0.47319120168685913, + "learning_rate": 7.335309770097383e-06, + "loss": 35.0277, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4278, + "step": 241 + }, + { + "autoregressive_loss": 0.0962, + "epoch": 0.36645845163732726, + "grad_norm": 0.5654913783073425, + "learning_rate": 7.313579585221752e-06, + "loss": 35.0449, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4285, + "step": 242 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.3679727427597956, + "grad_norm": 0.576474130153656, + "learning_rate": 7.291793653106357e-06, + "loss": 35.0017, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.428, + "step": 243 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.36948703388226384, + "grad_norm": 0.4818061590194702, + "learning_rate": 7.269952498697734e-06, + "loss": 34.9913, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4281, + "step": 244 + }, + { + "autoregressive_loss": 0.0996, + "epoch": 0.37100132500473215, + "grad_norm": 0.439746618270874, + "learning_rate": 7.248056648273034e-06, + "loss": 35.0293, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4279, + "step": 245 + }, + { + "autoregressive_loss": 0.0894, + "epoch": 0.37251561612720047, + "grad_norm": 0.5234421491622925, + "learning_rate": 7.226106629427342e-06, + "loss": 34.9907, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4284, + "step": 246 + }, + { + "autoregressive_loss": 0.0952, + "epoch": 0.3740299072496687, + "grad_norm": 0.549595832824707, + "learning_rate": 7.204102971060971e-06, + "loss": 35.0207, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4282, + "step": 247 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.37554419837213704, + "grad_norm": 0.5052510499954224, + "learning_rate": 7.18204620336671e-06, + "loss": 35.0123, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4281, + "step": 248 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.37705848949460535, + "grad_norm": 0.4589613676071167, + "learning_rate": 7.15993685781706e-06, + "loss": 35.0326, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4284, + "step": 249 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.3785727806170736, + "grad_norm": 0.5762051939964294, + "learning_rate": 7.137775467151411e-06, + "loss": 35.0329, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4286, + "step": 250 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.3800870717395419, + "grad_norm": 0.5920025706291199, + "learning_rate": 7.115562565363221e-06, + "loss": 34.9685, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4281, + "step": 251 + }, + { + "autoregressive_loss": 0.0962, + "epoch": 0.38160136286201024, + "grad_norm": 0.5249273180961609, + "learning_rate": 7.093298687687141e-06, + "loss": 34.9886, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4277, + "step": 252 + }, + { + "autoregressive_loss": 0.1006, + "epoch": 0.3831156539844785, + "grad_norm": 0.5248683094978333, + "learning_rate": 7.070984370586119e-06, + "loss": 35.0159, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4277, + "step": 253 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.3846299451069468, + "grad_norm": 0.5286358594894409, + "learning_rate": 7.048620151738478e-06, + "loss": 34.9748, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4284, + "step": 254 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.3861442362294151, + "grad_norm": 0.507294774055481, + "learning_rate": 7.026206570024949e-06, + "loss": 34.9921, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4278, + "step": 255 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.3876585273518834, + "grad_norm": 0.5101819038391113, + "learning_rate": 7.0037441655157045e-06, + "loss": 35.0014, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4281, + "step": 256 + }, + { + "autoregressive_loss": 0.0986, + "epoch": 0.3891728184743517, + "grad_norm": 0.5403295159339905, + "learning_rate": 6.9812334794573285e-06, + "loss": 35.0351, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4281, + "step": 257 + }, + { + "autoregressive_loss": 0.0981, + "epoch": 0.39068710959682, + "grad_norm": 0.4803125858306885, + "learning_rate": 6.95867505425978e-06, + "loss": 35.0295, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4281, + "step": 258 + }, + { + "autoregressive_loss": 0.1016, + "epoch": 0.3922014007192883, + "grad_norm": 0.612701416015625, + "learning_rate": 6.936069433483329e-06, + "loss": 35.0463, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4279, + "step": 259 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.3937156918417566, + "grad_norm": 0.4375706613063812, + "learning_rate": 6.913417161825449e-06, + "loss": 34.9803, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4279, + "step": 260 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.39522998296422485, + "grad_norm": 0.4879351854324341, + "learning_rate": 6.8907187851077026e-06, + "loss": 34.9687, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4278, + "step": 261 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.39674427408669316, + "grad_norm": 0.5406920909881592, + "learning_rate": 6.867974850262582e-06, + "loss": 34.9559, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4277, + "step": 262 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.3982585652091615, + "grad_norm": 0.47432941198349, + "learning_rate": 6.845185905320333e-06, + "loss": 34.9619, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4277, + "step": 263 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.39977285633162973, + "grad_norm": 0.533322274684906, + "learning_rate": 6.822352499395751e-06, + "loss": 34.9455, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4281, + "step": 264 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.40128714745409805, + "grad_norm": 0.5559669733047485, + "learning_rate": 6.799475182674942e-06, + "loss": 35.0071, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.428, + "step": 265 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.40280143857656636, + "grad_norm": 0.4233275055885315, + "learning_rate": 6.776554506402081e-06, + "loss": 34.9745, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4278, + "step": 266 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.4043157296990346, + "grad_norm": 0.5335232615470886, + "learning_rate": 6.753591022866117e-06, + "loss": 34.9902, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4278, + "step": 267 + }, + { + "autoregressive_loss": 0.0933, + "epoch": 0.40583002082150293, + "grad_norm": 0.437085896730423, + "learning_rate": 6.730585285387465e-06, + "loss": 34.9786, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4279, + "step": 268 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.40734431194397125, + "grad_norm": 0.44024282693862915, + "learning_rate": 6.707537848304682e-06, + "loss": 35.008, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4283, + "step": 269 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.4088586030664395, + "grad_norm": 0.5187075138092041, + "learning_rate": 6.684449266961101e-06, + "loss": 34.96, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4278, + "step": 270 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.4103728941889078, + "grad_norm": 0.5021286010742188, + "learning_rate": 6.661320097691454e-06, + "loss": 35.0372, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4286, + "step": 271 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.41188718531137614, + "grad_norm": 0.4422614872455597, + "learning_rate": 6.638150897808469e-06, + "loss": 35.0079, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.428, + "step": 272 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.4134014764338444, + "grad_norm": 0.544759213924408, + "learning_rate": 6.614942225589432e-06, + "loss": 34.909, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4276, + "step": 273 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.4149157675563127, + "grad_norm": 0.4701213240623474, + "learning_rate": 6.591694640262749e-06, + "loss": 34.9528, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4276, + "step": 274 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.416430058678781, + "grad_norm": 0.4707038998603821, + "learning_rate": 6.568408701994459e-06, + "loss": 34.9573, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4275, + "step": 275 + }, + { + "autoregressive_loss": 0.0874, + "epoch": 0.4179443498012493, + "grad_norm": 0.49830111861228943, + "learning_rate": 6.545084971874738e-06, + "loss": 34.9231, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4278, + "step": 276 + }, + { + "autoregressive_loss": 0.0962, + "epoch": 0.4194586409237176, + "grad_norm": 0.5557491183280945, + "learning_rate": 6.521724011904387e-06, + "loss": 34.9795, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4276, + "step": 277 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.42097293204618585, + "grad_norm": 0.49009308218955994, + "learning_rate": 6.4983263849812835e-06, + "loss": 34.955, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4277, + "step": 278 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.42248722316865417, + "grad_norm": 0.5258228182792664, + "learning_rate": 6.474892654886819e-06, + "loss": 34.9588, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4277, + "step": 279 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.4240015142911225, + "grad_norm": 0.5438114404678345, + "learning_rate": 6.451423386272312e-06, + "loss": 34.9847, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4282, + "step": 280 + }, + { + "autoregressive_loss": 0.0991, + "epoch": 0.42551580541359074, + "grad_norm": 0.5512975454330444, + "learning_rate": 6.427919144645411e-06, + "loss": 35.011, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4278, + "step": 281 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.42703009653605906, + "grad_norm": 0.4364534020423889, + "learning_rate": 6.4043804963564616e-06, + "loss": 34.9442, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4275, + "step": 282 + }, + { + "autoregressive_loss": 0.0986, + "epoch": 0.42854438765852737, + "grad_norm": 0.6216069459915161, + "learning_rate": 6.3808080085848544e-06, + "loss": 35.008, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4278, + "step": 283 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.43005867878099563, + "grad_norm": 0.5614982843399048, + "learning_rate": 6.3572022493253715e-06, + "loss": 34.923, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4273, + "step": 284 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.43157296990346394, + "grad_norm": 0.5375229120254517, + "learning_rate": 6.333563787374493e-06, + "loss": 34.8878, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4274, + "step": 285 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.43308726102593226, + "grad_norm": 0.6111340522766113, + "learning_rate": 6.309893192316687e-06, + "loss": 34.9666, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4276, + "step": 286 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.4346015521484005, + "grad_norm": 0.5252456068992615, + "learning_rate": 6.2861910345107e-06, + "loss": 34.9519, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4274, + "step": 287 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.43611584327086883, + "grad_norm": 0.5538668632507324, + "learning_rate": 6.26245788507579e-06, + "loss": 34.9168, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4275, + "step": 288 + }, + { + "autoregressive_loss": 0.0864, + "epoch": 0.43763013439333714, + "grad_norm": 0.5652018785476685, + "learning_rate": 6.238694315877994e-06, + "loss": 34.9392, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4281, + "step": 289 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.4391444255158054, + "grad_norm": 0.4843859076499939, + "learning_rate": 6.21490089951632e-06, + "loss": 34.9678, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4276, + "step": 290 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.4406587166382737, + "grad_norm": 0.47921842336654663, + "learning_rate": 6.191078209308974e-06, + "loss": 34.9289, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4275, + "step": 291 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.442173007760742, + "grad_norm": 0.4732070863246918, + "learning_rate": 6.1672268192795285e-06, + "loss": 34.9354, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4274, + "step": 292 + }, + { + "autoregressive_loss": 0.0972, + "epoch": 0.4436872988832103, + "grad_norm": 0.47595152258872986, + "learning_rate": 6.143347304143098e-06, + "loss": 34.977, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4275, + "step": 293 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.4452015900056786, + "grad_norm": 0.5305737257003784, + "learning_rate": 6.119440239292493e-06, + "loss": 34.9854, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4276, + "step": 294 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.44671588112814686, + "grad_norm": 0.48250338435173035, + "learning_rate": 6.095506200784349e-06, + "loss": 34.9632, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4276, + "step": 295 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.4482301722506152, + "grad_norm": 0.5477715134620667, + "learning_rate": 6.071545765325254e-06, + "loss": 34.9549, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4274, + "step": 296 + }, + { + "autoregressive_loss": 0.1006, + "epoch": 0.4497444633730835, + "grad_norm": 0.5417200922966003, + "learning_rate": 6.0475595102578455e-06, + "loss": 35.0066, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4275, + "step": 297 + }, + { + "autoregressive_loss": 0.0977, + "epoch": 0.45125875449555175, + "grad_norm": 0.6128395795822144, + "learning_rate": 6.023548013546899e-06, + "loss": 35.0021, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4277, + "step": 298 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.45277304561802006, + "grad_norm": 0.6036803722381592, + "learning_rate": 5.99951185376541e-06, + "loss": 34.9477, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4275, + "step": 299 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.4542873367404884, + "grad_norm": 0.4633611738681793, + "learning_rate": 5.975451610080643e-06, + "loss": 34.9459, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4274, + "step": 300 + }, + { + "autoregressive_loss": 0.0903, + "epoch": 0.45580162786295664, + "grad_norm": 0.6200799942016602, + "learning_rate": 5.95136786224018e-06, + "loss": 34.9047, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4273, + "step": 301 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.45731591898542495, + "grad_norm": 0.5984820127487183, + "learning_rate": 5.927261190557955e-06, + "loss": 34.9018, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4273, + "step": 302 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.45883021010789327, + "grad_norm": 0.4801160395145416, + "learning_rate": 5.903132175900264e-06, + "loss": 34.9364, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4274, + "step": 303 + }, + { + "autoregressive_loss": 0.0952, + "epoch": 0.4603445012303615, + "grad_norm": 0.548920214176178, + "learning_rate": 5.878981399671774e-06, + "loss": 34.9636, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4275, + "step": 304 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.46185879235282984, + "grad_norm": 0.4783765971660614, + "learning_rate": 5.8548094438015065e-06, + "loss": 34.925, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4275, + "step": 305 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.46337308347529815, + "grad_norm": 0.5205101370811462, + "learning_rate": 5.830616890728828e-06, + "loss": 34.9346, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4277, + "step": 306 + }, + { + "autoregressive_loss": 0.0811, + "epoch": 0.4648873745977664, + "grad_norm": 0.49219366908073425, + "learning_rate": 5.806404323389403e-06, + "loss": 34.8481, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4275, + "step": 307 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.4664016657202347, + "grad_norm": 0.4646260738372803, + "learning_rate": 5.782172325201155e-06, + "loss": 34.8987, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 308 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.467915956842703, + "grad_norm": 0.6045141816139221, + "learning_rate": 5.757921480050206e-06, + "loss": 34.886, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4275, + "step": 309 + }, + { + "autoregressive_loss": 0.0981, + "epoch": 0.4694302479651713, + "grad_norm": 0.4875032603740692, + "learning_rate": 5.733652372276809e-06, + "loss": 34.9721, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4273, + "step": 310 + }, + { + "autoregressive_loss": 0.0845, + "epoch": 0.4709445390876396, + "grad_norm": 0.5332487225532532, + "learning_rate": 5.709365586661266e-06, + "loss": 34.851, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 311 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.47245883021010787, + "grad_norm": 0.5613682866096497, + "learning_rate": 5.6850617084098416e-06, + "loss": 34.931, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4273, + "step": 312 + }, + { + "autoregressive_loss": 0.0854, + "epoch": 0.4739731213325762, + "grad_norm": 0.6050405502319336, + "learning_rate": 5.660741323140651e-06, + "loss": 34.8635, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 313 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.4754874124550445, + "grad_norm": 0.5093547701835632, + "learning_rate": 5.636405016869567e-06, + "loss": 34.9447, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 314 + }, + { + "autoregressive_loss": 0.0962, + "epoch": 0.47700170357751276, + "grad_norm": 0.544954240322113, + "learning_rate": 5.612053375996082e-06, + "loss": 34.9305, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 315 + }, + { + "autoregressive_loss": 0.0913, + "epoch": 0.4785159946999811, + "grad_norm": 0.5036815404891968, + "learning_rate": 5.587686987289189e-06, + "loss": 34.8926, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 316 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.4800302858224494, + "grad_norm": 0.5659224987030029, + "learning_rate": 5.563306437873239e-06, + "loss": 34.903, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 317 + }, + { + "autoregressive_loss": 0.0952, + "epoch": 0.48154457694491765, + "grad_norm": 0.4971538782119751, + "learning_rate": 5.5389123152137965e-06, + "loss": 34.9359, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 318 + }, + { + "autoregressive_loss": 0.0942, + "epoch": 0.48305886806738596, + "grad_norm": 0.43787309527397156, + "learning_rate": 5.514505207103482e-06, + "loss": 34.9428, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4273, + "step": 319 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.4845731591898543, + "grad_norm": 0.5557945370674133, + "learning_rate": 5.490085701647805e-06, + "loss": 34.9089, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 320 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.48608745031232253, + "grad_norm": 0.44556209444999695, + "learning_rate": 5.4656543872509994e-06, + "loss": 34.9117, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 321 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.48760174143479085, + "grad_norm": 0.5435645580291748, + "learning_rate": 5.441211852601849e-06, + "loss": 34.8964, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4276, + "step": 322 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.4891160325572591, + "grad_norm": 0.4776194989681244, + "learning_rate": 5.416758686659488e-06, + "loss": 34.882, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 323 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.4906303236797274, + "grad_norm": 0.48529672622680664, + "learning_rate": 5.392295478639226e-06, + "loss": 34.9125, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 324 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.49214461480219573, + "grad_norm": 0.4742003381252289, + "learning_rate": 5.367822817998338e-06, + "loss": 34.8944, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 325 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.493658905924664, + "grad_norm": 0.49207785725593567, + "learning_rate": 5.343341294421868e-06, + "loss": 34.9365, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 326 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.4951731970471323, + "grad_norm": 0.4983101189136505, + "learning_rate": 5.318851497808424e-06, + "loss": 34.9296, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 327 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.4966874881696006, + "grad_norm": 0.4934469759464264, + "learning_rate": 5.294354018255945e-06, + "loss": 34.9116, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4273, + "step": 328 + }, + { + "autoregressive_loss": 0.0942, + "epoch": 0.4982017792920689, + "grad_norm": 0.5142028331756592, + "learning_rate": 5.26984944604751e-06, + "loss": 34.9325, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 329 + }, + { + "autoregressive_loss": 0.0981, + "epoch": 0.4997160704145372, + "grad_norm": 0.39390942454338074, + "learning_rate": 5.245338371637091e-06, + "loss": 34.9437, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 330 + }, + { + "autoregressive_loss": 0.0884, + "epoch": 0.5012303615370055, + "grad_norm": 0.4992430806159973, + "learning_rate": 5.220821385635337e-06, + "loss": 34.8697, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 331 + }, + { + "autoregressive_loss": 0.0942, + "epoch": 0.5027446526594738, + "grad_norm": 0.40678730607032776, + "learning_rate": 5.1962990787953436e-06, + "loss": 34.932, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 332 + }, + { + "autoregressive_loss": 0.0933, + "epoch": 0.504258943781942, + "grad_norm": 0.5377686023712158, + "learning_rate": 5.171772041998412e-06, + "loss": 34.8964, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 333 + }, + { + "autoregressive_loss": 0.0903, + "epoch": 0.5057732349044104, + "grad_norm": 0.47779470682144165, + "learning_rate": 5.147240866239817e-06, + "loss": 34.8971, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 334 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.5072875260268787, + "grad_norm": 0.44903361797332764, + "learning_rate": 5.122706142614562e-06, + "loss": 34.9469, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4273, + "step": 335 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.5088018171493469, + "grad_norm": 0.4870534837245941, + "learning_rate": 5.098168462303141e-06, + "loss": 34.8471, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 336 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.5103161082718153, + "grad_norm": 0.44688403606414795, + "learning_rate": 5.073628416557293e-06, + "loss": 34.8795, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 337 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.5118303993942835, + "grad_norm": 0.45804962515830994, + "learning_rate": 5.049086596685749e-06, + "loss": 34.9163, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 338 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.5133446905167518, + "grad_norm": 0.44981124997138977, + "learning_rate": 5.024543594039991e-06, + "loss": 34.8432, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 339 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.5148589816392202, + "grad_norm": 0.4531640410423279, + "learning_rate": 5e-06, + "loss": 34.9063, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 340 + }, + { + "autoregressive_loss": 0.0903, + "epoch": 0.5163732727616884, + "grad_norm": 0.509989321231842, + "learning_rate": 4.97545640596001e-06, + "loss": 34.8636, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4267, + "step": 341 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.5178875638841567, + "grad_norm": 0.3763474225997925, + "learning_rate": 4.9509134033142525e-06, + "loss": 34.9165, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4271, + "step": 342 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.5194018550066251, + "grad_norm": 0.4568621516227722, + "learning_rate": 4.926371583442709e-06, + "loss": 34.8938, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 343 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.5209161461290933, + "grad_norm": 0.4906661808490753, + "learning_rate": 4.90183153769686e-06, + "loss": 34.8655, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 344 + }, + { + "autoregressive_loss": 0.0977, + "epoch": 0.5224304372515616, + "grad_norm": 0.45283588767051697, + "learning_rate": 4.87729385738544e-06, + "loss": 34.9471, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 345 + }, + { + "autoregressive_loss": 0.0942, + "epoch": 0.5239447283740299, + "grad_norm": 0.4244708716869354, + "learning_rate": 4.852759133760184e-06, + "loss": 34.9075, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 346 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.5254590194964982, + "grad_norm": 0.49666088819503784, + "learning_rate": 4.828227958001589e-06, + "loss": 34.9247, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4274, + "step": 347 + }, + { + "autoregressive_loss": 0.0791, + "epoch": 0.5269733106189665, + "grad_norm": 0.559128999710083, + "learning_rate": 4.803700921204659e-06, + "loss": 34.7788, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 348 + }, + { + "autoregressive_loss": 0.0913, + "epoch": 0.5284876017414348, + "grad_norm": 0.4289547801017761, + "learning_rate": 4.779178614364664e-06, + "loss": 34.8903, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 349 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.5300018928639031, + "grad_norm": 0.5467272400856018, + "learning_rate": 4.75466162836291e-06, + "loss": 34.9005, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 350 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.5315161839863713, + "grad_norm": 0.44082117080688477, + "learning_rate": 4.730150553952491e-06, + "loss": 34.8799, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 351 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.5330304751088397, + "grad_norm": 0.4480251669883728, + "learning_rate": 4.705645981744055e-06, + "loss": 34.9206, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 352 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.534544766231308, + "grad_norm": 0.5339522957801819, + "learning_rate": 4.6811485021915784e-06, + "loss": 34.9032, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4271, + "step": 353 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.5360590573537762, + "grad_norm": 0.573104739189148, + "learning_rate": 4.6566587055781324e-06, + "loss": 34.8866, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 354 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.5375733484762446, + "grad_norm": 0.46984177827835083, + "learning_rate": 4.6321771820016635e-06, + "loss": 34.8981, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 355 + }, + { + "autoregressive_loss": 0.0913, + "epoch": 0.5390876395987129, + "grad_norm": 0.5871784687042236, + "learning_rate": 4.6077045213607765e-06, + "loss": 34.8816, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 356 + }, + { + "autoregressive_loss": 0.0845, + "epoch": 0.5406019307211811, + "grad_norm": 0.538987934589386, + "learning_rate": 4.583241313340512e-06, + "loss": 34.826, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 357 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.5421162218436495, + "grad_norm": 0.47866809368133545, + "learning_rate": 4.5587881473981535e-06, + "loss": 34.8613, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 358 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.5436305129661178, + "grad_norm": 0.6175789833068848, + "learning_rate": 4.534345612749002e-06, + "loss": 34.8582, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 359 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.545144804088586, + "grad_norm": 0.5450776815414429, + "learning_rate": 4.509914298352197e-06, + "loss": 34.9145, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 360 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.5466590952110544, + "grad_norm": 0.44723251461982727, + "learning_rate": 4.485494792896519e-06, + "loss": 34.8835, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 361 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.5481733863335226, + "grad_norm": 0.5691998600959778, + "learning_rate": 4.4610876847862034e-06, + "loss": 34.8834, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 362 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.5496876774559909, + "grad_norm": 0.4829149842262268, + "learning_rate": 4.436693562126762e-06, + "loss": 34.8938, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4271, + "step": 363 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.5512019685784592, + "grad_norm": 0.4736376404762268, + "learning_rate": 4.4123130127108125e-06, + "loss": 34.8202, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4267, + "step": 364 + }, + { + "autoregressive_loss": 0.0913, + "epoch": 0.5527162597009275, + "grad_norm": 0.5339804291725159, + "learning_rate": 4.38794662400392e-06, + "loss": 34.8847, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 365 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.5542305508233958, + "grad_norm": 0.43287280201911926, + "learning_rate": 4.363594983130435e-06, + "loss": 34.8736, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 366 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.555744841945864, + "grad_norm": 0.4394111633300781, + "learning_rate": 4.339258676859349e-06, + "loss": 34.8559, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 367 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.5572591330683324, + "grad_norm": 0.5730719566345215, + "learning_rate": 4.314938291590161e-06, + "loss": 34.8753, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 368 + }, + { + "autoregressive_loss": 0.0801, + "epoch": 0.5587734241908007, + "grad_norm": 0.4875525236129761, + "learning_rate": 4.290634413338735e-06, + "loss": 34.8104, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4271, + "step": 369 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.5602877153132689, + "grad_norm": 0.3966294527053833, + "learning_rate": 4.266347627723192e-06, + "loss": 34.8814, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 370 + }, + { + "autoregressive_loss": 0.0952, + "epoch": 0.5618020064357373, + "grad_norm": 0.4737849533557892, + "learning_rate": 4.242078519949795e-06, + "loss": 34.9154, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 371 + }, + { + "autoregressive_loss": 0.0864, + "epoch": 0.5633162975582056, + "grad_norm": 0.5324759483337402, + "learning_rate": 4.217827674798845e-06, + "loss": 34.8511, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 372 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.5648305886806738, + "grad_norm": 0.46697068214416504, + "learning_rate": 4.193595676610599e-06, + "loss": 34.8962, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 373 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.5663448798031422, + "grad_norm": 0.4849524199962616, + "learning_rate": 4.169383109271174e-06, + "loss": 34.8757, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 374 + }, + { + "autoregressive_loss": 0.0903, + "epoch": 0.5678591709256104, + "grad_norm": 0.5503900647163391, + "learning_rate": 4.145190556198494e-06, + "loss": 34.8598, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4267, + "step": 375 + }, + { + "autoregressive_loss": 0.0977, + "epoch": 0.5693734620480787, + "grad_norm": 0.4767417013645172, + "learning_rate": 4.1210186003282275e-06, + "loss": 34.9242, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 376 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.5708877531705471, + "grad_norm": 0.4968801736831665, + "learning_rate": 4.096867824099736e-06, + "loss": 34.8861, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 377 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.5724020442930153, + "grad_norm": 0.517738401889801, + "learning_rate": 4.072738809442046e-06, + "loss": 34.8729, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 378 + }, + { + "autoregressive_loss": 0.0864, + "epoch": 0.5739163354154836, + "grad_norm": 0.42351633310317993, + "learning_rate": 4.048632137759821e-06, + "loss": 34.8493, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 379 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.575430626537952, + "grad_norm": 0.4349358081817627, + "learning_rate": 4.02454838991936e-06, + "loss": 34.8531, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4267, + "step": 380 + }, + { + "autoregressive_loss": 0.0923, + "epoch": 0.5769449176604202, + "grad_norm": 0.5473352670669556, + "learning_rate": 4.000488146234592e-06, + "loss": 34.8769, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 381 + }, + { + "autoregressive_loss": 0.0903, + "epoch": 0.5784592087828885, + "grad_norm": 0.4549475610256195, + "learning_rate": 3.9764519864531026e-06, + "loss": 34.8549, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 382 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.5799734999053568, + "grad_norm": 0.3957749903202057, + "learning_rate": 3.952440489742158e-06, + "loss": 34.9021, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 383 + }, + { + "autoregressive_loss": 0.0762, + "epoch": 0.5814877910278251, + "grad_norm": 0.4357142150402069, + "learning_rate": 3.928454234674748e-06, + "loss": 34.7552, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 384 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.5830020821502934, + "grad_norm": 0.4978048503398895, + "learning_rate": 3.904493799215652e-06, + "loss": 34.8793, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 385 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.5845163732727617, + "grad_norm": 0.42923659086227417, + "learning_rate": 3.880559760707508e-06, + "loss": 34.8271, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 386 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.58603066439523, + "grad_norm": 0.49369609355926514, + "learning_rate": 3.8566526958569025e-06, + "loss": 34.8653, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 387 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.5875449555176983, + "grad_norm": 0.4035526216030121, + "learning_rate": 3.832773180720475e-06, + "loss": 34.8512, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 388 + }, + { + "autoregressive_loss": 0.084, + "epoch": 0.5890592466401666, + "grad_norm": 0.5031617283821106, + "learning_rate": 3.8089217906910274e-06, + "loss": 34.8217, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 389 + }, + { + "autoregressive_loss": 0.0894, + "epoch": 0.5905735377626349, + "grad_norm": 0.44186675548553467, + "learning_rate": 3.7850991004836813e-06, + "loss": 34.8334, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 390 + }, + { + "autoregressive_loss": 0.0986, + "epoch": 0.5920878288851031, + "grad_norm": 0.4768434166908264, + "learning_rate": 3.761305684122008e-06, + "loss": 34.9223, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 391 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.5936021200075715, + "grad_norm": 0.4222986102104187, + "learning_rate": 3.7375421149242102e-06, + "loss": 34.9053, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4272, + "step": 392 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.5951164111300398, + "grad_norm": 0.4309958815574646, + "learning_rate": 3.7138089654893027e-06, + "loss": 34.8417, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 393 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.596630702252508, + "grad_norm": 0.45433348417282104, + "learning_rate": 3.6901068076833136e-06, + "loss": 34.8474, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4267, + "step": 394 + }, + { + "autoregressive_loss": 0.0933, + "epoch": 0.5981449933749763, + "grad_norm": 0.39417994022369385, + "learning_rate": 3.6664362126255087e-06, + "loss": 34.8846, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4267, + "step": 395 + }, + { + "autoregressive_loss": 0.0864, + "epoch": 0.5996592844974447, + "grad_norm": 0.4867398738861084, + "learning_rate": 3.6427977506746293e-06, + "loss": 34.8142, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 396 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.6011735756199129, + "grad_norm": 0.41836783289909363, + "learning_rate": 3.619191991415146e-06, + "loss": 34.8394, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 397 + }, + { + "autoregressive_loss": 0.0942, + "epoch": 0.6026878667423812, + "grad_norm": 0.5118010640144348, + "learning_rate": 3.595619503643541e-06, + "loss": 34.9089, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 398 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.6042021578648495, + "grad_norm": 0.33514273166656494, + "learning_rate": 3.5720808553545894e-06, + "loss": 34.8253, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 399 + }, + { + "autoregressive_loss": 0.0796, + "epoch": 0.6057164489873178, + "grad_norm": 0.3933655619621277, + "learning_rate": 3.5485766137276894e-06, + "loss": 34.7736, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4267, + "step": 400 + }, + { + "autoregressive_loss": 0.0884, + "epoch": 0.6072307401097861, + "grad_norm": 0.46109527349472046, + "learning_rate": 3.5251073451131824e-06, + "loss": 34.8316, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 401 + }, + { + "autoregressive_loss": 0.0854, + "epoch": 0.6087450312322544, + "grad_norm": 0.42791593074798584, + "learning_rate": 3.501673615018717e-06, + "loss": 34.8054, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 402 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.6102593223547227, + "grad_norm": 0.478319376707077, + "learning_rate": 3.478275988095615e-06, + "loss": 34.8153, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4267, + "step": 403 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.611773613477191, + "grad_norm": 0.4135425388813019, + "learning_rate": 3.4549150281252635e-06, + "loss": 34.8424, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 404 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.6132879045996593, + "grad_norm": 0.4186260998249054, + "learning_rate": 3.4315912980055433e-06, + "loss": 34.7886, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 405 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.6148021957221276, + "grad_norm": 0.4154527485370636, + "learning_rate": 3.4083053597372517e-06, + "loss": 34.8351, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 406 + }, + { + "autoregressive_loss": 0.0854, + "epoch": 0.6163164868445958, + "grad_norm": 0.4814777672290802, + "learning_rate": 3.3850577744105682e-06, + "loss": 34.8174, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 407 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.6178307779670642, + "grad_norm": 0.44730573892593384, + "learning_rate": 3.3618491021915334e-06, + "loss": 34.8236, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 408 + }, + { + "autoregressive_loss": 0.0894, + "epoch": 0.6193450690895325, + "grad_norm": 0.44284746050834656, + "learning_rate": 3.338679902308547e-06, + "loss": 34.8542, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4267, + "step": 409 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.6208593602120007, + "grad_norm": 0.3492773175239563, + "learning_rate": 3.3155507330389004e-06, + "loss": 34.8706, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 410 + }, + { + "autoregressive_loss": 0.0913, + "epoch": 0.6223736513344691, + "grad_norm": 0.48861467838287354, + "learning_rate": 3.2924621516953195e-06, + "loss": 34.8525, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 411 + }, + { + "autoregressive_loss": 0.0952, + "epoch": 0.6238879424569373, + "grad_norm": 0.4528800845146179, + "learning_rate": 3.269414714612534e-06, + "loss": 34.8925, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 412 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.6254022335794056, + "grad_norm": 0.41865745186805725, + "learning_rate": 3.2464089771338856e-06, + "loss": 34.83, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 413 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.626916524701874, + "grad_norm": 0.4364733397960663, + "learning_rate": 3.223445493597921e-06, + "loss": 34.8376, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 414 + }, + { + "autoregressive_loss": 0.084, + "epoch": 0.6284308158243422, + "grad_norm": 0.45800378918647766, + "learning_rate": 3.2005248173250593e-06, + "loss": 34.821, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4269, + "step": 415 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.6299451069468105, + "grad_norm": 0.4728463590145111, + "learning_rate": 3.177647500604252e-06, + "loss": 34.7908, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 416 + }, + { + "autoregressive_loss": 0.0923, + "epoch": 0.6314593980692789, + "grad_norm": 0.44017675518989563, + "learning_rate": 3.154814094679668e-06, + "loss": 34.8805, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4268, + "step": 417 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.6329736891917471, + "grad_norm": 0.42653894424438477, + "learning_rate": 3.1320251497374187e-06, + "loss": 34.8405, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 418 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.6344879803142154, + "grad_norm": 0.3785884380340576, + "learning_rate": 3.109281214892298e-06, + "loss": 34.8605, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 419 + }, + { + "autoregressive_loss": 0.0894, + "epoch": 0.6360022714366838, + "grad_norm": 0.4289815425872803, + "learning_rate": 3.0865828381745515e-06, + "loss": 34.8333, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 420 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.637516562559152, + "grad_norm": 0.45750102400779724, + "learning_rate": 3.0639305665166724e-06, + "loss": 34.8785, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 421 + }, + { + "autoregressive_loss": 0.0903, + "epoch": 0.6390308536816203, + "grad_norm": 0.412979394197464, + "learning_rate": 3.0413249457402206e-06, + "loss": 34.8493, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 422 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.6405451448040886, + "grad_norm": 0.4756372272968292, + "learning_rate": 3.018766520542673e-06, + "loss": 34.8358, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 423 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.6420594359265569, + "grad_norm": 0.4353180527687073, + "learning_rate": 2.9962558344842963e-06, + "loss": 34.8591, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.427, + "step": 424 + }, + { + "autoregressive_loss": 0.0864, + "epoch": 0.6435737270490252, + "grad_norm": 0.4504074156284332, + "learning_rate": 2.9737934299750514e-06, + "loss": 34.8191, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 425 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.6450880181714934, + "grad_norm": 0.4144100546836853, + "learning_rate": 2.951379848261523e-06, + "loss": 34.8435, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4267, + "step": 426 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.6466023092939618, + "grad_norm": 0.4290841221809387, + "learning_rate": 2.9290156294138807e-06, + "loss": 34.8649, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 427 + }, + { + "autoregressive_loss": 0.0884, + "epoch": 0.64811660041643, + "grad_norm": 0.500068187713623, + "learning_rate": 2.906701312312861e-06, + "loss": 34.8347, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 428 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.6496308915388983, + "grad_norm": 0.44262510538101196, + "learning_rate": 2.88443743463678e-06, + "loss": 34.8286, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 429 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.6511451826613667, + "grad_norm": 0.38101115822792053, + "learning_rate": 2.862224532848591e-06, + "loss": 34.8228, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 430 + }, + { + "autoregressive_loss": 0.0854, + "epoch": 0.6526594737838349, + "grad_norm": 0.40929633378982544, + "learning_rate": 2.840063142182941e-06, + "loss": 34.8048, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 431 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.6541737649063032, + "grad_norm": 0.47931191325187683, + "learning_rate": 2.817953796633289e-06, + "loss": 34.8445, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 432 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.6556880560287716, + "grad_norm": 0.43975597620010376, + "learning_rate": 2.7958970289390317e-06, + "loss": 34.8307, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 433 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.6572023471512398, + "grad_norm": 0.44371703267097473, + "learning_rate": 2.77389337057266e-06, + "loss": 34.8411, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 434 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.6587166382737081, + "grad_norm": 0.3957557678222656, + "learning_rate": 2.7519433517269665e-06, + "loss": 34.8661, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 435 + }, + { + "autoregressive_loss": 0.0815, + "epoch": 0.6602309293961764, + "grad_norm": 0.40987053513526917, + "learning_rate": 2.7300475013022666e-06, + "loss": 34.7725, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 436 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.6617452205186447, + "grad_norm": 0.43157750368118286, + "learning_rate": 2.7082063468936427e-06, + "loss": 34.8484, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 437 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.663259511641113, + "grad_norm": 0.36990004777908325, + "learning_rate": 2.68642041477825e-06, + "loss": 34.8468, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 438 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.6647738027635813, + "grad_norm": 0.4386705458164215, + "learning_rate": 2.6646902299026183e-06, + "loss": 34.8015, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 439 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.6662880938860496, + "grad_norm": 0.39429959654808044, + "learning_rate": 2.6430163158700116e-06, + "loss": 34.8059, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 440 + }, + { + "autoregressive_loss": 0.083, + "epoch": 0.6678023850085179, + "grad_norm": 0.43264690041542053, + "learning_rate": 2.621399194927817e-06, + "loss": 34.7717, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 441 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.6693166761309862, + "grad_norm": 0.45001494884490967, + "learning_rate": 2.5998393879549444e-06, + "loss": 34.8283, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 442 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.6708309672534545, + "grad_norm": 0.40147706866264343, + "learning_rate": 2.5783374144492946e-06, + "loss": 34.8011, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 443 + }, + { + "autoregressive_loss": 0.084, + "epoch": 0.6723452583759227, + "grad_norm": 0.40624192357063293, + "learning_rate": 2.5568937925152272e-06, + "loss": 34.7934, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 444 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.6738595494983911, + "grad_norm": 0.41449639201164246, + "learning_rate": 2.5355090388510806e-06, + "loss": 34.7827, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 445 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.6753738406208594, + "grad_norm": 0.4276653826236725, + "learning_rate": 2.5141836687367273e-06, + "loss": 34.8297, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 446 + }, + { + "autoregressive_loss": 0.0884, + "epoch": 0.6768881317433276, + "grad_norm": 0.447142094373703, + "learning_rate": 2.4929181960211553e-06, + "loss": 34.8355, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 447 + }, + { + "autoregressive_loss": 0.084, + "epoch": 0.678402422865796, + "grad_norm": 0.3984714448451996, + "learning_rate": 2.471713133110078e-06, + "loss": 34.7857, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 448 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.6799167139882643, + "grad_norm": 0.3696002960205078, + "learning_rate": 2.4505689909535967e-06, + "loss": 34.8569, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 449 + }, + { + "autoregressive_loss": 0.0884, + "epoch": 0.6814310051107325, + "grad_norm": 0.3757135570049286, + "learning_rate": 2.429486279033892e-06, + "loss": 34.8206, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 450 + }, + { + "autoregressive_loss": 0.0933, + "epoch": 0.6829452962332009, + "grad_norm": 0.411026269197464, + "learning_rate": 2.4084655053529337e-06, + "loss": 34.8643, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 451 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.6844595873556691, + "grad_norm": 0.38915422558784485, + "learning_rate": 2.387507176420256e-06, + "loss": 34.8146, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 452 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.6859738784781374, + "grad_norm": 0.3617386519908905, + "learning_rate": 2.366611797240741e-06, + "loss": 34.8246, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 453 + }, + { + "autoregressive_loss": 0.0903, + "epoch": 0.6874881696006058, + "grad_norm": 0.3905327320098877, + "learning_rate": 2.345779871302453e-06, + "loss": 34.8173, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 454 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.689002460723074, + "grad_norm": 0.34236669540405273, + "learning_rate": 2.325011900564515e-06, + "loss": 34.8535, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 455 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.6905167518455423, + "grad_norm": 0.391426682472229, + "learning_rate": 2.304308385444999e-06, + "loss": 34.8707, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 456 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.6920310429680105, + "grad_norm": 0.4189107120037079, + "learning_rate": 2.2836698248088814e-06, + "loss": 34.8265, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 457 + }, + { + "autoregressive_loss": 0.083, + "epoch": 0.6935453340904789, + "grad_norm": 0.35606297850608826, + "learning_rate": 2.263096715956019e-06, + "loss": 34.7837, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 458 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.6950596252129472, + "grad_norm": 0.3791707158088684, + "learning_rate": 2.2425895546091534e-06, + "loss": 34.8646, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 459 + }, + { + "autoregressive_loss": 0.0903, + "epoch": 0.6965739163354154, + "grad_norm": 0.36069735884666443, + "learning_rate": 2.2221488349019903e-06, + "loss": 34.8368, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 460 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.6980882074578838, + "grad_norm": 0.39582717418670654, + "learning_rate": 2.2017750493672704e-06, + "loss": 34.8526, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 461 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.6996024985803521, + "grad_norm": 0.3693932294845581, + "learning_rate": 2.181468688924916e-06, + "loss": 34.8151, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 462 + }, + { + "autoregressive_loss": 0.0894, + "epoch": 0.7011167897028203, + "grad_norm": 0.4039921760559082, + "learning_rate": 2.1612302428701993e-06, + "loss": 34.825, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 463 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.7026310808252887, + "grad_norm": 0.3628940284252167, + "learning_rate": 2.1410601988619394e-06, + "loss": 34.8311, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 464 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.704145371947757, + "grad_norm": 0.40642890334129333, + "learning_rate": 2.1209590429107734e-06, + "loss": 34.8031, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 465 + }, + { + "autoregressive_loss": 0.0781, + "epoch": 0.7056596630702252, + "grad_norm": 0.3716905117034912, + "learning_rate": 2.1009272593674323e-06, + "loss": 34.7206, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 466 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.7071739541926936, + "grad_norm": 0.4110797941684723, + "learning_rate": 2.0809653309110685e-06, + "loss": 34.817, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 467 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.7086882453151618, + "grad_norm": 0.4111776351928711, + "learning_rate": 2.061073738537635e-06, + "loss": 34.8263, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 468 + }, + { + "autoregressive_loss": 0.084, + "epoch": 0.7102025364376301, + "grad_norm": 0.35834285616874695, + "learning_rate": 2.0412529615482867e-06, + "loss": 34.7758, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 469 + }, + { + "autoregressive_loss": 0.0933, + "epoch": 0.7117168275600985, + "grad_norm": 0.33383214473724365, + "learning_rate": 2.0215034775378336e-06, + "loss": 34.8649, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 470 + }, + { + "autoregressive_loss": 0.084, + "epoch": 0.7132311186825667, + "grad_norm": 0.42933860421180725, + "learning_rate": 2.0018257623832393e-06, + "loss": 34.8006, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 471 + }, + { + "autoregressive_loss": 0.0903, + "epoch": 0.714745409805035, + "grad_norm": 0.39561548829078674, + "learning_rate": 1.982220290232143e-06, + "loss": 34.8559, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 472 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.7162597009275034, + "grad_norm": 0.4012252390384674, + "learning_rate": 1.962687533491446e-06, + "loss": 34.8574, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 473 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.7177739920499716, + "grad_norm": 0.4318500757217407, + "learning_rate": 1.9432279628159188e-06, + "loss": 34.8133, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 474 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.7192882831724399, + "grad_norm": 0.40785539150238037, + "learning_rate": 1.9238420470968665e-06, + "loss": 34.7886, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 475 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.7208025742949082, + "grad_norm": 0.40469008684158325, + "learning_rate": 1.9045302534508298e-06, + "loss": 34.8116, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 476 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.7223168654173765, + "grad_norm": 0.402775377035141, + "learning_rate": 1.8852930472083304e-06, + "loss": 34.806, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 477 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.7238311565398448, + "grad_norm": 0.39007893204689026, + "learning_rate": 1.8661308919026533e-06, + "loss": 34.87, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 478 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.7253454476623131, + "grad_norm": 0.38476496934890747, + "learning_rate": 1.847044249258681e-06, + "loss": 34.8313, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 479 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.7268597387847814, + "grad_norm": 0.4436049163341522, + "learning_rate": 1.8280335791817733e-06, + "loss": 34.8054, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 480 + }, + { + "autoregressive_loss": 0.083, + "epoch": 0.7283740299072496, + "grad_norm": 0.3641524016857147, + "learning_rate": 1.809099339746674e-06, + "loss": 34.7686, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 481 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.729888321029718, + "grad_norm": 0.3732715845108032, + "learning_rate": 1.790241987186485e-06, + "loss": 34.794, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 482 + }, + { + "autoregressive_loss": 0.0884, + "epoch": 0.7314026121521863, + "grad_norm": 0.34419453144073486, + "learning_rate": 1.7714619758816649e-06, + "loss": 34.817, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 483 + }, + { + "autoregressive_loss": 0.0962, + "epoch": 0.7329169032746545, + "grad_norm": 0.39912503957748413, + "learning_rate": 1.7527597583490825e-06, + "loss": 34.866, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 484 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.7344311943971229, + "grad_norm": 0.3872796297073364, + "learning_rate": 1.7341357852311175e-06, + "loss": 34.7939, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 485 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.7359454855195912, + "grad_norm": 0.36976543068885803, + "learning_rate": 1.7155905052847938e-06, + "loss": 34.8061, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 486 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.7374597766420594, + "grad_norm": 0.36062124371528625, + "learning_rate": 1.697124365370974e-06, + "loss": 34.8591, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 487 + }, + { + "autoregressive_loss": 0.0903, + "epoch": 0.7389740677645277, + "grad_norm": 0.41563186049461365, + "learning_rate": 1.6787378104435931e-06, + "loss": 34.8206, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 488 + }, + { + "autoregressive_loss": 0.0845, + "epoch": 0.740488358886996, + "grad_norm": 0.4063948690891266, + "learning_rate": 1.6604312835389202e-06, + "loss": 34.7706, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 489 + }, + { + "autoregressive_loss": 0.0923, + "epoch": 0.7420026500094643, + "grad_norm": 0.3606734573841095, + "learning_rate": 1.642205225764908e-06, + "loss": 34.8384, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 490 + }, + { + "autoregressive_loss": 0.0894, + "epoch": 0.7435169411319326, + "grad_norm": 0.3888211250305176, + "learning_rate": 1.6240600762905485e-06, + "loss": 34.8142, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 491 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.7450312322544009, + "grad_norm": 0.409179151058197, + "learning_rate": 1.6059962723352912e-06, + "loss": 34.8067, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 492 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.7465455233768692, + "grad_norm": 0.37396499514579773, + "learning_rate": 1.588014249158516e-06, + "loss": 34.8408, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 493 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.7480598144993374, + "grad_norm": 0.3583182990550995, + "learning_rate": 1.570114440049037e-06, + "loss": 34.8239, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 494 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.7495741056218058, + "grad_norm": 0.4242543876171112, + "learning_rate": 1.5522972763146653e-06, + "loss": 34.8053, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 495 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.7510883967442741, + "grad_norm": 0.4354727566242218, + "learning_rate": 1.5345631872718214e-06, + "loss": 34.8188, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 496 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.7526026878667423, + "grad_norm": 0.35591554641723633, + "learning_rate": 1.5169126002351791e-06, + "loss": 34.8154, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 497 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.7541169789892107, + "grad_norm": 0.28155869245529175, + "learning_rate": 1.4993459405073825e-06, + "loss": 34.8059, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 498 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.755631270111679, + "grad_norm": 0.43063297867774963, + "learning_rate": 1.4818636313687868e-06, + "loss": 34.8708, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 499 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.7571455612341472, + "grad_norm": 0.422209233045578, + "learning_rate": 1.4644660940672628e-06, + "loss": 34.8064, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 500 + }, + { + "autoregressive_loss": 0.083, + "epoch": 0.7586598523566156, + "grad_norm": 0.44441139698028564, + "learning_rate": 1.4471537478080516e-06, + "loss": 34.7706, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 501 + }, + { + "autoregressive_loss": 0.0913, + "epoch": 0.7601741434790839, + "grad_norm": 0.4232626259326935, + "learning_rate": 1.429927009743659e-06, + "loss": 34.8245, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 502 + }, + { + "autoregressive_loss": 0.0933, + "epoch": 0.7616884346015521, + "grad_norm": 0.3387332260608673, + "learning_rate": 1.412786294963801e-06, + "loss": 34.8415, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 503 + }, + { + "autoregressive_loss": 0.0854, + "epoch": 0.7632027257240205, + "grad_norm": 0.3710520565509796, + "learning_rate": 1.395732016485406e-06, + "loss": 34.7938, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 504 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.7647170168464887, + "grad_norm": 0.4394475221633911, + "learning_rate": 1.3787645852426663e-06, + "loss": 34.7952, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 505 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.766231307968957, + "grad_norm": 0.4840888977050781, + "learning_rate": 1.3618844100771256e-06, + "loss": 34.8248, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 506 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.7677455990914254, + "grad_norm": 0.3793099522590637, + "learning_rate": 1.345091897727842e-06, + "loss": 34.8115, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 507 + }, + { + "autoregressive_loss": 0.084, + "epoch": 0.7692598902138936, + "grad_norm": 0.3611990213394165, + "learning_rate": 1.3283874528215735e-06, + "loss": 34.7694, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 508 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.7707741813363619, + "grad_norm": 0.36514294147491455, + "learning_rate": 1.3117714778630358e-06, + "loss": 34.8105, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 509 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.7722884724588303, + "grad_norm": 0.43741509318351746, + "learning_rate": 1.2952443732252058e-06, + "loss": 34.7867, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 510 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.7738027635812985, + "grad_norm": 0.44763755798339844, + "learning_rate": 1.2788065371396652e-06, + "loss": 34.8795, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 511 + }, + { + "autoregressive_loss": 0.0801, + "epoch": 0.7753170547037668, + "grad_norm": 0.42393553256988525, + "learning_rate": 1.2624583656870153e-06, + "loss": 34.7352, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 512 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.7768313458262351, + "grad_norm": 0.3310752213001251, + "learning_rate": 1.2462002527873301e-06, + "loss": 34.7981, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 513 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.7783456369487034, + "grad_norm": 0.3607689440250397, + "learning_rate": 1.2300325901906529e-06, + "loss": 34.8046, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 514 + }, + { + "autoregressive_loss": 0.084, + "epoch": 0.7798599280711717, + "grad_norm": 0.39925089478492737, + "learning_rate": 1.2139557674675773e-06, + "loss": 34.7888, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 515 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.78137421919364, + "grad_norm": 0.40104687213897705, + "learning_rate": 1.1979701719998454e-06, + "loss": 34.8278, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 516 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.7828885103161083, + "grad_norm": 0.3895433843135834, + "learning_rate": 1.1820761889710175e-06, + "loss": 34.815, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 517 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.7844028014385765, + "grad_norm": 0.3535964787006378, + "learning_rate": 1.1662742013571926e-06, + "loss": 34.7871, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 518 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.7859170925610448, + "grad_norm": 0.39049097895622253, + "learning_rate": 1.1505645899177786e-06, + "loss": 34.7849, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 519 + }, + { + "autoregressive_loss": 0.0957, + "epoch": 0.7874313836835132, + "grad_norm": 0.384605348110199, + "learning_rate": 1.134947733186315e-06, + "loss": 34.897, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 520 + }, + { + "autoregressive_loss": 0.0845, + "epoch": 0.7889456748059814, + "grad_norm": 0.2869601249694824, + "learning_rate": 1.1194240074613617e-06, + "loss": 34.7608, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 521 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.7904599659284497, + "grad_norm": 0.3366320729255676, + "learning_rate": 1.1039937867974166e-06, + "loss": 34.8019, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 522 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.7919742570509181, + "grad_norm": 0.37444350123405457, + "learning_rate": 1.0886574429959185e-06, + "loss": 34.7976, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 523 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.7934885481733863, + "grad_norm": 0.3912818133831024, + "learning_rate": 1.0734153455962765e-06, + "loss": 34.863, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 524 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.7950028392958546, + "grad_norm": 0.3579573333263397, + "learning_rate": 1.058267861866969e-06, + "loss": 34.7847, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 525 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.796517130418323, + "grad_norm": 0.3460308909416199, + "learning_rate": 1.0432153567966985e-06, + "loss": 34.8324, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 526 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.7980314215407912, + "grad_norm": 0.34577476978302, + "learning_rate": 1.0282581930855933e-06, + "loss": 34.8179, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 527 + }, + { + "autoregressive_loss": 0.0874, + "epoch": 0.7995457126632595, + "grad_norm": 0.34840166568756104, + "learning_rate": 1.013396731136465e-06, + "loss": 34.8041, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 528 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.8010600037857278, + "grad_norm": 0.3199150562286377, + "learning_rate": 9.986313290461287e-07, + "loss": 34.8009, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 529 + }, + { + "autoregressive_loss": 0.0854, + "epoch": 0.8025742949081961, + "grad_norm": 0.3381352722644806, + "learning_rate": 9.83962342596776e-07, + "loss": 34.785, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 530 + }, + { + "autoregressive_loss": 0.0933, + "epoch": 0.8040885860306644, + "grad_norm": 0.30624058842658997, + "learning_rate": 9.693901252473953e-07, + "loss": 34.85, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 531 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.8056028771531327, + "grad_norm": 0.36493048071861267, + "learning_rate": 9.549150281252633e-07, + "loss": 34.7849, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 532 + }, + { + "autoregressive_loss": 0.0854, + "epoch": 0.807117168275601, + "grad_norm": 0.34038835763931274, + "learning_rate": 9.405374000174772e-07, + "loss": 34.8016, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 533 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.8086314593980692, + "grad_norm": 0.321810245513916, + "learning_rate": 9.262575873625529e-07, + "loss": 34.8011, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 534 + }, + { + "autoregressive_loss": 0.0786, + "epoch": 0.8101457505205376, + "grad_norm": 0.3652377128601074, + "learning_rate": 9.120759342420821e-07, + "loss": 34.7419, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 535 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.8116600416430059, + "grad_norm": 0.2875250577926636, + "learning_rate": 8.979927823724321e-07, + "loss": 34.7948, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 536 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.8131743327654741, + "grad_norm": 0.35206449031829834, + "learning_rate": 8.840084710965202e-07, + "loss": 34.8017, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 537 + }, + { + "autoregressive_loss": 0.0874, + "epoch": 0.8146886238879425, + "grad_norm": 0.3358929753303528, + "learning_rate": 8.701233373756352e-07, + "loss": 34.7872, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 538 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.8162029150104108, + "grad_norm": 0.3385925889015198, + "learning_rate": 8.563377157813102e-07, + "loss": 34.7945, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 539 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.817717206132879, + "grad_norm": 0.35589683055877686, + "learning_rate": 8.426519384872733e-07, + "loss": 34.8309, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 540 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.8192314972553474, + "grad_norm": 0.3237808644771576, + "learning_rate": 8.290663352614386e-07, + "loss": 34.8463, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 541 + }, + { + "autoregressive_loss": 0.0913, + "epoch": 0.8207457883778156, + "grad_norm": 0.31155073642730713, + "learning_rate": 8.155812334579532e-07, + "loss": 34.818, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 542 + }, + { + "autoregressive_loss": 0.0723, + "epoch": 0.8222600795002839, + "grad_norm": 0.289968878030777, + "learning_rate": 8.021969580093231e-07, + "loss": 34.6567, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 543 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.8237743706227523, + "grad_norm": 0.36453667283058167, + "learning_rate": 7.88913831418568e-07, + "loss": 34.7904, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 544 + }, + { + "autoregressive_loss": 0.0884, + "epoch": 0.8252886617452205, + "grad_norm": 0.2898712754249573, + "learning_rate": 7.757321737514645e-07, + "loss": 34.8094, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 545 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.8268029528676888, + "grad_norm": 0.3455236852169037, + "learning_rate": 7.626523026288279e-07, + "loss": 34.7994, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 546 + }, + { + "autoregressive_loss": 0.0845, + "epoch": 0.8283172439901572, + "grad_norm": 0.32326579093933105, + "learning_rate": 7.496745332188555e-07, + "loss": 34.7569, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 547 + }, + { + "autoregressive_loss": 0.0781, + "epoch": 0.8298315351126254, + "grad_norm": 0.30615735054016113, + "learning_rate": 7.367991782295392e-07, + "loss": 34.7215, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 548 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.8313458262350937, + "grad_norm": 0.3191310167312622, + "learning_rate": 7.240265479011249e-07, + "loss": 34.8198, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 549 + }, + { + "autoregressive_loss": 0.084, + "epoch": 0.832860117357562, + "grad_norm": 0.32341960072517395, + "learning_rate": 7.113569499986401e-07, + "loss": 34.783, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 550 + }, + { + "autoregressive_loss": 0.0811, + "epoch": 0.8343744084800303, + "grad_norm": 0.31386658549308777, + "learning_rate": 6.987906898044783e-07, + "loss": 34.7414, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 551 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.8358886996024986, + "grad_norm": 0.3214971125125885, + "learning_rate": 6.863280701110409e-07, + "loss": 34.8091, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 552 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.8374029907249668, + "grad_norm": 0.2714722752571106, + "learning_rate": 6.739693912134448e-07, + "loss": 34.7715, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 553 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.8389172818474352, + "grad_norm": 0.3653154969215393, + "learning_rate": 6.617149509022807e-07, + "loss": 34.7847, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 554 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.8404315729699035, + "grad_norm": 0.2869083881378174, + "learning_rate": 6.495650444564433e-07, + "loss": 34.802, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 555 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.8419458640923717, + "grad_norm": 0.3140890300273895, + "learning_rate": 6.375199646360142e-07, + "loss": 34.7929, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 556 + }, + { + "autoregressive_loss": 0.0933, + "epoch": 0.8434601552148401, + "grad_norm": 0.3455820381641388, + "learning_rate": 6.255800016752089e-07, + "loss": 34.8372, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 557 + }, + { + "autoregressive_loss": 0.0874, + "epoch": 0.8449744463373083, + "grad_norm": 0.332675576210022, + "learning_rate": 6.137454432753798e-07, + "loss": 34.8064, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 558 + }, + { + "autoregressive_loss": 0.0874, + "epoch": 0.8464887374597766, + "grad_norm": 0.3277329206466675, + "learning_rate": 6.020165745980855e-07, + "loss": 34.7923, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 559 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.848003028582245, + "grad_norm": 0.33727237582206726, + "learning_rate": 5.903936782582253e-07, + "loss": 34.8194, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 560 + }, + { + "autoregressive_loss": 0.083, + "epoch": 0.8495173197047132, + "grad_norm": 0.26772037148475647, + "learning_rate": 5.78877034317219e-07, + "loss": 34.7612, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 561 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.8510316108271815, + "grad_norm": 0.30933037400245667, + "learning_rate": 5.674669202762684e-07, + "loss": 34.7922, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 562 + }, + { + "autoregressive_loss": 0.0874, + "epoch": 0.8525459019496499, + "grad_norm": 0.32231101393699646, + "learning_rate": 5.561636110696634e-07, + "loss": 34.804, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 563 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.8540601930721181, + "grad_norm": 0.308974027633667, + "learning_rate": 5.449673790581611e-07, + "loss": 34.796, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 564 + }, + { + "autoregressive_loss": 0.084, + "epoch": 0.8555744841945864, + "grad_norm": 0.3265399932861328, + "learning_rate": 5.338784940224239e-07, + "loss": 34.7629, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 565 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.8570887753170547, + "grad_norm": 0.2981053292751312, + "learning_rate": 5.228972231565155e-07, + "loss": 34.7854, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 566 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.858603066439523, + "grad_norm": 0.319325715303421, + "learning_rate": 5.12023831061465e-07, + "loss": 34.8103, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 567 + }, + { + "autoregressive_loss": 0.084, + "epoch": 0.8601173575619913, + "grad_norm": 0.29680633544921875, + "learning_rate": 5.012585797388936e-07, + "loss": 34.7786, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 568 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.8616316486844596, + "grad_norm": 0.33354252576828003, + "learning_rate": 4.906017285846921e-07, + "loss": 34.7734, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 569 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.8631459398069279, + "grad_norm": 0.32951420545578003, + "learning_rate": 4.800535343827834e-07, + "loss": 34.7945, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 570 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.8646602309293961, + "grad_norm": 0.30341294407844543, + "learning_rate": 4.6961425129892655e-07, + "loss": 34.8567, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 571 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.8661745220518645, + "grad_norm": 0.29050469398498535, + "learning_rate": 4.5928413087459325e-07, + "loss": 34.7757, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 572 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.8676888131743328, + "grad_norm": 0.3062998354434967, + "learning_rate": 4.490634220209117e-07, + "loss": 34.806, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 573 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.869203104296801, + "grad_norm": 0.33275094628334045, + "learning_rate": 4.3895237101266195e-07, + "loss": 34.801, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 574 + }, + { + "autoregressive_loss": 0.0967, + "epoch": 0.8707173954192694, + "grad_norm": 0.2958899736404419, + "learning_rate": 4.289512214823466e-07, + "loss": 34.8611, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 575 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.8722316865417377, + "grad_norm": 0.3450949192047119, + "learning_rate": 4.1906021441432074e-07, + "loss": 34.7957, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 576 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.8737459776642059, + "grad_norm": 0.3282499313354492, + "learning_rate": 4.092795881389805e-07, + "loss": 34.7923, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 577 + }, + { + "autoregressive_loss": 0.0845, + "epoch": 0.8752602687866743, + "grad_norm": 0.322277307510376, + "learning_rate": 3.9960957832702594e-07, + "loss": 34.7753, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 578 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.8767745599091425, + "grad_norm": 0.31179720163345337, + "learning_rate": 3.9005041798377827e-07, + "loss": 34.8199, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 579 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.8782888510316108, + "grad_norm": 0.3436933159828186, + "learning_rate": 3.8060233744356634e-07, + "loss": 34.7951, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 580 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.8798031421540792, + "grad_norm": 0.32152727246284485, + "learning_rate": 3.7126556436417993e-07, + "loss": 34.782, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 581 + }, + { + "autoregressive_loss": 0.0845, + "epoch": 0.8813174332765474, + "grad_norm": 0.29785141348838806, + "learning_rate": 3.620403237213799e-07, + "loss": 34.7662, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 582 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.8828317243990157, + "grad_norm": 0.38989323377609253, + "learning_rate": 3.5292683780347834e-07, + "loss": 34.7758, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4258, + "step": 583 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.884346015521484, + "grad_norm": 0.3026503026485443, + "learning_rate": 3.439253262059822e-07, + "loss": 34.8099, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 584 + }, + { + "autoregressive_loss": 0.0894, + "epoch": 0.8858603066439523, + "grad_norm": 0.33773958683013916, + "learning_rate": 3.350360058263058e-07, + "loss": 34.819, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 585 + }, + { + "autoregressive_loss": 0.0762, + "epoch": 0.8873745977664206, + "grad_norm": 0.30930063128471375, + "learning_rate": 3.262590908585378e-07, + "loss": 34.7016, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 586 + }, + { + "autoregressive_loss": 0.0864, + "epoch": 0.8888888888888888, + "grad_norm": 0.33707714080810547, + "learning_rate": 3.1759479278828665e-07, + "loss": 34.7849, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 587 + }, + { + "autoregressive_loss": 0.082, + "epoch": 0.8904031800113572, + "grad_norm": 0.33073893189430237, + "learning_rate": 3.0904332038757977e-07, + "loss": 34.7421, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 588 + }, + { + "autoregressive_loss": 0.082, + "epoch": 0.8919174711338255, + "grad_norm": 0.32578158378601074, + "learning_rate": 3.006048797098349e-07, + "loss": 34.7504, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 589 + }, + { + "autoregressive_loss": 0.0771, + "epoch": 0.8934317622562937, + "grad_norm": 0.3023960292339325, + "learning_rate": 2.9227967408489653e-07, + "loss": 34.6997, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 590 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.8949460533787621, + "grad_norm": 0.3292075991630554, + "learning_rate": 2.8406790411413366e-07, + "loss": 34.8085, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4259, + "step": 591 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.8964603445012304, + "grad_norm": 0.32842153310775757, + "learning_rate": 2.7596976766560977e-07, + "loss": 34.7751, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 592 + }, + { + "autoregressive_loss": 0.0854, + "epoch": 0.8979746356236986, + "grad_norm": 0.29708924889564514, + "learning_rate": 2.6798545986931214e-07, + "loss": 34.7703, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 593 + }, + { + "autoregressive_loss": 0.0806, + "epoch": 0.899488926746167, + "grad_norm": 0.3017195165157318, + "learning_rate": 2.601151731124485e-07, + "loss": 34.7561, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 594 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.9010032178686352, + "grad_norm": 0.30274471640586853, + "learning_rate": 2.523590970348166e-07, + "loss": 34.8448, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 595 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.9025175089911035, + "grad_norm": 0.3421143591403961, + "learning_rate": 2.447174185242324e-07, + "loss": 34.8194, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 596 + }, + { + "autoregressive_loss": 0.0864, + "epoch": 0.9040318001135719, + "grad_norm": 0.3052239716053009, + "learning_rate": 2.3719032171202362e-07, + "loss": 34.7798, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 597 + }, + { + "autoregressive_loss": 0.0835, + "epoch": 0.9055460912360401, + "grad_norm": 0.274460107088089, + "learning_rate": 2.2977798796859796e-07, + "loss": 34.7559, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 598 + }, + { + "autoregressive_loss": 0.0825, + "epoch": 0.9070603823585084, + "grad_norm": 0.2991667091846466, + "learning_rate": 2.2248059589906944e-07, + "loss": 34.7417, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 599 + }, + { + "autoregressive_loss": 0.0918, + "epoch": 0.9085746734809768, + "grad_norm": 0.33313480019569397, + "learning_rate": 2.152983213389559e-07, + "loss": 34.8168, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 600 + }, + { + "autoregressive_loss": 0.0874, + "epoch": 0.910088964603445, + "grad_norm": 0.27085942029953003, + "learning_rate": 2.082313373499434e-07, + "loss": 34.8025, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 601 + }, + { + "autoregressive_loss": 0.082, + "epoch": 0.9116032557259133, + "grad_norm": 0.28070002794265747, + "learning_rate": 2.0127981421571295e-07, + "loss": 34.7836, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4266, + "step": 602 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.9131175468483816, + "grad_norm": 0.2977098226547241, + "learning_rate": 1.9444391943784225e-07, + "loss": 34.7832, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 603 + }, + { + "autoregressive_loss": 0.0903, + "epoch": 0.9146318379708499, + "grad_norm": 0.3303548991680145, + "learning_rate": 1.8772381773176417e-07, + "loss": 34.8125, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 604 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.9161461290933182, + "grad_norm": 0.2439805269241333, + "learning_rate": 1.8111967102280082e-07, + "loss": 34.7779, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 605 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.9176604202157865, + "grad_norm": 0.28860223293304443, + "learning_rate": 1.7463163844226304e-07, + "loss": 34.8024, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 606 + }, + { + "autoregressive_loss": 0.0913, + "epoch": 0.9191747113382548, + "grad_norm": 0.2815372347831726, + "learning_rate": 1.6825987632361373e-07, + "loss": 34.8232, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 607 + }, + { + "autoregressive_loss": 0.0884, + "epoch": 0.920689002460723, + "grad_norm": 0.32012590765953064, + "learning_rate": 1.6200453819870122e-07, + "loss": 34.8145, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 608 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.9222032935831914, + "grad_norm": 0.27548354864120483, + "learning_rate": 1.5586577479406006e-07, + "loss": 34.7864, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 609 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.9237175847056597, + "grad_norm": 0.3104095160961151, + "learning_rate": 1.4984373402728014e-07, + "loss": 34.7872, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 610 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.9252318758281279, + "grad_norm": 0.3133639395236969, + "learning_rate": 1.4393856100344107e-07, + "loss": 34.8324, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 611 + }, + { + "autoregressive_loss": 0.0913, + "epoch": 0.9267461669505963, + "grad_norm": 0.2894345819950104, + "learning_rate": 1.3815039801161723e-07, + "loss": 34.8207, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 612 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.9282604580730646, + "grad_norm": 0.2802545428276062, + "learning_rate": 1.3247938452144727e-07, + "loss": 34.7977, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 613 + }, + { + "autoregressive_loss": 0.0781, + "epoch": 0.9297747491955328, + "grad_norm": 0.2661442458629608, + "learning_rate": 1.26925657179775e-07, + "loss": 34.7359, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 614 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.9312890403180011, + "grad_norm": 0.32268404960632324, + "learning_rate": 1.2148934980735772e-07, + "loss": 34.8009, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 615 + }, + { + "autoregressive_loss": 0.084, + "epoch": 0.9328033314404695, + "grad_norm": 0.30408287048339844, + "learning_rate": 1.1617059339563807e-07, + "loss": 34.7695, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 616 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.9343176225629377, + "grad_norm": 0.2603103220462799, + "learning_rate": 1.1096951610359174e-07, + "loss": 34.773, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 617 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.935831913685406, + "grad_norm": 0.2911682426929474, + "learning_rate": 1.058862432546387e-07, + "loss": 34.7942, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 618 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.9373462048078743, + "grad_norm": 0.26261624693870544, + "learning_rate": 1.0092089733361898e-07, + "loss": 34.7718, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 619 + }, + { + "autoregressive_loss": 0.0811, + "epoch": 0.9388604959303426, + "grad_norm": 0.27117201685905457, + "learning_rate": 9.607359798384785e-08, + "loss": 34.7335, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 620 + }, + { + "autoregressive_loss": 0.083, + "epoch": 0.9403747870528109, + "grad_norm": 0.2801746726036072, + "learning_rate": 9.134446200422919e-08, + "loss": 34.7531, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 621 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.9418890781752792, + "grad_norm": 0.24996960163116455, + "learning_rate": 8.67336033464411e-08, + "loss": 34.8349, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 622 + }, + { + "autoregressive_loss": 0.0752, + "epoch": 0.9434033692977475, + "grad_norm": 0.26471927762031555, + "learning_rate": 8.224113311219251e-08, + "loss": 34.6906, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 623 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.9449176604202157, + "grad_norm": 0.29764389991760254, + "learning_rate": 7.786715955054202e-08, + "loss": 34.8172, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 624 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.9464319515426841, + "grad_norm": 0.3014511168003082, + "learning_rate": 7.36117880552939e-08, + "loss": 34.8138, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 625 + }, + { + "autoregressive_loss": 0.082, + "epoch": 0.9479462426651524, + "grad_norm": 0.26645052433013916, + "learning_rate": 6.947512116245669e-08, + "loss": 34.7609, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 626 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.9494605337876206, + "grad_norm": 0.2837405204772949, + "learning_rate": 6.545725854777086e-08, + "loss": 34.8349, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 627 + }, + { + "autoregressive_loss": 0.085, + "epoch": 0.950974824910089, + "grad_norm": 0.292466402053833, + "learning_rate": 6.15582970243117e-08, + "loss": 34.7758, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 628 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.9524891160325573, + "grad_norm": 0.26005902886390686, + "learning_rate": 5.777833054015025e-08, + "loss": 34.7936, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 629 + }, + { + "autoregressive_loss": 0.0869, + "epoch": 0.9540034071550255, + "grad_norm": 0.30476951599121094, + "learning_rate": 5.411745017609493e-08, + "loss": 34.8084, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4264, + "step": 630 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.9555176982774939, + "grad_norm": 0.2892724573612213, + "learning_rate": 5.0575744143495084e-08, + "loss": 34.8061, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 631 + }, + { + "autoregressive_loss": 0.0884, + "epoch": 0.9570319893999621, + "grad_norm": 0.29837268590927124, + "learning_rate": 4.715329778211375e-08, + "loss": 34.8138, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 632 + }, + { + "autoregressive_loss": 0.0928, + "epoch": 0.9585462805224304, + "grad_norm": 0.2778078615665436, + "learning_rate": 4.3850193558073736e-08, + "loss": 34.8147, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4259, + "step": 633 + }, + { + "autoregressive_loss": 0.0801, + "epoch": 0.9600605716448988, + "grad_norm": 0.26855188608169556, + "learning_rate": 4.0666511061869804e-08, + "loss": 34.7438, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 634 + }, + { + "autoregressive_loss": 0.0894, + "epoch": 0.961574862767367, + "grad_norm": 0.30604061484336853, + "learning_rate": 3.7602327006450166e-08, + "loss": 34.7972, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 635 + }, + { + "autoregressive_loss": 0.0771, + "epoch": 0.9630891538898353, + "grad_norm": 0.2829412817955017, + "learning_rate": 3.465771522536854e-08, + "loss": 34.7121, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 636 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.9646034450123037, + "grad_norm": 0.2582142949104309, + "learning_rate": 3.183274667100611e-08, + "loss": 34.7942, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 637 + }, + { + "autoregressive_loss": 0.0913, + "epoch": 0.9661177361347719, + "grad_norm": 0.27536752820014954, + "learning_rate": 2.9127489412859033e-08, + "loss": 34.8186, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 638 + }, + { + "autoregressive_loss": 0.0898, + "epoch": 0.9676320272572402, + "grad_norm": 0.2607889473438263, + "learning_rate": 2.6542008635902504e-08, + "loss": 34.8145, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 639 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.9691463183797085, + "grad_norm": 0.33923035860061646, + "learning_rate": 2.4076366639015914e-08, + "loss": 34.7835, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 640 + }, + { + "autoregressive_loss": 0.0947, + "epoch": 0.9706606095021768, + "grad_norm": 0.3230699300765991, + "learning_rate": 2.1730622833483484e-08, + "loss": 34.8422, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 641 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.9721749006246451, + "grad_norm": 0.28410279750823975, + "learning_rate": 1.950483374156431e-08, + "loss": 34.7697, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 642 + }, + { + "autoregressive_loss": 0.0835, + "epoch": 0.9736891917471134, + "grad_norm": 0.29380741715431213, + "learning_rate": 1.7399052995126787e-08, + "loss": 34.741, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4259, + "step": 643 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.9752034828695817, + "grad_norm": 0.2831152081489563, + "learning_rate": 1.541333133436018e-08, + "loss": 34.8291, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 644 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.97671777399205, + "grad_norm": 0.2808015048503876, + "learning_rate": 1.3547716606548967e-08, + "loss": 34.7977, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 645 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.9782320651145182, + "grad_norm": 0.27903300523757935, + "learning_rate": 1.18022537649215e-08, + "loss": 34.8204, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 646 + }, + { + "autoregressive_loss": 0.084, + "epoch": 0.9797463562369866, + "grad_norm": 0.30918940901756287, + "learning_rate": 1.0176984867567018e-08, + "loss": 34.754, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 647 + }, + { + "autoregressive_loss": 0.0825, + "epoch": 0.9812606473594548, + "grad_norm": 0.25111234188079834, + "learning_rate": 8.671949076420883e-09, + "loss": 34.7615, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4263, + "step": 648 + }, + { + "autoregressive_loss": 0.0762, + "epoch": 0.9827749384819231, + "grad_norm": 0.26341938972473145, + "learning_rate": 7.2871826563214454e-09, + "loss": 34.7082, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 649 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.9842892296043915, + "grad_norm": 0.2976416349411011, + "learning_rate": 6.022718974137976e-09, + "loss": 34.7768, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4261, + "step": 650 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.9858035207268597, + "grad_norm": 0.2481272667646408, + "learning_rate": 4.878588497964077e-09, + "loss": 34.7931, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 651 + }, + { + "autoregressive_loss": 0.0894, + "epoch": 0.987317811849328, + "grad_norm": 0.27078938484191895, + "learning_rate": 3.854818796385495e-09, + "loss": 34.8071, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 652 + }, + { + "autoregressive_loss": 0.0806, + "epoch": 0.9888321029717964, + "grad_norm": 0.29280775785446167, + "learning_rate": 2.9514345378134357e-09, + "loss": 34.7623, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4265, + "step": 653 + }, + { + "autoregressive_loss": 0.0889, + "epoch": 0.9903463940942646, + "grad_norm": 0.24757161736488342, + "learning_rate": 2.168457489893916e-09, + "loss": 34.8076, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 654 + }, + { + "autoregressive_loss": 0.0908, + "epoch": 0.9918606852167329, + "grad_norm": 0.2919241189956665, + "learning_rate": 1.5059065189787502e-09, + "loss": 34.8076, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 655 + }, + { + "autoregressive_loss": 0.0938, + "epoch": 0.9933749763392012, + "grad_norm": 0.300950825214386, + "learning_rate": 9.637975896759077e-10, + "loss": 34.8301, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 656 + }, + { + "autoregressive_loss": 0.083, + "epoch": 0.9948892674616695, + "grad_norm": 0.27437418699264526, + "learning_rate": 5.421437644598237e-10, + "loss": 34.7561, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4262, + "step": 657 + }, + { + "autoregressive_loss": 0.0854, + "epoch": 0.9964035585841378, + "grad_norm": 0.28754356503486633, + "learning_rate": 2.4095520335998266e-10, + "loss": 34.7589, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4259, + "step": 658 + }, + { + "autoregressive_loss": 0.0879, + "epoch": 0.9979178497066061, + "grad_norm": 0.3117378354072571, + "learning_rate": 6.02391637155586e-11, + "loss": 34.7733, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.4259, + "step": 659 + }, + { + "autoregressive_loss": 0.0859, + "epoch": 0.9994321408290744, + "grad_norm": 0.2710754871368408, + "learning_rate": 0.0, + "loss": 34.7674, + "moe_aux_loss": 0.0, + "probe_diversity_loss": 0.426, + "step": 660 + }, + { + "epoch": 0.9994321408290744, + "step": 660, + "total_flos": 3.7320131833323586e+19, + "train_loss": 37.01489402886593, + "train_runtime": 59387.4366, + "train_samples_per_second": 1.423, + "train_steps_per_second": 0.011 + } + ], + "logging_steps": 1.0, + "max_steps": 660, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5000.0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.7320131833323586e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}