{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03537214443625645, "grad_norm": 3483.533935546875, "learning_rate": 7.692307692307694e-07, "loss": 51.896, "step": 3 }, { "epoch": 0.0707442888725129, "grad_norm": 2646.85693359375, "learning_rate": 1.9230769230769234e-06, "loss": 35.9263, "step": 6 }, { "epoch": 0.10611643330876934, "grad_norm": 452.2362365722656, "learning_rate": 3.0769230769230774e-06, "loss": 6.4672, "step": 9 }, { "epoch": 0.1414885777450258, "grad_norm": 224.29888916015625, "learning_rate": 4.230769230769231e-06, "loss": 2.9289, "step": 12 }, { "epoch": 0.17686072218128224, "grad_norm": 343.0766296386719, "learning_rate": 5.384615384615385e-06, "loss": 2.6691, "step": 15 }, { "epoch": 0.21223286661753868, "grad_norm": 241.66685485839844, "learning_rate": 6.538461538461539e-06, "loss": 2.4547, "step": 18 }, { "epoch": 0.24760501105379515, "grad_norm": 266.5543212890625, "learning_rate": 7.692307692307694e-06, "loss": 2.2923, "step": 21 }, { "epoch": 0.2829771554900516, "grad_norm": 200.54481506347656, "learning_rate": 8.846153846153847e-06, "loss": 3.5719, "step": 24 }, { "epoch": 0.318349299926308, "grad_norm": 223.88656616210938, "learning_rate": 1e-05, "loss": 3.1235, "step": 27 }, { "epoch": 0.3537214443625645, "grad_norm": 289.29449462890625, "learning_rate": 9.999672943258572e-06, "loss": 2.9122, "step": 30 }, { "epoch": 0.38909358879882094, "grad_norm": 152.24334716796875, "learning_rate": 9.998691815820732e-06, "loss": 2.7438, "step": 33 }, { "epoch": 0.42446573323507736, "grad_norm": 43.65974426269531, "learning_rate": 9.997056746040215e-06, "loss": 1.9228, "step": 36 }, { "epoch": 0.45983787767133383, "grad_norm": 115.5759048461914, "learning_rate": 9.994767947821261e-06, "loss": 2.8869, "step": 39 }, { "epoch": 0.4952100221075903, "grad_norm": 72.44747924804688, "learning_rate": 9.991825720590627e-06, "loss": 1.8182, "step": 42 }, { "epoch": 0.5305821665438467, "grad_norm": 49.48904800415039, "learning_rate": 9.988230449258409e-06, "loss": 2.5766, "step": 45 }, { "epoch": 0.5659543109801032, "grad_norm": 53.719970703125, "learning_rate": 9.983982604167699e-06, "loss": 2.5584, "step": 48 }, { "epoch": 0.6013264554163597, "grad_norm": 72.06404113769531, "learning_rate": 9.979082741033047e-06, "loss": 2.0214, "step": 51 }, { "epoch": 0.636698599852616, "grad_norm": 197.82850646972656, "learning_rate": 9.973531500867761e-06, "loss": 2.4626, "step": 54 }, { "epoch": 0.6720707442888725, "grad_norm": 79.25627899169922, "learning_rate": 9.96732960990005e-06, "loss": 1.9808, "step": 57 }, { "epoch": 0.707442888725129, "grad_norm": 38.53336715698242, "learning_rate": 9.96047787947801e-06, "loss": 2.0431, "step": 60 }, { "epoch": 0.7428150331613854, "grad_norm": 106.73336029052734, "learning_rate": 9.952977205963496e-06, "loss": 2.3707, "step": 63 }, { "epoch": 0.7781871775976419, "grad_norm": 106.87248229980469, "learning_rate": 9.94482857061484e-06, "loss": 1.9355, "step": 66 }, { "epoch": 0.8135593220338984, "grad_norm": 30.651411056518555, "learning_rate": 9.936033039458494e-06, "loss": 1.9553, "step": 69 }, { "epoch": 0.8489314664701547, "grad_norm": 50.57050323486328, "learning_rate": 9.92659176314956e-06, "loss": 1.85, "step": 72 }, { "epoch": 0.8843036109064112, "grad_norm": 252.79238891601562, "learning_rate": 9.916505976821262e-06, "loss": 2.2428, "step": 75 }, { "epoch": 0.9196757553426677, "grad_norm": 159.39630126953125, "learning_rate": 9.905776999923369e-06, "loss": 2.4031, "step": 78 }, { "epoch": 0.9550478997789241, "grad_norm": 50.61412048339844, "learning_rate": 9.894406236049569e-06, "loss": 1.9067, "step": 81 }, { "epoch": 0.9904200442151806, "grad_norm": 55.9256706237793, "learning_rate": 9.882395172753852e-06, "loss": 1.8086, "step": 84 }, { "epoch": 1.0, "eval_loss": 0.05901043117046356, "eval_runtime": 22.6695, "eval_samples_per_second": 44.112, "eval_steps_per_second": 22.056, "step": 85 }, { "epoch": 1.023581429624171, "grad_norm": 27.12094497680664, "learning_rate": 9.869745381355906e-06, "loss": 1.3243, "step": 87 }, { "epoch": 1.0589535740604274, "grad_norm": 37.28285217285156, "learning_rate": 9.856458516735558e-06, "loss": 1.1639, "step": 90 }, { "epoch": 1.094325718496684, "grad_norm": 89.23674011230469, "learning_rate": 9.842536317116262e-06, "loss": 1.0754, "step": 93 }, { "epoch": 1.1296978629329404, "grad_norm": 98.14556121826172, "learning_rate": 9.827980603837715e-06, "loss": 1.0517, "step": 96 }, { "epoch": 1.1650700073691969, "grad_norm": 134.01821899414062, "learning_rate": 9.81279328111758e-06, "loss": 0.9979, "step": 99 }, { "epoch": 1.2004421518054533, "grad_norm": 37.08598709106445, "learning_rate": 9.796976335802369e-06, "loss": 1.2897, "step": 102 }, { "epoch": 1.2358142962417096, "grad_norm": 138.39120483398438, "learning_rate": 9.780531837107519e-06, "loss": 1.2761, "step": 105 }, { "epoch": 1.271186440677966, "grad_norm": 99.29012298583984, "learning_rate": 9.763461936346694e-06, "loss": 1.7901, "step": 108 }, { "epoch": 1.3065585851142225, "grad_norm": 117.44629669189453, "learning_rate": 9.745768866650339e-06, "loss": 1.2258, "step": 111 }, { "epoch": 1.341930729550479, "grad_norm": 48.49049377441406, "learning_rate": 9.727454942673544e-06, "loss": 1.4136, "step": 114 }, { "epoch": 1.3773028739867355, "grad_norm": 50.829097747802734, "learning_rate": 9.70852256029323e-06, "loss": 0.8894, "step": 117 }, { "epoch": 1.412675018422992, "grad_norm": 113.78025817871094, "learning_rate": 9.68897419629471e-06, "loss": 1.6923, "step": 120 }, { "epoch": 1.4480471628592484, "grad_norm": 80.82630157470703, "learning_rate": 9.66881240804768e-06, "loss": 1.0206, "step": 123 }, { "epoch": 1.4834193072955049, "grad_norm": 162.958984375, "learning_rate": 9.648039833171639e-06, "loss": 1.9516, "step": 126 }, { "epoch": 1.518791451731761, "grad_norm": 36.6719856262207, "learning_rate": 9.626659189190852e-06, "loss": 0.749, "step": 129 }, { "epoch": 1.5541635961680176, "grad_norm": 94.12837219238281, "learning_rate": 9.60467327317882e-06, "loss": 1.639, "step": 132 }, { "epoch": 1.589535740604274, "grad_norm": 190.69659423828125, "learning_rate": 9.582084961392358e-06, "loss": 1.209, "step": 135 }, { "epoch": 1.6249078850405305, "grad_norm": 52.122528076171875, "learning_rate": 9.55889720889533e-06, "loss": 1.2829, "step": 138 }, { "epoch": 1.660280029476787, "grad_norm": 92.39747619628906, "learning_rate": 9.53511304917204e-06, "loss": 1.3792, "step": 141 }, { "epoch": 1.6956521739130435, "grad_norm": 174.52503967285156, "learning_rate": 9.510735593730402e-06, "loss": 1.9879, "step": 144 }, { "epoch": 1.7310243183493, "grad_norm": 69.46823120117188, "learning_rate": 9.485768031694872e-06, "loss": 1.1063, "step": 147 }, { "epoch": 1.7663964627855564, "grad_norm": 56.77116775512695, "learning_rate": 9.460213629389241e-06, "loss": 1.3528, "step": 150 }, { "epoch": 1.8017686072218129, "grad_norm": 49.163902282714844, "learning_rate": 9.43407572990933e-06, "loss": 1.4749, "step": 153 }, { "epoch": 1.8371407516580693, "grad_norm": 96.97235870361328, "learning_rate": 9.407357752685628e-06, "loss": 1.3238, "step": 156 }, { "epoch": 1.8725128960943258, "grad_norm": 57.185482025146484, "learning_rate": 9.380063193035968e-06, "loss": 1.2848, "step": 159 }, { "epoch": 1.9078850405305823, "grad_norm": 55.79048156738281, "learning_rate": 9.352195621708239e-06, "loss": 1.1498, "step": 162 }, { "epoch": 1.9432571849668387, "grad_norm": 30.209585189819336, "learning_rate": 9.323758684413272e-06, "loss": 1.0718, "step": 165 }, { "epoch": 1.9786293294030952, "grad_norm": 37.25614547729492, "learning_rate": 9.294756101347888e-06, "loss": 0.647, "step": 168 }, { "epoch": 2.0, "eval_loss": 0.046793434768915176, "eval_runtime": 21.4672, "eval_samples_per_second": 46.583, "eval_steps_per_second": 23.291, "step": 170 } ], "logging_steps": 3, "max_steps": 850, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.100963244849234e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }