openthoughts3_code_100k / trainer_state.json
sedrickkeh's picture
Upload trainer_state.json with huggingface_hub
3d16f95 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 950,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005263157894736842,
"grad_norm": 7.792475124460211,
"learning_rate": 8.421052631578948e-07,
"loss": 1.844,
"step": 1
},
{
"epoch": 0.010526315789473684,
"grad_norm": 7.77107258632735,
"learning_rate": 1.6842105263157895e-06,
"loss": 1.8342,
"step": 2
},
{
"epoch": 0.015789473684210527,
"grad_norm": 7.734735147966475,
"learning_rate": 2.5263157894736844e-06,
"loss": 1.8324,
"step": 3
},
{
"epoch": 0.021052631578947368,
"grad_norm": 7.248228363662717,
"learning_rate": 3.368421052631579e-06,
"loss": 1.816,
"step": 4
},
{
"epoch": 0.02631578947368421,
"grad_norm": 5.778539169912221,
"learning_rate": 4.210526315789474e-06,
"loss": 1.7849,
"step": 5
},
{
"epoch": 0.031578947368421054,
"grad_norm": 3.239108511490633,
"learning_rate": 5.052631578947369e-06,
"loss": 1.7388,
"step": 6
},
{
"epoch": 0.03684210526315789,
"grad_norm": 2.659157940796987,
"learning_rate": 5.8947368421052634e-06,
"loss": 1.7195,
"step": 7
},
{
"epoch": 0.042105263157894736,
"grad_norm": 5.788438535248494,
"learning_rate": 6.736842105263158e-06,
"loss": 1.716,
"step": 8
},
{
"epoch": 0.04736842105263158,
"grad_norm": 5.78966752434045,
"learning_rate": 7.578947368421054e-06,
"loss": 1.7132,
"step": 9
},
{
"epoch": 0.05263157894736842,
"grad_norm": 5.662263812934936,
"learning_rate": 8.421052631578948e-06,
"loss": 1.7155,
"step": 10
},
{
"epoch": 0.05789473684210526,
"grad_norm": 4.3906745364224955,
"learning_rate": 9.263157894736842e-06,
"loss": 1.6617,
"step": 11
},
{
"epoch": 0.06315789473684211,
"grad_norm": 3.7326640381933247,
"learning_rate": 1.0105263157894738e-05,
"loss": 1.6634,
"step": 12
},
{
"epoch": 0.06842105263157895,
"grad_norm": 2.669371108068819,
"learning_rate": 1.0947368421052633e-05,
"loss": 1.6463,
"step": 13
},
{
"epoch": 0.07368421052631578,
"grad_norm": 2.080061860580032,
"learning_rate": 1.1789473684210527e-05,
"loss": 1.5925,
"step": 14
},
{
"epoch": 0.07894736842105263,
"grad_norm": 2.245619823013294,
"learning_rate": 1.263157894736842e-05,
"loss": 1.5797,
"step": 15
},
{
"epoch": 0.08421052631578947,
"grad_norm": 2.0053863294690806,
"learning_rate": 1.3473684210526316e-05,
"loss": 1.5613,
"step": 16
},
{
"epoch": 0.08947368421052632,
"grad_norm": 1.9625639332300135,
"learning_rate": 1.4315789473684212e-05,
"loss": 1.5638,
"step": 17
},
{
"epoch": 0.09473684210526316,
"grad_norm": 1.5893875551469467,
"learning_rate": 1.5157894736842107e-05,
"loss": 1.5522,
"step": 18
},
{
"epoch": 0.1,
"grad_norm": 1.5906140921890974,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.5209,
"step": 19
},
{
"epoch": 0.10526315789473684,
"grad_norm": 1.559169647702599,
"learning_rate": 1.6842105263157896e-05,
"loss": 1.4911,
"step": 20
},
{
"epoch": 0.11052631578947368,
"grad_norm": 1.2982321447862488,
"learning_rate": 1.768421052631579e-05,
"loss": 1.4991,
"step": 21
},
{
"epoch": 0.11578947368421053,
"grad_norm": 1.3143336518371307,
"learning_rate": 1.8526315789473684e-05,
"loss": 1.5012,
"step": 22
},
{
"epoch": 0.12105263157894737,
"grad_norm": 1.279000313318411,
"learning_rate": 1.936842105263158e-05,
"loss": 1.5011,
"step": 23
},
{
"epoch": 0.12631578947368421,
"grad_norm": 0.922398830661329,
"learning_rate": 2.0210526315789475e-05,
"loss": 1.4624,
"step": 24
},
{
"epoch": 0.13157894736842105,
"grad_norm": 1.3274580990016782,
"learning_rate": 2.105263157894737e-05,
"loss": 1.4645,
"step": 25
},
{
"epoch": 0.1368421052631579,
"grad_norm": 1.1341340679878056,
"learning_rate": 2.1894736842105266e-05,
"loss": 1.4715,
"step": 26
},
{
"epoch": 0.14210526315789473,
"grad_norm": 1.2628910185228979,
"learning_rate": 2.273684210526316e-05,
"loss": 1.4584,
"step": 27
},
{
"epoch": 0.14736842105263157,
"grad_norm": 1.57433223809749,
"learning_rate": 2.3578947368421054e-05,
"loss": 1.448,
"step": 28
},
{
"epoch": 0.15263157894736842,
"grad_norm": 0.8438420059614518,
"learning_rate": 2.442105263157895e-05,
"loss": 1.4442,
"step": 29
},
{
"epoch": 0.15789473684210525,
"grad_norm": 1.280060736418443,
"learning_rate": 2.526315789473684e-05,
"loss": 1.4577,
"step": 30
},
{
"epoch": 0.1631578947368421,
"grad_norm": 1.8388712899282178,
"learning_rate": 2.610526315789474e-05,
"loss": 1.4354,
"step": 31
},
{
"epoch": 0.16842105263157894,
"grad_norm": 1.1283525934214154,
"learning_rate": 2.6947368421052632e-05,
"loss": 1.4458,
"step": 32
},
{
"epoch": 0.1736842105263158,
"grad_norm": 1.559904631956879,
"learning_rate": 2.778947368421053e-05,
"loss": 1.4337,
"step": 33
},
{
"epoch": 0.17894736842105263,
"grad_norm": 1.433428530147804,
"learning_rate": 2.8631578947368423e-05,
"loss": 1.4311,
"step": 34
},
{
"epoch": 0.18421052631578946,
"grad_norm": 1.4475127262626666,
"learning_rate": 2.9473684210526317e-05,
"loss": 1.4296,
"step": 35
},
{
"epoch": 0.18947368421052632,
"grad_norm": 1.6258635588736965,
"learning_rate": 3.0315789473684214e-05,
"loss": 1.403,
"step": 36
},
{
"epoch": 0.19473684210526315,
"grad_norm": 0.9140879296838869,
"learning_rate": 3.1157894736842105e-05,
"loss": 1.4149,
"step": 37
},
{
"epoch": 0.2,
"grad_norm": 1.355462077600805,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.4241,
"step": 38
},
{
"epoch": 0.20526315789473684,
"grad_norm": 2.025270623377536,
"learning_rate": 3.28421052631579e-05,
"loss": 1.4269,
"step": 39
},
{
"epoch": 0.21052631578947367,
"grad_norm": 1.5911005582582893,
"learning_rate": 3.368421052631579e-05,
"loss": 1.4261,
"step": 40
},
{
"epoch": 0.21578947368421053,
"grad_norm": 1.7736374990816877,
"learning_rate": 3.452631578947369e-05,
"loss": 1.4248,
"step": 41
},
{
"epoch": 0.22105263157894736,
"grad_norm": 2.020103172778917,
"learning_rate": 3.536842105263158e-05,
"loss": 1.4019,
"step": 42
},
{
"epoch": 0.22631578947368422,
"grad_norm": 2.1032904325246693,
"learning_rate": 3.621052631578948e-05,
"loss": 1.4138,
"step": 43
},
{
"epoch": 0.23157894736842105,
"grad_norm": 2.034839240989353,
"learning_rate": 3.705263157894737e-05,
"loss": 1.4099,
"step": 44
},
{
"epoch": 0.23684210526315788,
"grad_norm": 1.5310564952941104,
"learning_rate": 3.789473684210526e-05,
"loss": 1.419,
"step": 45
},
{
"epoch": 0.24210526315789474,
"grad_norm": 2.3582192444588594,
"learning_rate": 3.873684210526316e-05,
"loss": 1.3989,
"step": 46
},
{
"epoch": 0.24736842105263157,
"grad_norm": 1.2404229618115798,
"learning_rate": 3.9578947368421056e-05,
"loss": 1.4034,
"step": 47
},
{
"epoch": 0.25263157894736843,
"grad_norm": 2.631335015977353,
"learning_rate": 4.042105263157895e-05,
"loss": 1.4067,
"step": 48
},
{
"epoch": 0.2578947368421053,
"grad_norm": 1.7741169104229972,
"learning_rate": 4.126315789473685e-05,
"loss": 1.3842,
"step": 49
},
{
"epoch": 0.2631578947368421,
"grad_norm": 2.7958288897467813,
"learning_rate": 4.210526315789474e-05,
"loss": 1.4165,
"step": 50
},
{
"epoch": 0.26842105263157895,
"grad_norm": 2.106018978323054,
"learning_rate": 4.294736842105264e-05,
"loss": 1.408,
"step": 51
},
{
"epoch": 0.2736842105263158,
"grad_norm": 2.1854136752650284,
"learning_rate": 4.378947368421053e-05,
"loss": 1.4039,
"step": 52
},
{
"epoch": 0.2789473684210526,
"grad_norm": 2.1320541034792897,
"learning_rate": 4.463157894736842e-05,
"loss": 1.3792,
"step": 53
},
{
"epoch": 0.28421052631578947,
"grad_norm": 2.1929241850886183,
"learning_rate": 4.547368421052632e-05,
"loss": 1.405,
"step": 54
},
{
"epoch": 0.2894736842105263,
"grad_norm": 2.4083834163686406,
"learning_rate": 4.6315789473684214e-05,
"loss": 1.397,
"step": 55
},
{
"epoch": 0.29473684210526313,
"grad_norm": 1.8900388617787558,
"learning_rate": 4.715789473684211e-05,
"loss": 1.398,
"step": 56
},
{
"epoch": 0.3,
"grad_norm": 1.8869758461876107,
"learning_rate": 4.8e-05,
"loss": 1.399,
"step": 57
},
{
"epoch": 0.30526315789473685,
"grad_norm": 1.9387939885602292,
"learning_rate": 4.88421052631579e-05,
"loss": 1.4029,
"step": 58
},
{
"epoch": 0.3105263157894737,
"grad_norm": 2.8691231917123,
"learning_rate": 4.9684210526315796e-05,
"loss": 1.3774,
"step": 59
},
{
"epoch": 0.3157894736842105,
"grad_norm": 2.023114295041414,
"learning_rate": 5.052631578947368e-05,
"loss": 1.3925,
"step": 60
},
{
"epoch": 0.32105263157894737,
"grad_norm": 1.8266981690923911,
"learning_rate": 5.136842105263158e-05,
"loss": 1.3752,
"step": 61
},
{
"epoch": 0.3263157894736842,
"grad_norm": 2.749570229085121,
"learning_rate": 5.221052631578948e-05,
"loss": 1.3846,
"step": 62
},
{
"epoch": 0.33157894736842103,
"grad_norm": 1.577733284649954,
"learning_rate": 5.305263157894737e-05,
"loss": 1.3983,
"step": 63
},
{
"epoch": 0.3368421052631579,
"grad_norm": 3.4308320005068897,
"learning_rate": 5.3894736842105265e-05,
"loss": 1.3898,
"step": 64
},
{
"epoch": 0.34210526315789475,
"grad_norm": 2.7293556039435583,
"learning_rate": 5.4736842105263165e-05,
"loss": 1.3801,
"step": 65
},
{
"epoch": 0.3473684210526316,
"grad_norm": 2.7986318090963036,
"learning_rate": 5.557894736842106e-05,
"loss": 1.3929,
"step": 66
},
{
"epoch": 0.3526315789473684,
"grad_norm": 2.5730126104338407,
"learning_rate": 5.642105263157895e-05,
"loss": 1.382,
"step": 67
},
{
"epoch": 0.35789473684210527,
"grad_norm": 2.467033189056739,
"learning_rate": 5.726315789473685e-05,
"loss": 1.3738,
"step": 68
},
{
"epoch": 0.3631578947368421,
"grad_norm": 1.5266025457633567,
"learning_rate": 5.810526315789475e-05,
"loss": 1.394,
"step": 69
},
{
"epoch": 0.3684210526315789,
"grad_norm": 2.1446942030427567,
"learning_rate": 5.8947368421052634e-05,
"loss": 1.3845,
"step": 70
},
{
"epoch": 0.3736842105263158,
"grad_norm": 2.3417890559923986,
"learning_rate": 5.978947368421053e-05,
"loss": 1.3841,
"step": 71
},
{
"epoch": 0.37894736842105264,
"grad_norm": 1.9604854202915063,
"learning_rate": 6.063157894736843e-05,
"loss": 1.3834,
"step": 72
},
{
"epoch": 0.38421052631578945,
"grad_norm": 2.4507109733612578,
"learning_rate": 6.147368421052632e-05,
"loss": 1.4071,
"step": 73
},
{
"epoch": 0.3894736842105263,
"grad_norm": 3.5995720583445063,
"learning_rate": 6.231578947368421e-05,
"loss": 1.3704,
"step": 74
},
{
"epoch": 0.39473684210526316,
"grad_norm": 2.0182422302151797,
"learning_rate": 6.315789473684212e-05,
"loss": 1.379,
"step": 75
},
{
"epoch": 0.4,
"grad_norm": 4.1134652051924,
"learning_rate": 6.400000000000001e-05,
"loss": 1.3788,
"step": 76
},
{
"epoch": 0.4052631578947368,
"grad_norm": 2.929511327190635,
"learning_rate": 6.484210526315789e-05,
"loss": 1.3914,
"step": 77
},
{
"epoch": 0.4105263157894737,
"grad_norm": 3.5649183111031406,
"learning_rate": 6.56842105263158e-05,
"loss": 1.3715,
"step": 78
},
{
"epoch": 0.41578947368421054,
"grad_norm": 3.365859410878661,
"learning_rate": 6.652631578947369e-05,
"loss": 1.3986,
"step": 79
},
{
"epoch": 0.42105263157894735,
"grad_norm": 3.304005403851428,
"learning_rate": 6.736842105263159e-05,
"loss": 1.3783,
"step": 80
},
{
"epoch": 0.4263157894736842,
"grad_norm": 2.894847638309193,
"learning_rate": 6.821052631578948e-05,
"loss": 1.3802,
"step": 81
},
{
"epoch": 0.43157894736842106,
"grad_norm": 2.698808992917365,
"learning_rate": 6.905263157894737e-05,
"loss": 1.3761,
"step": 82
},
{
"epoch": 0.4368421052631579,
"grad_norm": 2.163408526778829,
"learning_rate": 6.989473684210527e-05,
"loss": 1.3911,
"step": 83
},
{
"epoch": 0.4421052631578947,
"grad_norm": 3.2099232827707014,
"learning_rate": 7.073684210526316e-05,
"loss": 1.3926,
"step": 84
},
{
"epoch": 0.4473684210526316,
"grad_norm": 2.475621296000252,
"learning_rate": 7.157894736842105e-05,
"loss": 1.3932,
"step": 85
},
{
"epoch": 0.45263157894736844,
"grad_norm": 2.9203370802467936,
"learning_rate": 7.242105263157896e-05,
"loss": 1.3822,
"step": 86
},
{
"epoch": 0.45789473684210524,
"grad_norm": 2.3862423279450393,
"learning_rate": 7.326315789473684e-05,
"loss": 1.3721,
"step": 87
},
{
"epoch": 0.4631578947368421,
"grad_norm": 3.275399501836966,
"learning_rate": 7.410526315789474e-05,
"loss": 1.4002,
"step": 88
},
{
"epoch": 0.46842105263157896,
"grad_norm": 2.5587038178412533,
"learning_rate": 7.494736842105264e-05,
"loss": 1.3812,
"step": 89
},
{
"epoch": 0.47368421052631576,
"grad_norm": 3.0292582788342375,
"learning_rate": 7.578947368421052e-05,
"loss": 1.3679,
"step": 90
},
{
"epoch": 0.4789473684210526,
"grad_norm": 2.4725567771769366,
"learning_rate": 7.663157894736843e-05,
"loss": 1.3825,
"step": 91
},
{
"epoch": 0.4842105263157895,
"grad_norm": 2.983972110527992,
"learning_rate": 7.747368421052633e-05,
"loss": 1.3727,
"step": 92
},
{
"epoch": 0.48947368421052634,
"grad_norm": 2.498982942091222,
"learning_rate": 7.831578947368422e-05,
"loss": 1.3769,
"step": 93
},
{
"epoch": 0.49473684210526314,
"grad_norm": 3.526109045872143,
"learning_rate": 7.915789473684211e-05,
"loss": 1.3788,
"step": 94
},
{
"epoch": 0.5,
"grad_norm": 3.1483584936822284,
"learning_rate": 8e-05,
"loss": 1.3643,
"step": 95
},
{
"epoch": 0.5052631578947369,
"grad_norm": 2.0080980217269517,
"learning_rate": 7.999972997932227e-05,
"loss": 1.3846,
"step": 96
},
{
"epoch": 0.5105263157894737,
"grad_norm": 4.353229694894079,
"learning_rate": 7.999891992093464e-05,
"loss": 1.3787,
"step": 97
},
{
"epoch": 0.5157894736842106,
"grad_norm": 2.878217785277169,
"learning_rate": 7.999756983577373e-05,
"loss": 1.3695,
"step": 98
},
{
"epoch": 0.5210526315789473,
"grad_norm": 1.962122819070823,
"learning_rate": 7.999567974206707e-05,
"loss": 1.364,
"step": 99
},
{
"epoch": 0.5263157894736842,
"grad_norm": 3.9822305254995887,
"learning_rate": 7.999324966533291e-05,
"loss": 1.3928,
"step": 100
},
{
"epoch": 0.531578947368421,
"grad_norm": 2.4880539516369713,
"learning_rate": 7.999027963837979e-05,
"loss": 1.3656,
"step": 101
},
{
"epoch": 0.5368421052631579,
"grad_norm": 4.910309768750227,
"learning_rate": 7.998676970130614e-05,
"loss": 1.3802,
"step": 102
},
{
"epoch": 0.5421052631578948,
"grad_norm": 2.905093024244942,
"learning_rate": 7.998271990149972e-05,
"loss": 1.3731,
"step": 103
},
{
"epoch": 0.5473684210526316,
"grad_norm": 3.8609393179697284,
"learning_rate": 7.997813029363704e-05,
"loss": 1.4037,
"step": 104
},
{
"epoch": 0.5526315789473685,
"grad_norm": 3.6672144152714865,
"learning_rate": 7.997300093968255e-05,
"loss": 1.3739,
"step": 105
},
{
"epoch": 0.5578947368421052,
"grad_norm": 3.0897791527691,
"learning_rate": 7.996733190888783e-05,
"loss": 1.3729,
"step": 106
},
{
"epoch": 0.5631578947368421,
"grad_norm": 3.0709819435052794,
"learning_rate": 7.996112327779065e-05,
"loss": 1.3735,
"step": 107
},
{
"epoch": 0.5684210526315789,
"grad_norm": 2.4110812616502053,
"learning_rate": 7.995437513021393e-05,
"loss": 1.3625,
"step": 108
},
{
"epoch": 0.5736842105263158,
"grad_norm": 3.20735512285491,
"learning_rate": 7.994708755726469e-05,
"loss": 1.3646,
"step": 109
},
{
"epoch": 0.5789473684210527,
"grad_norm": 3.410126968058674,
"learning_rate": 7.993926065733265e-05,
"loss": 1.3828,
"step": 110
},
{
"epoch": 0.5842105263157895,
"grad_norm": 1.9981274214741556,
"learning_rate": 7.993089453608908e-05,
"loss": 1.3614,
"step": 111
},
{
"epoch": 0.5894736842105263,
"grad_norm": 3.8348881514308104,
"learning_rate": 7.992198930648527e-05,
"loss": 1.366,
"step": 112
},
{
"epoch": 0.5947368421052631,
"grad_norm": 3.6103553587856188,
"learning_rate": 7.991254508875098e-05,
"loss": 1.3797,
"step": 113
},
{
"epoch": 0.6,
"grad_norm": 2.545996054998508,
"learning_rate": 7.990256201039297e-05,
"loss": 1.37,
"step": 114
},
{
"epoch": 0.6052631578947368,
"grad_norm": 2.9285662609146597,
"learning_rate": 7.98920402061931e-05,
"loss": 1.3691,
"step": 115
},
{
"epoch": 0.6105263157894737,
"grad_norm": 3.497818597459857,
"learning_rate": 7.988097981820659e-05,
"loss": 1.3724,
"step": 116
},
{
"epoch": 0.6157894736842106,
"grad_norm": 1.955436154771224,
"learning_rate": 7.986938099576015e-05,
"loss": 1.3553,
"step": 117
},
{
"epoch": 0.6210526315789474,
"grad_norm": 2.024911219916847,
"learning_rate": 7.985724389544982e-05,
"loss": 1.3736,
"step": 118
},
{
"epoch": 0.6263157894736842,
"grad_norm": 2.726230215863742,
"learning_rate": 7.984456868113905e-05,
"loss": 1.3666,
"step": 119
},
{
"epoch": 0.631578947368421,
"grad_norm": 1.8902008783219175,
"learning_rate": 7.98313555239563e-05,
"loss": 1.358,
"step": 120
},
{
"epoch": 0.6368421052631579,
"grad_norm": 3.3343114876725215,
"learning_rate": 7.98176046022929e-05,
"loss": 1.3674,
"step": 121
},
{
"epoch": 0.6421052631578947,
"grad_norm": 2.8760888936249716,
"learning_rate": 7.980331610180046e-05,
"loss": 1.3598,
"step": 122
},
{
"epoch": 0.6473684210526316,
"grad_norm": 1.821327982649168,
"learning_rate": 7.978849021538855e-05,
"loss": 1.3559,
"step": 123
},
{
"epoch": 0.6526315789473685,
"grad_norm": 2.1330475395816038,
"learning_rate": 7.977312714322193e-05,
"loss": 1.3529,
"step": 124
},
{
"epoch": 0.6578947368421053,
"grad_norm": 2.6018688330295032,
"learning_rate": 7.975722709271799e-05,
"loss": 1.3537,
"step": 125
},
{
"epoch": 0.6631578947368421,
"grad_norm": 2.8283770858028143,
"learning_rate": 7.974079027854382e-05,
"loss": 1.3591,
"step": 126
},
{
"epoch": 0.6684210526315789,
"grad_norm": 2.0562890332720625,
"learning_rate": 7.972381692261343e-05,
"loss": 1.3523,
"step": 127
},
{
"epoch": 0.6736842105263158,
"grad_norm": 2.677519447523674,
"learning_rate": 7.970630725408467e-05,
"loss": 1.3588,
"step": 128
},
{
"epoch": 0.6789473684210526,
"grad_norm": 3.39520049259372,
"learning_rate": 7.968826150935615e-05,
"loss": 1.357,
"step": 129
},
{
"epoch": 0.6842105263157895,
"grad_norm": 1.326798562942608,
"learning_rate": 7.96696799320641e-05,
"loss": 1.3547,
"step": 130
},
{
"epoch": 0.6894736842105263,
"grad_norm": 5.713227145762471,
"learning_rate": 7.965056277307902e-05,
"loss": 1.405,
"step": 131
},
{
"epoch": 0.6947368421052632,
"grad_norm": 4.708892369834835,
"learning_rate": 7.963091029050231e-05,
"loss": 1.4096,
"step": 132
},
{
"epoch": 0.7,
"grad_norm": 3.526071821361426,
"learning_rate": 7.961072274966282e-05,
"loss": 1.3766,
"step": 133
},
{
"epoch": 0.7052631578947368,
"grad_norm": 3.421098464992638,
"learning_rate": 7.95900004231132e-05,
"loss": 1.372,
"step": 134
},
{
"epoch": 0.7105263157894737,
"grad_norm": 3.1868582792498468,
"learning_rate": 7.956874359062632e-05,
"loss": 1.3742,
"step": 135
},
{
"epoch": 0.7157894736842105,
"grad_norm": 2.5369692116526354,
"learning_rate": 7.954695253919138e-05,
"loss": 1.38,
"step": 136
},
{
"epoch": 0.7210526315789474,
"grad_norm": 4.029612203074803,
"learning_rate": 7.952462756301007e-05,
"loss": 1.3789,
"step": 137
},
{
"epoch": 0.7263157894736842,
"grad_norm": 3.071634072769698,
"learning_rate": 7.95017689634927e-05,
"loss": 1.3692,
"step": 138
},
{
"epoch": 0.7315789473684211,
"grad_norm": 3.38738695405503,
"learning_rate": 7.947837704925396e-05,
"loss": 1.3692,
"step": 139
},
{
"epoch": 0.7368421052631579,
"grad_norm": 2.900109575705123,
"learning_rate": 7.94544521361089e-05,
"loss": 1.3851,
"step": 140
},
{
"epoch": 0.7421052631578947,
"grad_norm": 2.7866463709429903,
"learning_rate": 7.942999454706858e-05,
"loss": 1.3797,
"step": 141
},
{
"epoch": 0.7473684210526316,
"grad_norm": 2.053753694637562,
"learning_rate": 7.940500461233572e-05,
"loss": 1.3697,
"step": 142
},
{
"epoch": 0.7526315789473684,
"grad_norm": 2.9697850807629464,
"learning_rate": 7.93794826693003e-05,
"loss": 1.349,
"step": 143
},
{
"epoch": 0.7578947368421053,
"grad_norm": 2.370919157592963,
"learning_rate": 7.935342906253492e-05,
"loss": 1.3556,
"step": 144
},
{
"epoch": 0.7631578947368421,
"grad_norm": 3.3074201358634125,
"learning_rate": 7.932684414379021e-05,
"loss": 1.3656,
"step": 145
},
{
"epoch": 0.7684210526315789,
"grad_norm": 2.2648377858473605,
"learning_rate": 7.929972827199006e-05,
"loss": 1.3704,
"step": 146
},
{
"epoch": 0.7736842105263158,
"grad_norm": 2.835496161570207,
"learning_rate": 7.927208181322679e-05,
"loss": 1.3466,
"step": 147
},
{
"epoch": 0.7789473684210526,
"grad_norm": 2.343363991518181,
"learning_rate": 7.924390514075616e-05,
"loss": 1.3726,
"step": 148
},
{
"epoch": 0.7842105263157895,
"grad_norm": 2.7560515706025455,
"learning_rate": 7.921519863499239e-05,
"loss": 1.3626,
"step": 149
},
{
"epoch": 0.7894736842105263,
"grad_norm": 2.4007554609918564,
"learning_rate": 7.918596268350296e-05,
"loss": 1.3587,
"step": 150
},
{
"epoch": 0.7947368421052632,
"grad_norm": 2.5965741897469896,
"learning_rate": 7.915619768100348e-05,
"loss": 1.3813,
"step": 151
},
{
"epoch": 0.8,
"grad_norm": 2.0691402568946606,
"learning_rate": 7.912590402935223e-05,
"loss": 1.3466,
"step": 152
},
{
"epoch": 0.8052631578947368,
"grad_norm": 2.682134570279511,
"learning_rate": 7.909508213754484e-05,
"loss": 1.3484,
"step": 153
},
{
"epoch": 0.8105263157894737,
"grad_norm": 2.961951846828786,
"learning_rate": 7.906373242170872e-05,
"loss": 1.356,
"step": 154
},
{
"epoch": 0.8157894736842105,
"grad_norm": 1.647563803925644,
"learning_rate": 7.903185530509743e-05,
"loss": 1.3314,
"step": 155
},
{
"epoch": 0.8210526315789474,
"grad_norm": 1.7184813388266553,
"learning_rate": 7.899945121808501e-05,
"loss": 1.3521,
"step": 156
},
{
"epoch": 0.8263157894736842,
"grad_norm": 2.795936841854527,
"learning_rate": 7.896652059816015e-05,
"loss": 1.3635,
"step": 157
},
{
"epoch": 0.8315789473684211,
"grad_norm": 3.0399857654385034,
"learning_rate": 7.893306388992023e-05,
"loss": 1.3619,
"step": 158
},
{
"epoch": 0.8368421052631579,
"grad_norm": 1.5156478386100931,
"learning_rate": 7.889908154506545e-05,
"loss": 1.332,
"step": 159
},
{
"epoch": 0.8421052631578947,
"grad_norm": 2.299123403411113,
"learning_rate": 7.886457402239256e-05,
"loss": 1.351,
"step": 160
},
{
"epoch": 0.8473684210526315,
"grad_norm": 2.307814996765452,
"learning_rate": 7.88295417877888e-05,
"loss": 1.3565,
"step": 161
},
{
"epoch": 0.8526315789473684,
"grad_norm": 3.2987572063667643,
"learning_rate": 7.879398531422558e-05,
"loss": 1.3719,
"step": 162
},
{
"epoch": 0.8578947368421053,
"grad_norm": 1.845792873065677,
"learning_rate": 7.875790508175202e-05,
"loss": 1.3384,
"step": 163
},
{
"epoch": 0.8631578947368421,
"grad_norm": 2.4609311250378942,
"learning_rate": 7.87213015774886e-05,
"loss": 1.3633,
"step": 164
},
{
"epoch": 0.868421052631579,
"grad_norm": 2.693256637820666,
"learning_rate": 7.868417529562043e-05,
"loss": 1.3639,
"step": 165
},
{
"epoch": 0.8736842105263158,
"grad_norm": 1.5086514518899445,
"learning_rate": 7.864652673739073e-05,
"loss": 1.3615,
"step": 166
},
{
"epoch": 0.8789473684210526,
"grad_norm": 3.1066465037527147,
"learning_rate": 7.860835641109395e-05,
"loss": 1.3507,
"step": 167
},
{
"epoch": 0.8842105263157894,
"grad_norm": 2.327552820376472,
"learning_rate": 7.856966483206897e-05,
"loss": 1.3458,
"step": 168
},
{
"epoch": 0.8894736842105263,
"grad_norm": 2.808477876459289,
"learning_rate": 7.853045252269208e-05,
"loss": 1.3601,
"step": 169
},
{
"epoch": 0.8947368421052632,
"grad_norm": 2.544696040692953,
"learning_rate": 7.849072001237001e-05,
"loss": 1.3529,
"step": 170
},
{
"epoch": 0.9,
"grad_norm": 2.9946344095632575,
"learning_rate": 7.845046783753276e-05,
"loss": 1.3612,
"step": 171
},
{
"epoch": 0.9052631578947369,
"grad_norm": 2.1670615109125646,
"learning_rate": 7.840969654162627e-05,
"loss": 1.3403,
"step": 172
},
{
"epoch": 0.9105263157894737,
"grad_norm": 2.4274632796911324,
"learning_rate": 7.83684066751052e-05,
"loss": 1.3492,
"step": 173
},
{
"epoch": 0.9157894736842105,
"grad_norm": 2.2078510708150416,
"learning_rate": 7.832659879542544e-05,
"loss": 1.3322,
"step": 174
},
{
"epoch": 0.9210526315789473,
"grad_norm": 2.834266147883312,
"learning_rate": 7.828427346703657e-05,
"loss": 1.3658,
"step": 175
},
{
"epoch": 0.9263157894736842,
"grad_norm": 2.5098411818246156,
"learning_rate": 7.824143126137431e-05,
"loss": 1.3343,
"step": 176
},
{
"epoch": 0.9315789473684211,
"grad_norm": 1.9856802860400529,
"learning_rate": 7.819807275685272e-05,
"loss": 1.3408,
"step": 177
},
{
"epoch": 0.9368421052631579,
"grad_norm": 2.4068321545997717,
"learning_rate": 7.815419853885644e-05,
"loss": 1.3482,
"step": 178
},
{
"epoch": 0.9421052631578948,
"grad_norm": 2.0424531148819507,
"learning_rate": 7.810980919973277e-05,
"loss": 1.3492,
"step": 179
},
{
"epoch": 0.9473684210526315,
"grad_norm": 2.88855449179049,
"learning_rate": 7.806490533878368e-05,
"loss": 1.3409,
"step": 180
},
{
"epoch": 0.9526315789473684,
"grad_norm": 2.2025647302444593,
"learning_rate": 7.801948756225772e-05,
"loss": 1.3552,
"step": 181
},
{
"epoch": 0.9578947368421052,
"grad_norm": 1.705774295065343,
"learning_rate": 7.797355648334185e-05,
"loss": 1.3298,
"step": 182
},
{
"epoch": 0.9631578947368421,
"grad_norm": 2.0591090166453907,
"learning_rate": 7.792711272215308e-05,
"loss": 1.3234,
"step": 183
},
{
"epoch": 0.968421052631579,
"grad_norm": 2.543058246007598,
"learning_rate": 7.788015690573025e-05,
"loss": 1.3454,
"step": 184
},
{
"epoch": 0.9736842105263158,
"grad_norm": 2.7945291673690273,
"learning_rate": 7.783268966802539e-05,
"loss": 1.3623,
"step": 185
},
{
"epoch": 0.9789473684210527,
"grad_norm": 1.2623547103194999,
"learning_rate": 7.778471164989532e-05,
"loss": 1.3253,
"step": 186
},
{
"epoch": 0.9842105263157894,
"grad_norm": 2.8671346135920555,
"learning_rate": 7.773622349909285e-05,
"loss": 1.3516,
"step": 187
},
{
"epoch": 0.9894736842105263,
"grad_norm": 2.37666359514784,
"learning_rate": 7.768722587025818e-05,
"loss": 1.333,
"step": 188
},
{
"epoch": 0.9947368421052631,
"grad_norm": 2.0642846376852697,
"learning_rate": 7.763771942490995e-05,
"loss": 1.3514,
"step": 189
},
{
"epoch": 1.0,
"grad_norm": 1.7892483033365818,
"learning_rate": 7.758770483143634e-05,
"loss": 1.3383,
"step": 190
},
{
"epoch": 1.0052631578947369,
"grad_norm": 2.3383926476340875,
"learning_rate": 7.753718276508609e-05,
"loss": 1.3296,
"step": 191
},
{
"epoch": 1.0105263157894737,
"grad_norm": 2.4843645129343686,
"learning_rate": 7.748615390795932e-05,
"loss": 1.3271,
"step": 192
},
{
"epoch": 1.0157894736842106,
"grad_norm": 2.418044856395934,
"learning_rate": 7.743461894899837e-05,
"loss": 1.3272,
"step": 193
},
{
"epoch": 1.0210526315789474,
"grad_norm": 1.5738686856678197,
"learning_rate": 7.738257858397844e-05,
"loss": 1.3345,
"step": 194
},
{
"epoch": 1.0263157894736843,
"grad_norm": 2.648547362628862,
"learning_rate": 7.733003351549829e-05,
"loss": 1.3334,
"step": 195
},
{
"epoch": 1.0315789473684212,
"grad_norm": 1.882461821114559,
"learning_rate": 7.727698445297066e-05,
"loss": 1.3129,
"step": 196
},
{
"epoch": 1.0368421052631578,
"grad_norm": 2.4800191556167586,
"learning_rate": 7.722343211261274e-05,
"loss": 1.3254,
"step": 197
},
{
"epoch": 1.0421052631578946,
"grad_norm": 2.23843270259424,
"learning_rate": 7.71693772174365e-05,
"loss": 1.3273,
"step": 198
},
{
"epoch": 1.0473684210526315,
"grad_norm": 1.9793326670930025,
"learning_rate": 7.71148204972389e-05,
"loss": 1.3286,
"step": 199
},
{
"epoch": 1.0526315789473684,
"grad_norm": 2.1730809011656502,
"learning_rate": 7.705976268859207e-05,
"loss": 1.3245,
"step": 200
},
{
"epoch": 1.0578947368421052,
"grad_norm": 2.185445170389663,
"learning_rate": 7.700420453483336e-05,
"loss": 1.3222,
"step": 201
},
{
"epoch": 1.063157894736842,
"grad_norm": 2.3909717751217903,
"learning_rate": 7.694814678605528e-05,
"loss": 1.325,
"step": 202
},
{
"epoch": 1.068421052631579,
"grad_norm": 1.923075293942803,
"learning_rate": 7.68915901990954e-05,
"loss": 1.3107,
"step": 203
},
{
"epoch": 1.0736842105263158,
"grad_norm": 1.7281637350526677,
"learning_rate": 7.683453553752611e-05,
"loss": 1.3252,
"step": 204
},
{
"epoch": 1.0789473684210527,
"grad_norm": 3.2565617242431055,
"learning_rate": 7.677698357164431e-05,
"loss": 1.3269,
"step": 205
},
{
"epoch": 1.0842105263157895,
"grad_norm": 1.2312572256380077,
"learning_rate": 7.671893507846109e-05,
"loss": 1.3208,
"step": 206
},
{
"epoch": 1.0894736842105264,
"grad_norm": 2.928938984187919,
"learning_rate": 7.66603908416911e-05,
"loss": 1.3313,
"step": 207
},
{
"epoch": 1.0947368421052632,
"grad_norm": 2.9307098888263026,
"learning_rate": 7.660135165174205e-05,
"loss": 1.3455,
"step": 208
},
{
"epoch": 1.1,
"grad_norm": 1.593997332291012,
"learning_rate": 7.654181830570404e-05,
"loss": 1.3103,
"step": 209
},
{
"epoch": 1.1052631578947367,
"grad_norm": 2.6269588682109952,
"learning_rate": 7.648179160733883e-05,
"loss": 1.3167,
"step": 210
},
{
"epoch": 1.1105263157894736,
"grad_norm": 2.4790660783022043,
"learning_rate": 7.642127236706887e-05,
"loss": 1.3164,
"step": 211
},
{
"epoch": 1.1157894736842104,
"grad_norm": 2.1512908725199544,
"learning_rate": 7.636026140196651e-05,
"loss": 1.3067,
"step": 212
},
{
"epoch": 1.1210526315789473,
"grad_norm": 1.5786680257092611,
"learning_rate": 7.629875953574282e-05,
"loss": 1.3248,
"step": 213
},
{
"epoch": 1.1263157894736842,
"grad_norm": 1.8684394631658845,
"learning_rate": 7.623676759873661e-05,
"loss": 1.3356,
"step": 214
},
{
"epoch": 1.131578947368421,
"grad_norm": 1.83011908562494,
"learning_rate": 7.61742864279031e-05,
"loss": 1.3243,
"step": 215
},
{
"epoch": 1.1368421052631579,
"grad_norm": 2.5453116567442686,
"learning_rate": 7.611131686680272e-05,
"loss": 1.3202,
"step": 216
},
{
"epoch": 1.1421052631578947,
"grad_norm": 1.899610653373559,
"learning_rate": 7.604785976558961e-05,
"loss": 1.3196,
"step": 217
},
{
"epoch": 1.1473684210526316,
"grad_norm": 2.7819319499079143,
"learning_rate": 7.598391598100029e-05,
"loss": 1.3223,
"step": 218
},
{
"epoch": 1.1526315789473685,
"grad_norm": 2.119997221986913,
"learning_rate": 7.591948637634193e-05,
"loss": 1.3304,
"step": 219
},
{
"epoch": 1.1578947368421053,
"grad_norm": 2.4856497917141254,
"learning_rate": 7.585457182148081e-05,
"loss": 1.3036,
"step": 220
},
{
"epoch": 1.1631578947368422,
"grad_norm": 2.1055551992495847,
"learning_rate": 7.578917319283055e-05,
"loss": 1.3269,
"step": 221
},
{
"epoch": 1.168421052631579,
"grad_norm": 2.2379057425917424,
"learning_rate": 7.572329137334023e-05,
"loss": 1.3084,
"step": 222
},
{
"epoch": 1.1736842105263159,
"grad_norm": 2.0190272934745055,
"learning_rate": 7.565692725248254e-05,
"loss": 1.3251,
"step": 223
},
{
"epoch": 1.1789473684210527,
"grad_norm": 1.2213400585349068,
"learning_rate": 7.559008172624174e-05,
"loss": 1.3089,
"step": 224
},
{
"epoch": 1.1842105263157894,
"grad_norm": 3.4238358307196375,
"learning_rate": 7.552275569710152e-05,
"loss": 1.3188,
"step": 225
},
{
"epoch": 1.1894736842105262,
"grad_norm": 1.9434637558797097,
"learning_rate": 7.545495007403287e-05,
"loss": 1.3197,
"step": 226
},
{
"epoch": 1.194736842105263,
"grad_norm": 3.2480882471479164,
"learning_rate": 7.538666577248184e-05,
"loss": 1.3248,
"step": 227
},
{
"epoch": 1.2,
"grad_norm": 2.686792838642632,
"learning_rate": 7.531790371435709e-05,
"loss": 1.3166,
"step": 228
},
{
"epoch": 1.2052631578947368,
"grad_norm": 2.667702689555652,
"learning_rate": 7.524866482801748e-05,
"loss": 1.3118,
"step": 229
},
{
"epoch": 1.2105263157894737,
"grad_norm": 2.0267106721992003,
"learning_rate": 7.517895004825956e-05,
"loss": 1.3311,
"step": 230
},
{
"epoch": 1.2157894736842105,
"grad_norm": 3.195120176439168,
"learning_rate": 7.510876031630496e-05,
"loss": 1.322,
"step": 231
},
{
"epoch": 1.2210526315789474,
"grad_norm": 1.9034234117765794,
"learning_rate": 7.503809657978762e-05,
"loss": 1.3226,
"step": 232
},
{
"epoch": 1.2263157894736842,
"grad_norm": 3.690905599022198,
"learning_rate": 7.496695979274103e-05,
"loss": 1.3255,
"step": 233
},
{
"epoch": 1.231578947368421,
"grad_norm": 3.145636629896195,
"learning_rate": 7.489535091558536e-05,
"loss": 1.3381,
"step": 234
},
{
"epoch": 1.236842105263158,
"grad_norm": 2.5072433312959843,
"learning_rate": 7.48232709151145e-05,
"loss": 1.3219,
"step": 235
},
{
"epoch": 1.2421052631578948,
"grad_norm": 3.2107352113754186,
"learning_rate": 7.475072076448298e-05,
"loss": 1.3227,
"step": 236
},
{
"epoch": 1.2473684210526317,
"grad_norm": 1.58975425995912,
"learning_rate": 7.467770144319283e-05,
"loss": 1.3333,
"step": 237
},
{
"epoch": 1.2526315789473683,
"grad_norm": 3.7587617169131082,
"learning_rate": 7.460421393708039e-05,
"loss": 1.3509,
"step": 238
},
{
"epoch": 1.2578947368421054,
"grad_norm": 2.5861078342959614,
"learning_rate": 7.453025923830296e-05,
"loss": 1.3361,
"step": 239
},
{
"epoch": 1.263157894736842,
"grad_norm": 3.398411462187095,
"learning_rate": 7.445583834532546e-05,
"loss": 1.3309,
"step": 240
},
{
"epoch": 1.268421052631579,
"grad_norm": 2.4423289107004664,
"learning_rate": 7.438095226290685e-05,
"loss": 1.337,
"step": 241
},
{
"epoch": 1.2736842105263158,
"grad_norm": 2.4746167081096324,
"learning_rate": 7.430560200208669e-05,
"loss": 1.3105,
"step": 242
},
{
"epoch": 1.2789473684210526,
"grad_norm": 2.7651610722472353,
"learning_rate": 7.42297885801714e-05,
"loss": 1.3243,
"step": 243
},
{
"epoch": 1.2842105263157895,
"grad_norm": 1.6511969920414749,
"learning_rate": 7.415351302072056e-05,
"loss": 1.3105,
"step": 244
},
{
"epoch": 1.2894736842105263,
"grad_norm": 3.104346761016083,
"learning_rate": 7.407677635353308e-05,
"loss": 1.3298,
"step": 245
},
{
"epoch": 1.2947368421052632,
"grad_norm": 2.3550214994148235,
"learning_rate": 7.399957961463332e-05,
"loss": 1.3649,
"step": 246
},
{
"epoch": 1.3,
"grad_norm": 2.266967394498034,
"learning_rate": 7.392192384625704e-05,
"loss": 1.3363,
"step": 247
},
{
"epoch": 1.305263157894737,
"grad_norm": 3.1011314193494104,
"learning_rate": 7.384381009683742e-05,
"loss": 1.3252,
"step": 248
},
{
"epoch": 1.3105263157894738,
"grad_norm": 2.133334459450928,
"learning_rate": 7.376523942099084e-05,
"loss": 1.3307,
"step": 249
},
{
"epoch": 1.3157894736842106,
"grad_norm": 4.326087201648726,
"learning_rate": 7.368621287950264e-05,
"loss": 1.4045,
"step": 250
},
{
"epoch": 1.3210526315789473,
"grad_norm": 22.674882226571523,
"learning_rate": 7.360673153931285e-05,
"loss": 1.3348,
"step": 251
},
{
"epoch": 1.3263157894736843,
"grad_norm": 3.064941459260646,
"learning_rate": 7.352679647350172e-05,
"loss": 1.3425,
"step": 252
},
{
"epoch": 1.331578947368421,
"grad_norm": 2.6376251183297055,
"learning_rate": 7.344640876127529e-05,
"loss": 1.3389,
"step": 253
},
{
"epoch": 1.3368421052631578,
"grad_norm": 3.518986507081023,
"learning_rate": 7.33655694879508e-05,
"loss": 1.325,
"step": 254
},
{
"epoch": 1.3421052631578947,
"grad_norm": 6.772119772438152,
"learning_rate": 7.328427974494201e-05,
"loss": 1.3435,
"step": 255
},
{
"epoch": 1.3473684210526315,
"grad_norm": 3.1439267189525966,
"learning_rate": 7.32025406297445e-05,
"loss": 1.3482,
"step": 256
},
{
"epoch": 1.3526315789473684,
"grad_norm": 2.9757174788747442,
"learning_rate": 7.312035324592081e-05,
"loss": 1.4253,
"step": 257
},
{
"epoch": 1.3578947368421053,
"grad_norm": 19.234550469937634,
"learning_rate": 7.303771870308561e-05,
"loss": 1.5748,
"step": 258
},
{
"epoch": 1.3631578947368421,
"grad_norm": 166.65509126340316,
"learning_rate": 7.295463811689069e-05,
"loss": 7.3386,
"step": 259
},
{
"epoch": 1.368421052631579,
"grad_norm": 37.296808447795385,
"learning_rate": 7.28711126090098e-05,
"loss": 7.6292,
"step": 260
},
{
"epoch": 1.3736842105263158,
"grad_norm": 206.77559967714524,
"learning_rate": 7.278714330712372e-05,
"loss": 5.9669,
"step": 261
},
{
"epoch": 1.3789473684210527,
"grad_norm": 31.10081965111831,
"learning_rate": 7.27027313449048e-05,
"loss": 1.9804,
"step": 262
},
{
"epoch": 1.3842105263157896,
"grad_norm": 273.4851676662734,
"learning_rate": 7.261787786200179e-05,
"loss": 4.0434,
"step": 263
},
{
"epoch": 1.3894736842105262,
"grad_norm": 21.94907796550795,
"learning_rate": 7.253258400402448e-05,
"loss": 2.3785,
"step": 264
},
{
"epoch": 1.3947368421052633,
"grad_norm": 87.34643583499242,
"learning_rate": 7.24468509225281e-05,
"loss": 3.2623,
"step": 265
},
{
"epoch": 1.4,
"grad_norm": 14.043694608373665,
"learning_rate": 7.236067977499791e-05,
"loss": 2.0359,
"step": 266
},
{
"epoch": 1.4052631578947368,
"grad_norm": 204.7560211103692,
"learning_rate": 7.227407172483348e-05,
"loss": 2.6066,
"step": 267
},
{
"epoch": 1.4105263157894736,
"grad_norm": 8.50429112485772,
"learning_rate": 7.218702794133304e-05,
"loss": 1.8554,
"step": 268
},
{
"epoch": 1.4157894736842105,
"grad_norm": 7.0947931831701805,
"learning_rate": 7.209954959967765e-05,
"loss": 1.7393,
"step": 269
},
{
"epoch": 1.4210526315789473,
"grad_norm": 3.181643344667253,
"learning_rate": 7.201163788091536e-05,
"loss": 1.5682,
"step": 270
},
{
"epoch": 1.4263157894736842,
"grad_norm": 2.0126327538765865,
"learning_rate": 7.192329397194529e-05,
"loss": 1.4786,
"step": 271
},
{
"epoch": 1.431578947368421,
"grad_norm": 2.7578093818109175,
"learning_rate": 7.183451906550155e-05,
"loss": 1.4642,
"step": 272
},
{
"epoch": 1.436842105263158,
"grad_norm": 1.9145504095924126,
"learning_rate": 7.174531436013712e-05,
"loss": 1.4291,
"step": 273
},
{
"epoch": 1.4421052631578948,
"grad_norm": 2.830916672053224,
"learning_rate": 7.165568106020779e-05,
"loss": 1.4538,
"step": 274
},
{
"epoch": 1.4473684210526316,
"grad_norm": 3.3750230003094464,
"learning_rate": 7.156562037585576e-05,
"loss": 1.4218,
"step": 275
},
{
"epoch": 1.4526315789473685,
"grad_norm": 1.886048219133163,
"learning_rate": 7.147513352299336e-05,
"loss": 1.4005,
"step": 276
},
{
"epoch": 1.4578947368421051,
"grad_norm": 4.612254413421206,
"learning_rate": 7.138422172328671e-05,
"loss": 1.4112,
"step": 277
},
{
"epoch": 1.4631578947368422,
"grad_norm": 3.475637100160211,
"learning_rate": 7.129288620413907e-05,
"loss": 1.388,
"step": 278
},
{
"epoch": 1.4684210526315788,
"grad_norm": 3.762823770210365,
"learning_rate": 7.120112819867437e-05,
"loss": 1.3941,
"step": 279
},
{
"epoch": 1.4736842105263157,
"grad_norm": 2.965363344704181,
"learning_rate": 7.110894894572056e-05,
"loss": 1.3815,
"step": 280
},
{
"epoch": 1.4789473684210526,
"grad_norm": 3.003148787485473,
"learning_rate": 7.101634968979287e-05,
"loss": 1.3805,
"step": 281
},
{
"epoch": 1.4842105263157894,
"grad_norm": 1.9778398666652557,
"learning_rate": 7.092333168107697e-05,
"loss": 1.3752,
"step": 282
},
{
"epoch": 1.4894736842105263,
"grad_norm": 3.5530639325816114,
"learning_rate": 7.082989617541217e-05,
"loss": 1.3919,
"step": 283
},
{
"epoch": 1.4947368421052631,
"grad_norm": 2.4964012979013144,
"learning_rate": 7.073604443427437e-05,
"loss": 1.3752,
"step": 284
},
{
"epoch": 1.5,
"grad_norm": 3.2586608061353224,
"learning_rate": 7.064177772475912e-05,
"loss": 1.3537,
"step": 285
},
{
"epoch": 1.5052631578947369,
"grad_norm": 2.7382891584432576,
"learning_rate": 7.054709731956449e-05,
"loss": 1.3548,
"step": 286
},
{
"epoch": 1.5105263157894737,
"grad_norm": 2.7983043107255714,
"learning_rate": 7.045200449697379e-05,
"loss": 1.355,
"step": 287
},
{
"epoch": 1.5157894736842106,
"grad_norm": 2.1575170098628207,
"learning_rate": 7.035650054083847e-05,
"loss": 1.3666,
"step": 288
},
{
"epoch": 1.5210526315789474,
"grad_norm": 2.280552356804481,
"learning_rate": 7.026058674056067e-05,
"loss": 1.3729,
"step": 289
},
{
"epoch": 1.526315789473684,
"grad_norm": 1.8204200442034197,
"learning_rate": 7.016426439107586e-05,
"loss": 1.3285,
"step": 290
},
{
"epoch": 1.5315789473684212,
"grad_norm": 2.2692718684429805,
"learning_rate": 7.006753479283535e-05,
"loss": 1.3432,
"step": 291
},
{
"epoch": 1.5368421052631578,
"grad_norm": 1.608298273784726,
"learning_rate": 6.99703992517887e-05,
"loss": 1.3457,
"step": 292
},
{
"epoch": 1.5421052631578949,
"grad_norm": 2.291066728931036,
"learning_rate": 6.987285907936617e-05,
"loss": 1.3489,
"step": 293
},
{
"epoch": 1.5473684210526315,
"grad_norm": 1.799873956016166,
"learning_rate": 6.977491559246091e-05,
"loss": 1.3538,
"step": 294
},
{
"epoch": 1.5526315789473686,
"grad_norm": 2.1836156231488144,
"learning_rate": 6.967657011341126e-05,
"loss": 1.3393,
"step": 295
},
{
"epoch": 1.5578947368421052,
"grad_norm": 1.656082168753184,
"learning_rate": 6.957782396998289e-05,
"loss": 1.3487,
"step": 296
},
{
"epoch": 1.563157894736842,
"grad_norm": 2.237518228348859,
"learning_rate": 6.94786784953508e-05,
"loss": 1.3431,
"step": 297
},
{
"epoch": 1.568421052631579,
"grad_norm": 1.8074440576933803,
"learning_rate": 6.937913502808142e-05,
"loss": 1.3338,
"step": 298
},
{
"epoch": 1.5736842105263158,
"grad_norm": 2.1852538797514134,
"learning_rate": 6.927919491211447e-05,
"loss": 1.3408,
"step": 299
},
{
"epoch": 1.5789473684210527,
"grad_norm": 1.686931533274294,
"learning_rate": 6.917885949674483e-05,
"loss": 1.337,
"step": 300
},
{
"epoch": 1.5842105263157895,
"grad_norm": 2.484073100527864,
"learning_rate": 6.907813013660437e-05,
"loss": 1.3315,
"step": 301
},
{
"epoch": 1.5894736842105264,
"grad_norm": 1.9730996981374016,
"learning_rate": 6.897700819164357e-05,
"loss": 1.3383,
"step": 302
},
{
"epoch": 1.594736842105263,
"grad_norm": 1.4953094506502813,
"learning_rate": 6.887549502711323e-05,
"loss": 1.3316,
"step": 303
},
{
"epoch": 1.6,
"grad_norm": 1.7333934091879961,
"learning_rate": 6.877359201354606e-05,
"loss": 1.3338,
"step": 304
},
{
"epoch": 1.6052631578947367,
"grad_norm": 1.701314271187227,
"learning_rate": 6.867130052673806e-05,
"loss": 1.3233,
"step": 305
},
{
"epoch": 1.6105263157894738,
"grad_norm": 2.5170048810500365,
"learning_rate": 6.856862194773008e-05,
"loss": 1.3418,
"step": 306
},
{
"epoch": 1.6157894736842104,
"grad_norm": 1.1423723633356422,
"learning_rate": 6.846555766278909e-05,
"loss": 1.3456,
"step": 307
},
{
"epoch": 1.6210526315789475,
"grad_norm": 2.1226546892123688,
"learning_rate": 6.83621090633895e-05,
"loss": 1.3199,
"step": 308
},
{
"epoch": 1.6263157894736842,
"grad_norm": 2.120207603951501,
"learning_rate": 6.825827754619434e-05,
"loss": 1.3252,
"step": 309
},
{
"epoch": 1.631578947368421,
"grad_norm": 1.3158750566710444,
"learning_rate": 6.815406451303647e-05,
"loss": 1.3213,
"step": 310
},
{
"epoch": 1.6368421052631579,
"grad_norm": 2.597320776495221,
"learning_rate": 6.804947137089955e-05,
"loss": 1.3112,
"step": 311
},
{
"epoch": 1.6421052631578947,
"grad_norm": 1.6685217693160599,
"learning_rate": 6.794449953189916e-05,
"loss": 1.3074,
"step": 312
},
{
"epoch": 1.6473684210526316,
"grad_norm": 2.5188932447525283,
"learning_rate": 6.783915041326364e-05,
"loss": 1.331,
"step": 313
},
{
"epoch": 1.6526315789473685,
"grad_norm": 2.071056305940592,
"learning_rate": 6.773342543731503e-05,
"loss": 1.3173,
"step": 314
},
{
"epoch": 1.6578947368421053,
"grad_norm": 2.427656153267591,
"learning_rate": 6.762732603144978e-05,
"loss": 1.3329,
"step": 315
},
{
"epoch": 1.663157894736842,
"grad_norm": 1.6471906450412725,
"learning_rate": 6.75208536281196e-05,
"loss": 1.311,
"step": 316
},
{
"epoch": 1.668421052631579,
"grad_norm": 2.3066742827555022,
"learning_rate": 6.7414009664812e-05,
"loss": 1.3349,
"step": 317
},
{
"epoch": 1.6736842105263157,
"grad_norm": 1.8458843126503317,
"learning_rate": 6.730679558403093e-05,
"loss": 1.3236,
"step": 318
},
{
"epoch": 1.6789473684210527,
"grad_norm": 2.1861813200392843,
"learning_rate": 6.719921283327736e-05,
"loss": 1.3268,
"step": 319
},
{
"epoch": 1.6842105263157894,
"grad_norm": 2.1870673388161124,
"learning_rate": 6.709126286502965e-05,
"loss": 1.3019,
"step": 320
},
{
"epoch": 1.6894736842105265,
"grad_norm": 1.4274808199921123,
"learning_rate": 6.698294713672395e-05,
"loss": 1.3255,
"step": 321
},
{
"epoch": 1.694736842105263,
"grad_norm": 1.5694017468203492,
"learning_rate": 6.687426711073462e-05,
"loss": 1.3048,
"step": 322
},
{
"epoch": 1.7,
"grad_norm": 1.1078521144544478,
"learning_rate": 6.676522425435433e-05,
"loss": 1.3087,
"step": 323
},
{
"epoch": 1.7052631578947368,
"grad_norm": 2.4538742138976386,
"learning_rate": 6.665582003977441e-05,
"loss": 1.3244,
"step": 324
},
{
"epoch": 1.7105263157894737,
"grad_norm": 1.7323261915367696,
"learning_rate": 6.654605594406486e-05,
"loss": 1.3093,
"step": 325
},
{
"epoch": 1.7157894736842105,
"grad_norm": 1.7174315153551183,
"learning_rate": 6.643593344915445e-05,
"loss": 1.3141,
"step": 326
},
{
"epoch": 1.7210526315789474,
"grad_norm": 1.4498322250506395,
"learning_rate": 6.632545404181074e-05,
"loss": 1.3251,
"step": 327
},
{
"epoch": 1.7263157894736842,
"grad_norm": 2.978144373846546,
"learning_rate": 6.62146192136199e-05,
"loss": 1.3117,
"step": 328
},
{
"epoch": 1.731578947368421,
"grad_norm": 1.8288925620523002,
"learning_rate": 6.610343046096674e-05,
"loss": 1.311,
"step": 329
},
{
"epoch": 1.736842105263158,
"grad_norm": 2.8409045314260255,
"learning_rate": 6.59918892850144e-05,
"loss": 1.3263,
"step": 330
},
{
"epoch": 1.7421052631578946,
"grad_norm": 1.9806940703831386,
"learning_rate": 6.587999719168401e-05,
"loss": 1.3179,
"step": 331
},
{
"epoch": 1.7473684210526317,
"grad_norm": 2.231645147378468,
"learning_rate": 6.576775569163458e-05,
"loss": 1.3216,
"step": 332
},
{
"epoch": 1.7526315789473683,
"grad_norm": 2.242436351469589,
"learning_rate": 6.565516630024236e-05,
"loss": 1.3263,
"step": 333
},
{
"epoch": 1.7578947368421054,
"grad_norm": 1.418447898215289,
"learning_rate": 6.554223053758055e-05,
"loss": 1.317,
"step": 334
},
{
"epoch": 1.763157894736842,
"grad_norm": 2.1049377231565036,
"learning_rate": 6.542894992839873e-05,
"loss": 1.3278,
"step": 335
},
{
"epoch": 1.768421052631579,
"grad_norm": 1.9649271286389844,
"learning_rate": 6.531532600210222e-05,
"loss": 1.3309,
"step": 336
},
{
"epoch": 1.7736842105263158,
"grad_norm": 1.3002559482544591,
"learning_rate": 6.520136029273151e-05,
"loss": 1.3003,
"step": 337
},
{
"epoch": 1.7789473684210526,
"grad_norm": 2.4684068562448136,
"learning_rate": 6.508705433894149e-05,
"loss": 1.32,
"step": 338
},
{
"epoch": 1.7842105263157895,
"grad_norm": 1.5743465960197915,
"learning_rate": 6.497240968398072e-05,
"loss": 1.3006,
"step": 339
},
{
"epoch": 1.7894736842105263,
"grad_norm": 2.59770911193071,
"learning_rate": 6.48574278756706e-05,
"loss": 1.3222,
"step": 340
},
{
"epoch": 1.7947368421052632,
"grad_norm": 1.7842505149097647,
"learning_rate": 6.474211046638438e-05,
"loss": 1.3161,
"step": 341
},
{
"epoch": 1.8,
"grad_norm": 2.7074992552378805,
"learning_rate": 6.462645901302633e-05,
"loss": 1.3281,
"step": 342
},
{
"epoch": 1.805263157894737,
"grad_norm": 1.8215475542278567,
"learning_rate": 6.451047507701065e-05,
"loss": 1.3282,
"step": 343
},
{
"epoch": 1.8105263157894735,
"grad_norm": 3.304881983512875,
"learning_rate": 6.439416022424036e-05,
"loss": 1.3391,
"step": 344
},
{
"epoch": 1.8157894736842106,
"grad_norm": 3.2476105816930954,
"learning_rate": 6.427751602508628e-05,
"loss": 1.3348,
"step": 345
},
{
"epoch": 1.8210526315789473,
"grad_norm": 1.5869549786160873,
"learning_rate": 6.416054405436564e-05,
"loss": 1.3201,
"step": 346
},
{
"epoch": 1.8263157894736843,
"grad_norm": 2.2683887128326723,
"learning_rate": 6.404324589132101e-05,
"loss": 1.3204,
"step": 347
},
{
"epoch": 1.831578947368421,
"grad_norm": 1.9620950054062172,
"learning_rate": 6.392562311959886e-05,
"loss": 1.3158,
"step": 348
},
{
"epoch": 1.836842105263158,
"grad_norm": 1.8047773439892525,
"learning_rate": 6.380767732722821e-05,
"loss": 1.3181,
"step": 349
},
{
"epoch": 1.8421052631578947,
"grad_norm": 1.911531036771628,
"learning_rate": 6.368941010659921e-05,
"loss": 1.3292,
"step": 350
},
{
"epoch": 1.8473684210526315,
"grad_norm": 1.636720765605733,
"learning_rate": 6.35708230544416e-05,
"loss": 1.3091,
"step": 351
},
{
"epoch": 1.8526315789473684,
"grad_norm": 1.424304904246396,
"learning_rate": 6.34519177718032e-05,
"loss": 1.3207,
"step": 352
},
{
"epoch": 1.8578947368421053,
"grad_norm": 1.3525184453889394,
"learning_rate": 6.333269586402827e-05,
"loss": 1.3125,
"step": 353
},
{
"epoch": 1.8631578947368421,
"grad_norm": 1.7281390731184902,
"learning_rate": 6.321315894073581e-05,
"loss": 1.3231,
"step": 354
},
{
"epoch": 1.868421052631579,
"grad_norm": 1.1089366431889842,
"learning_rate": 6.309330861579786e-05,
"loss": 1.3238,
"step": 355
},
{
"epoch": 1.8736842105263158,
"grad_norm": 1.876591117688095,
"learning_rate": 6.297314650731775e-05,
"loss": 1.3118,
"step": 356
},
{
"epoch": 1.8789473684210525,
"grad_norm": 1.61640922760774,
"learning_rate": 6.285267423760817e-05,
"loss": 1.3263,
"step": 357
},
{
"epoch": 1.8842105263157896,
"grad_norm": 1.4451990798983758,
"learning_rate": 6.273189343316929e-05,
"loss": 1.325,
"step": 358
},
{
"epoch": 1.8894736842105262,
"grad_norm": 1.3409307869705591,
"learning_rate": 6.261080572466688e-05,
"loss": 1.3057,
"step": 359
},
{
"epoch": 1.8947368421052633,
"grad_norm": 1.6052273256057499,
"learning_rate": 6.248941274691017e-05,
"loss": 1.3252,
"step": 360
},
{
"epoch": 1.9,
"grad_norm": 2.366978019871103,
"learning_rate": 6.236771613882987e-05,
"loss": 1.3179,
"step": 361
},
{
"epoch": 1.905263157894737,
"grad_norm": 1.1868922713453152,
"learning_rate": 6.224571754345602e-05,
"loss": 1.3082,
"step": 362
},
{
"epoch": 1.9105263157894736,
"grad_norm": 2.2556197419222612,
"learning_rate": 6.21234186078958e-05,
"loss": 1.3115,
"step": 363
},
{
"epoch": 1.9157894736842105,
"grad_norm": 1.7410078285379156,
"learning_rate": 6.200082098331126e-05,
"loss": 1.3281,
"step": 364
},
{
"epoch": 1.9210526315789473,
"grad_norm": 1.7950505417159182,
"learning_rate": 6.18779263248971e-05,
"loss": 1.3162,
"step": 365
},
{
"epoch": 1.9263157894736842,
"grad_norm": 1.8544654429983962,
"learning_rate": 6.175473629185822e-05,
"loss": 1.3205,
"step": 366
},
{
"epoch": 1.931578947368421,
"grad_norm": 1.7372962100479836,
"learning_rate": 6.163125254738751e-05,
"loss": 1.3065,
"step": 367
},
{
"epoch": 1.936842105263158,
"grad_norm": 2.242141298655648,
"learning_rate": 6.150747675864314e-05,
"loss": 1.2985,
"step": 368
},
{
"epoch": 1.9421052631578948,
"grad_norm": 1.481088212600443,
"learning_rate": 6.138341059672622e-05,
"loss": 1.3136,
"step": 369
},
{
"epoch": 1.9473684210526314,
"grad_norm": 2.1356181680202253,
"learning_rate": 6.125905573665824e-05,
"loss": 1.3282,
"step": 370
},
{
"epoch": 1.9526315789473685,
"grad_norm": 1.6259642009051207,
"learning_rate": 6.113441385735836e-05,
"loss": 1.3131,
"step": 371
},
{
"epoch": 1.9578947368421051,
"grad_norm": 1.973896595209029,
"learning_rate": 6.100948664162081e-05,
"loss": 1.3182,
"step": 372
},
{
"epoch": 1.9631578947368422,
"grad_norm": 1.616074252415091,
"learning_rate": 6.088427577609219e-05,
"loss": 1.3037,
"step": 373
},
{
"epoch": 1.9684210526315788,
"grad_norm": 1.777657189903051,
"learning_rate": 6.075878295124861e-05,
"loss": 1.3096,
"step": 374
},
{
"epoch": 1.973684210526316,
"grad_norm": 1.5325372367258376,
"learning_rate": 6.063300986137297e-05,
"loss": 1.3092,
"step": 375
},
{
"epoch": 1.9789473684210526,
"grad_norm": 1.7425893453777117,
"learning_rate": 6.0506958204531996e-05,
"loss": 1.3094,
"step": 376
},
{
"epoch": 1.9842105263157894,
"grad_norm": 1.2678177296707205,
"learning_rate": 6.0380629682553395e-05,
"loss": 1.2995,
"step": 377
},
{
"epoch": 1.9894736842105263,
"grad_norm": 2.083473793091378,
"learning_rate": 6.025402600100283e-05,
"loss": 1.3133,
"step": 378
},
{
"epoch": 1.9947368421052631,
"grad_norm": 1.5761354608098717,
"learning_rate": 6.012714886916088e-05,
"loss": 1.3232,
"step": 379
},
{
"epoch": 2.0,
"grad_norm": 1.6759654932294628,
"learning_rate": 6.000000000000001e-05,
"loss": 1.295,
"step": 380
},
{
"epoch": 2.0052631578947366,
"grad_norm": 1.6566803747791274,
"learning_rate": 5.987258111016139e-05,
"loss": 1.269,
"step": 381
},
{
"epoch": 2.0105263157894737,
"grad_norm": 1.7773404022104615,
"learning_rate": 5.974489391993182e-05,
"loss": 1.2756,
"step": 382
},
{
"epoch": 2.0157894736842104,
"grad_norm": 1.7734820944361986,
"learning_rate": 5.9616940153220336e-05,
"loss": 1.3024,
"step": 383
},
{
"epoch": 2.0210526315789474,
"grad_norm": 1.3832861885005663,
"learning_rate": 5.948872153753509e-05,
"loss": 1.292,
"step": 384
},
{
"epoch": 2.026315789473684,
"grad_norm": 1.9267936406726134,
"learning_rate": 5.936023980395997e-05,
"loss": 1.2974,
"step": 385
},
{
"epoch": 2.031578947368421,
"grad_norm": 1.1634482135657338,
"learning_rate": 5.923149668713118e-05,
"loss": 1.2864,
"step": 386
},
{
"epoch": 2.036842105263158,
"grad_norm": 1.4709242560357587,
"learning_rate": 5.9102493925213946e-05,
"loss": 1.2719,
"step": 387
},
{
"epoch": 2.042105263157895,
"grad_norm": 1.3716802116661737,
"learning_rate": 5.8973233259878914e-05,
"loss": 1.2688,
"step": 388
},
{
"epoch": 2.0473684210526315,
"grad_norm": 1.8540565451199285,
"learning_rate": 5.8843716436278696e-05,
"loss": 1.292,
"step": 389
},
{
"epoch": 2.0526315789473686,
"grad_norm": 1.6569753347566705,
"learning_rate": 5.871394520302432e-05,
"loss": 1.2923,
"step": 390
},
{
"epoch": 2.057894736842105,
"grad_norm": 1.3344056097550805,
"learning_rate": 5.85839213121616e-05,
"loss": 1.2783,
"step": 391
},
{
"epoch": 2.0631578947368423,
"grad_norm": 1.0756154226095422,
"learning_rate": 5.845364651914752e-05,
"loss": 1.2823,
"step": 392
},
{
"epoch": 2.068421052631579,
"grad_norm": 1.5938788684018976,
"learning_rate": 5.832312258282645e-05,
"loss": 1.2872,
"step": 393
},
{
"epoch": 2.0736842105263156,
"grad_norm": 1.9960858248177353,
"learning_rate": 5.8192351265406466e-05,
"loss": 1.2819,
"step": 394
},
{
"epoch": 2.0789473684210527,
"grad_norm": 1.500794950504469,
"learning_rate": 5.806133433243558e-05,
"loss": 1.3018,
"step": 395
},
{
"epoch": 2.0842105263157893,
"grad_norm": 1.9776756621227738,
"learning_rate": 5.793007355277783e-05,
"loss": 1.2947,
"step": 396
},
{
"epoch": 2.0894736842105264,
"grad_norm": 1.5136499333830533,
"learning_rate": 5.7798570698589465e-05,
"loss": 1.2847,
"step": 397
},
{
"epoch": 2.094736842105263,
"grad_norm": 2.7888675821510467,
"learning_rate": 5.7666827545294965e-05,
"loss": 1.2803,
"step": 398
},
{
"epoch": 2.1,
"grad_norm": 1.997214443213332,
"learning_rate": 5.75348458715631e-05,
"loss": 1.2889,
"step": 399
},
{
"epoch": 2.1052631578947367,
"grad_norm": 2.636705031350177,
"learning_rate": 5.740262745928293e-05,
"loss": 1.2964,
"step": 400
},
{
"epoch": 2.110526315789474,
"grad_norm": 2.1100662062402775,
"learning_rate": 5.727017409353971e-05,
"loss": 1.2878,
"step": 401
},
{
"epoch": 2.1157894736842104,
"grad_norm": 1.9907403756995032,
"learning_rate": 5.713748756259085e-05,
"loss": 1.2942,
"step": 402
},
{
"epoch": 2.1210526315789475,
"grad_norm": 2.0618250265894433,
"learning_rate": 5.700456965784167e-05,
"loss": 1.2857,
"step": 403
},
{
"epoch": 2.126315789473684,
"grad_norm": 0.9319289242731835,
"learning_rate": 5.687142217382129e-05,
"loss": 1.2708,
"step": 404
},
{
"epoch": 2.1315789473684212,
"grad_norm": 2.992807432876805,
"learning_rate": 5.673804690815845e-05,
"loss": 1.309,
"step": 405
},
{
"epoch": 2.136842105263158,
"grad_norm": 1.864224068302743,
"learning_rate": 5.660444566155709e-05,
"loss": 1.2854,
"step": 406
},
{
"epoch": 2.1421052631578945,
"grad_norm": 3.4214971672572596,
"learning_rate": 5.647062023777221e-05,
"loss": 1.2927,
"step": 407
},
{
"epoch": 2.1473684210526316,
"grad_norm": 2.508109225516717,
"learning_rate": 5.633657244358535e-05,
"loss": 1.2829,
"step": 408
},
{
"epoch": 2.1526315789473682,
"grad_norm": 3.929215425642367,
"learning_rate": 5.6202304088780335e-05,
"loss": 1.2946,
"step": 409
},
{
"epoch": 2.1578947368421053,
"grad_norm": 3.798085324745515,
"learning_rate": 5.606781698611879e-05,
"loss": 1.3013,
"step": 410
},
{
"epoch": 2.163157894736842,
"grad_norm": 2.0206510011506222,
"learning_rate": 5.593311295131562e-05,
"loss": 1.2917,
"step": 411
},
{
"epoch": 2.168421052631579,
"grad_norm": 3.092904855930784,
"learning_rate": 5.579819380301458e-05,
"loss": 1.2795,
"step": 412
},
{
"epoch": 2.1736842105263157,
"grad_norm": 2.2386119816911623,
"learning_rate": 5.5663061362763665e-05,
"loss": 1.2964,
"step": 413
},
{
"epoch": 2.1789473684210527,
"grad_norm": 2.8845075274191223,
"learning_rate": 5.552771745499051e-05,
"loss": 1.29,
"step": 414
},
{
"epoch": 2.1842105263157894,
"grad_norm": 2.355680412049654,
"learning_rate": 5.5392163906977835e-05,
"loss": 1.2802,
"step": 415
},
{
"epoch": 2.1894736842105265,
"grad_norm": 2.577365756174829,
"learning_rate": 5.525640254883865e-05,
"loss": 1.2894,
"step": 416
},
{
"epoch": 2.194736842105263,
"grad_norm": 2.0046660703267576,
"learning_rate": 5.512043521349166e-05,
"loss": 1.2873,
"step": 417
},
{
"epoch": 2.2,
"grad_norm": 2.576417069207704,
"learning_rate": 5.4984263736636494e-05,
"loss": 1.2759,
"step": 418
},
{
"epoch": 2.205263157894737,
"grad_norm": 1.9511016222282593,
"learning_rate": 5.4847889956728834e-05,
"loss": 1.298,
"step": 419
},
{
"epoch": 2.2105263157894735,
"grad_norm": 2.6670388356828285,
"learning_rate": 5.471131571495574e-05,
"loss": 1.2951,
"step": 420
},
{
"epoch": 2.2157894736842105,
"grad_norm": 2.126207826369636,
"learning_rate": 5.457454285521064e-05,
"loss": 1.2812,
"step": 421
},
{
"epoch": 2.221052631578947,
"grad_norm": 2.400820316806507,
"learning_rate": 5.4437573224068595e-05,
"loss": 1.2948,
"step": 422
},
{
"epoch": 2.2263157894736842,
"grad_norm": 1.7795504525185426,
"learning_rate": 5.4300408670761204e-05,
"loss": 1.2959,
"step": 423
},
{
"epoch": 2.231578947368421,
"grad_norm": 2.8829533810849663,
"learning_rate": 5.416305104715175e-05,
"loss": 1.3074,
"step": 424
},
{
"epoch": 2.236842105263158,
"grad_norm": 2.159110693359624,
"learning_rate": 5.4025502207710184e-05,
"loss": 1.2797,
"step": 425
},
{
"epoch": 2.2421052631578946,
"grad_norm": 3.0783424431722666,
"learning_rate": 5.388776400948803e-05,
"loss": 1.2864,
"step": 426
},
{
"epoch": 2.2473684210526317,
"grad_norm": 2.640451536165918,
"learning_rate": 5.3749838312093364e-05,
"loss": 1.2987,
"step": 427
},
{
"epoch": 2.2526315789473683,
"grad_norm": 2.413356511304884,
"learning_rate": 5.361172697766573e-05,
"loss": 1.2775,
"step": 428
},
{
"epoch": 2.2578947368421054,
"grad_norm": 2.334518355071201,
"learning_rate": 5.3473431870850904e-05,
"loss": 1.275,
"step": 429
},
{
"epoch": 2.263157894736842,
"grad_norm": 2.4122426514527984,
"learning_rate": 5.333495485877583e-05,
"loss": 1.2961,
"step": 430
},
{
"epoch": 2.268421052631579,
"grad_norm": 2.2362176845592208,
"learning_rate": 5.3196297811023316e-05,
"loss": 1.2937,
"step": 431
},
{
"epoch": 2.2736842105263158,
"grad_norm": 2.293429615790018,
"learning_rate": 5.305746259960689e-05,
"loss": 1.2852,
"step": 432
},
{
"epoch": 2.2789473684210524,
"grad_norm": 1.786254087945556,
"learning_rate": 5.291845109894544e-05,
"loss": 1.2799,
"step": 433
},
{
"epoch": 2.2842105263157895,
"grad_norm": 2.6945565744818407,
"learning_rate": 5.277926518583793e-05,
"loss": 1.2921,
"step": 434
},
{
"epoch": 2.2894736842105265,
"grad_norm": 2.2194659113523962,
"learning_rate": 5.263990673943811e-05,
"loss": 1.3046,
"step": 435
},
{
"epoch": 2.294736842105263,
"grad_norm": 2.547332541805207,
"learning_rate": 5.250037764122907e-05,
"loss": 1.2842,
"step": 436
},
{
"epoch": 2.3,
"grad_norm": 2.2417529148390325,
"learning_rate": 5.23606797749979e-05,
"loss": 1.2737,
"step": 437
},
{
"epoch": 2.305263157894737,
"grad_norm": 2.297738957213229,
"learning_rate": 5.2220815026810234e-05,
"loss": 1.2964,
"step": 438
},
{
"epoch": 2.3105263157894735,
"grad_norm": 2.0161964482231416,
"learning_rate": 5.208078528498476e-05,
"loss": 1.2734,
"step": 439
},
{
"epoch": 2.3157894736842106,
"grad_norm": 2.5589773064425896,
"learning_rate": 5.194059244006779e-05,
"loss": 1.3239,
"step": 440
},
{
"epoch": 2.3210526315789473,
"grad_norm": 2.5534996515172246,
"learning_rate": 5.180023838480765e-05,
"loss": 1.2839,
"step": 441
},
{
"epoch": 2.3263157894736843,
"grad_norm": 1.6213064440253335,
"learning_rate": 5.165972501412921e-05,
"loss": 1.2804,
"step": 442
},
{
"epoch": 2.331578947368421,
"grad_norm": 1.3934815609685396,
"learning_rate": 5.151905422510825e-05,
"loss": 1.2733,
"step": 443
},
{
"epoch": 2.336842105263158,
"grad_norm": 2.6675021957708447,
"learning_rate": 5.137822791694585e-05,
"loss": 1.2847,
"step": 444
},
{
"epoch": 2.3421052631578947,
"grad_norm": 2.0956189405862027,
"learning_rate": 5.123724799094279e-05,
"loss": 1.2705,
"step": 445
},
{
"epoch": 2.3473684210526318,
"grad_norm": 2.419629528561346,
"learning_rate": 5.109611635047379e-05,
"loss": 1.2879,
"step": 446
},
{
"epoch": 2.3526315789473684,
"grad_norm": 2.4104356876557755,
"learning_rate": 5.095483490096194e-05,
"loss": 1.2935,
"step": 447
},
{
"epoch": 2.3578947368421055,
"grad_norm": 1.8777710470149278,
"learning_rate": 5.081340554985287e-05,
"loss": 1.2775,
"step": 448
},
{
"epoch": 2.363157894736842,
"grad_norm": 1.487851978844643,
"learning_rate": 5.067183020658905e-05,
"loss": 1.2761,
"step": 449
},
{
"epoch": 2.3684210526315788,
"grad_norm": 2.4549351462483586,
"learning_rate": 5.053011078258397e-05,
"loss": 1.2692,
"step": 450
},
{
"epoch": 2.373684210526316,
"grad_norm": 1.8656568337670418,
"learning_rate": 5.03882491911964e-05,
"loss": 1.2911,
"step": 451
},
{
"epoch": 2.3789473684210525,
"grad_norm": 2.7641450741813265,
"learning_rate": 5.024624734770446e-05,
"loss": 1.2735,
"step": 452
},
{
"epoch": 2.3842105263157896,
"grad_norm": 2.722572050222999,
"learning_rate": 5.010410716927988e-05,
"loss": 1.2737,
"step": 453
},
{
"epoch": 2.389473684210526,
"grad_norm": 1.428940654329024,
"learning_rate": 4.9961830574962e-05,
"loss": 1.2888,
"step": 454
},
{
"epoch": 2.3947368421052633,
"grad_norm": 1.5780962302368826,
"learning_rate": 4.981941948563197e-05,
"loss": 1.2812,
"step": 455
},
{
"epoch": 2.4,
"grad_norm": 2.0344826746474283,
"learning_rate": 4.967687582398671e-05,
"loss": 1.2864,
"step": 456
},
{
"epoch": 2.405263157894737,
"grad_norm": 1.4319688356833826,
"learning_rate": 4.953420151451304e-05,
"loss": 1.2834,
"step": 457
},
{
"epoch": 2.4105263157894736,
"grad_norm": 2.601310717014097,
"learning_rate": 4.939139848346164e-05,
"loss": 1.2823,
"step": 458
},
{
"epoch": 2.4157894736842107,
"grad_norm": 2.348235653569354,
"learning_rate": 4.924846865882107e-05,
"loss": 1.2846,
"step": 459
},
{
"epoch": 2.4210526315789473,
"grad_norm": 1.726129892949758,
"learning_rate": 4.9105413970291747e-05,
"loss": 1.3011,
"step": 460
},
{
"epoch": 2.4263157894736844,
"grad_norm": 1.7111299716712474,
"learning_rate": 4.896223634925984e-05,
"loss": 1.3116,
"step": 461
},
{
"epoch": 2.431578947368421,
"grad_norm": 1.803266671516624,
"learning_rate": 4.8818937728771294e-05,
"loss": 1.272,
"step": 462
},
{
"epoch": 2.4368421052631577,
"grad_norm": 1.3759594624212466,
"learning_rate": 4.867552004350564e-05,
"loss": 1.289,
"step": 463
},
{
"epoch": 2.442105263157895,
"grad_norm": 2.3296183382561253,
"learning_rate": 4.853198522974988e-05,
"loss": 1.2911,
"step": 464
},
{
"epoch": 2.4473684210526314,
"grad_norm": 1.9742731162641471,
"learning_rate": 4.8388335225372416e-05,
"loss": 1.2656,
"step": 465
},
{
"epoch": 2.4526315789473685,
"grad_norm": 1.6842871202377092,
"learning_rate": 4.8244571969796817e-05,
"loss": 1.2891,
"step": 466
},
{
"epoch": 2.457894736842105,
"grad_norm": 1.5497120892994825,
"learning_rate": 4.810069740397569e-05,
"loss": 1.2844,
"step": 467
},
{
"epoch": 2.463157894736842,
"grad_norm": 1.824870478700358,
"learning_rate": 4.795671347036439e-05,
"loss": 1.2902,
"step": 468
},
{
"epoch": 2.468421052631579,
"grad_norm": 1.3990180325007069,
"learning_rate": 4.781262211289491e-05,
"loss": 1.281,
"step": 469
},
{
"epoch": 2.473684210526316,
"grad_norm": 2.320520504849803,
"learning_rate": 4.7668425276949546e-05,
"loss": 1.2838,
"step": 470
},
{
"epoch": 2.4789473684210526,
"grad_norm": 2.1234854667075758,
"learning_rate": 4.7524124909334653e-05,
"loss": 1.2797,
"step": 471
},
{
"epoch": 2.4842105263157896,
"grad_norm": 1.422390194322296,
"learning_rate": 4.7379722958254394e-05,
"loss": 1.2896,
"step": 472
},
{
"epoch": 2.4894736842105263,
"grad_norm": 1.3196019422752756,
"learning_rate": 4.7235221373284407e-05,
"loss": 1.2744,
"step": 473
},
{
"epoch": 2.4947368421052634,
"grad_norm": 1.7461193994213873,
"learning_rate": 4.709062210534547e-05,
"loss": 1.2887,
"step": 474
},
{
"epoch": 2.5,
"grad_norm": 1.3331786938242027,
"learning_rate": 4.694592710667723e-05,
"loss": 1.281,
"step": 475
},
{
"epoch": 2.5052631578947366,
"grad_norm": 2.0104890922006464,
"learning_rate": 4.680113833081173e-05,
"loss": 1.2786,
"step": 476
},
{
"epoch": 2.5105263157894737,
"grad_norm": 1.92734853764442,
"learning_rate": 4.665625773254716e-05,
"loss": 1.2844,
"step": 477
},
{
"epoch": 2.515789473684211,
"grad_norm": 1.3368646117339327,
"learning_rate": 4.6511287267921394e-05,
"loss": 1.2944,
"step": 478
},
{
"epoch": 2.5210526315789474,
"grad_norm": 1.158860061239002,
"learning_rate": 4.636622889418558e-05,
"loss": 1.2728,
"step": 479
},
{
"epoch": 2.526315789473684,
"grad_norm": 1.7510317771973813,
"learning_rate": 4.622108456977773e-05,
"loss": 1.2752,
"step": 480
},
{
"epoch": 2.531578947368421,
"grad_norm": 1.3943091910623553,
"learning_rate": 4.60758562542963e-05,
"loss": 1.3005,
"step": 481
},
{
"epoch": 2.536842105263158,
"grad_norm": 1.8549674666458555,
"learning_rate": 4.593054590847368e-05,
"loss": 1.281,
"step": 482
},
{
"epoch": 2.542105263157895,
"grad_norm": 1.7188026241177852,
"learning_rate": 4.57851554941498e-05,
"loss": 1.3061,
"step": 483
},
{
"epoch": 2.5473684210526315,
"grad_norm": 1.2685249463251793,
"learning_rate": 4.563968697424553e-05,
"loss": 1.2822,
"step": 484
},
{
"epoch": 2.5526315789473686,
"grad_norm": 1.549673702485011,
"learning_rate": 4.549414231273633e-05,
"loss": 1.2958,
"step": 485
},
{
"epoch": 2.557894736842105,
"grad_norm": 1.0822272735204688,
"learning_rate": 4.534852347462559e-05,
"loss": 1.2829,
"step": 486
},
{
"epoch": 2.5631578947368423,
"grad_norm": 0.9555789360037702,
"learning_rate": 4.5202832425918166e-05,
"loss": 1.3051,
"step": 487
},
{
"epoch": 2.568421052631579,
"grad_norm": 1.067488479666183,
"learning_rate": 4.5057071133593853e-05,
"loss": 1.275,
"step": 488
},
{
"epoch": 2.5736842105263156,
"grad_norm": 1.3214859289777758,
"learning_rate": 4.4911241565580796e-05,
"loss": 1.2887,
"step": 489
},
{
"epoch": 2.5789473684210527,
"grad_norm": 1.309909912707582,
"learning_rate": 4.476534569072895e-05,
"loss": 1.2933,
"step": 490
},
{
"epoch": 2.5842105263157897,
"grad_norm": 1.103182457932209,
"learning_rate": 4.4619385478783456e-05,
"loss": 1.2785,
"step": 491
},
{
"epoch": 2.5894736842105264,
"grad_norm": 1.050641547852913,
"learning_rate": 4.4473362900358065e-05,
"loss": 1.2877,
"step": 492
},
{
"epoch": 2.594736842105263,
"grad_norm": 1.1472680394377797,
"learning_rate": 4.432727992690857e-05,
"loss": 1.285,
"step": 493
},
{
"epoch": 2.6,
"grad_norm": 1.362629738278887,
"learning_rate": 4.418113853070614e-05,
"loss": 1.2774,
"step": 494
},
{
"epoch": 2.6052631578947367,
"grad_norm": 0.9212638160971107,
"learning_rate": 4.403494068481074e-05,
"loss": 1.2956,
"step": 495
},
{
"epoch": 2.610526315789474,
"grad_norm": 1.3060601473810125,
"learning_rate": 4.388868836304442e-05,
"loss": 1.2864,
"step": 496
},
{
"epoch": 2.6157894736842104,
"grad_norm": 0.9964781716303204,
"learning_rate": 4.374238353996472e-05,
"loss": 1.2846,
"step": 497
},
{
"epoch": 2.6210526315789475,
"grad_norm": 1.0334496502405375,
"learning_rate": 4.3596028190838045e-05,
"loss": 1.2751,
"step": 498
},
{
"epoch": 2.626315789473684,
"grad_norm": 0.8374244946961393,
"learning_rate": 4.3449624291612895e-05,
"loss": 1.2846,
"step": 499
},
{
"epoch": 2.6315789473684212,
"grad_norm": 1.271324423070232,
"learning_rate": 4.33031738188933e-05,
"loss": 1.2893,
"step": 500
},
{
"epoch": 2.636842105263158,
"grad_norm": 1.0290842796861808,
"learning_rate": 4.315667874991205e-05,
"loss": 1.2769,
"step": 501
},
{
"epoch": 2.6421052631578945,
"grad_norm": 1.6287387587682205,
"learning_rate": 4.3010141062504e-05,
"loss": 1.2808,
"step": 502
},
{
"epoch": 2.6473684210526316,
"grad_norm": 1.05633292251089,
"learning_rate": 4.286356273507949e-05,
"loss": 1.2752,
"step": 503
},
{
"epoch": 2.6526315789473687,
"grad_norm": 1.2639851855275497,
"learning_rate": 4.271694574659744e-05,
"loss": 1.2673,
"step": 504
},
{
"epoch": 2.6578947368421053,
"grad_norm": 1.3942797511952854,
"learning_rate": 4.257029207653881e-05,
"loss": 1.2725,
"step": 505
},
{
"epoch": 2.663157894736842,
"grad_norm": 0.9618496184388876,
"learning_rate": 4.242360370487976e-05,
"loss": 1.2747,
"step": 506
},
{
"epoch": 2.668421052631579,
"grad_norm": 1.141131756163478,
"learning_rate": 4.2276882612064936e-05,
"loss": 1.3005,
"step": 507
},
{
"epoch": 2.6736842105263157,
"grad_norm": 0.9471533928078693,
"learning_rate": 4.213013077898084e-05,
"loss": 1.2726,
"step": 508
},
{
"epoch": 2.6789473684210527,
"grad_norm": 0.9237490499511763,
"learning_rate": 4.1983350186928894e-05,
"loss": 1.2801,
"step": 509
},
{
"epoch": 2.6842105263157894,
"grad_norm": 1.1110718726310678,
"learning_rate": 4.183654281759888e-05,
"loss": 1.2674,
"step": 510
},
{
"epoch": 2.6894736842105265,
"grad_norm": 1.2581508468500637,
"learning_rate": 4.168971065304205e-05,
"loss": 1.2809,
"step": 511
},
{
"epoch": 2.694736842105263,
"grad_norm": 0.9468779696627782,
"learning_rate": 4.154285567564442e-05,
"loss": 1.2796,
"step": 512
},
{
"epoch": 2.7,
"grad_norm": 1.3134333293058689,
"learning_rate": 4.139597986810005e-05,
"loss": 1.2698,
"step": 513
},
{
"epoch": 2.705263157894737,
"grad_norm": 0.9243484356050305,
"learning_rate": 4.124908521338416e-05,
"loss": 1.2745,
"step": 514
},
{
"epoch": 2.7105263157894735,
"grad_norm": 0.9728548533723144,
"learning_rate": 4.110217369472649e-05,
"loss": 1.2925,
"step": 515
},
{
"epoch": 2.7157894736842105,
"grad_norm": 0.7367530477112044,
"learning_rate": 4.095524729558441e-05,
"loss": 1.2677,
"step": 516
},
{
"epoch": 2.7210526315789476,
"grad_norm": 0.9279276228473495,
"learning_rate": 4.080830799961622e-05,
"loss": 1.2802,
"step": 517
},
{
"epoch": 2.7263157894736842,
"grad_norm": 1.3931410811444014,
"learning_rate": 4.0661357790654345e-05,
"loss": 1.262,
"step": 518
},
{
"epoch": 2.731578947368421,
"grad_norm": 0.9883528178224094,
"learning_rate": 4.0514398652678514e-05,
"loss": 1.2964,
"step": 519
},
{
"epoch": 2.736842105263158,
"grad_norm": 1.3163227077320665,
"learning_rate": 4.0367432569789065e-05,
"loss": 1.2805,
"step": 520
},
{
"epoch": 2.7421052631578946,
"grad_norm": 0.7397578201219466,
"learning_rate": 4.0220461526180023e-05,
"loss": 1.2773,
"step": 521
},
{
"epoch": 2.7473684210526317,
"grad_norm": 0.9990315603511356,
"learning_rate": 4.007348750611245e-05,
"loss": 1.292,
"step": 522
},
{
"epoch": 2.7526315789473683,
"grad_norm": 1.1681928253101022,
"learning_rate": 3.9926512493887555e-05,
"loss": 1.2893,
"step": 523
},
{
"epoch": 2.7578947368421054,
"grad_norm": 1.1014263843169803,
"learning_rate": 3.977953847381998e-05,
"loss": 1.2715,
"step": 524
},
{
"epoch": 2.763157894736842,
"grad_norm": 1.1159951506001466,
"learning_rate": 3.963256743021095e-05,
"loss": 1.2785,
"step": 525
},
{
"epoch": 2.768421052631579,
"grad_norm": 1.2507849560008404,
"learning_rate": 3.9485601347321486e-05,
"loss": 1.2906,
"step": 526
},
{
"epoch": 2.7736842105263158,
"grad_norm": 0.8936481542314029,
"learning_rate": 3.933864220934566e-05,
"loss": 1.2669,
"step": 527
},
{
"epoch": 2.7789473684210524,
"grad_norm": 0.9849739951015418,
"learning_rate": 3.919169200038379e-05,
"loss": 1.2771,
"step": 528
},
{
"epoch": 2.7842105263157895,
"grad_norm": 0.837609493415301,
"learning_rate": 3.904475270441561e-05,
"loss": 1.266,
"step": 529
},
{
"epoch": 2.7894736842105265,
"grad_norm": 0.7219038051900546,
"learning_rate": 3.889782630527353e-05,
"loss": 1.2726,
"step": 530
},
{
"epoch": 2.794736842105263,
"grad_norm": 0.7505423536800923,
"learning_rate": 3.875091478661585e-05,
"loss": 1.2703,
"step": 531
},
{
"epoch": 2.8,
"grad_norm": 0.7301064067667716,
"learning_rate": 3.860402013189998e-05,
"loss": 1.2812,
"step": 532
},
{
"epoch": 2.805263157894737,
"grad_norm": 0.822332511103024,
"learning_rate": 3.845714432435558e-05,
"loss": 1.2718,
"step": 533
},
{
"epoch": 2.8105263157894735,
"grad_norm": 0.7783007350783687,
"learning_rate": 3.8310289346957965e-05,
"loss": 1.2574,
"step": 534
},
{
"epoch": 2.8157894736842106,
"grad_norm": 0.957233585531184,
"learning_rate": 3.816345718240113e-05,
"loss": 1.2805,
"step": 535
},
{
"epoch": 2.8210526315789473,
"grad_norm": 1.1226403614971794,
"learning_rate": 3.8016649813071106e-05,
"loss": 1.2983,
"step": 536
},
{
"epoch": 2.8263157894736843,
"grad_norm": 1.306496898793406,
"learning_rate": 3.7869869221019177e-05,
"loss": 1.2727,
"step": 537
},
{
"epoch": 2.831578947368421,
"grad_norm": 0.5971018924485769,
"learning_rate": 3.772311738793507e-05,
"loss": 1.2834,
"step": 538
},
{
"epoch": 2.836842105263158,
"grad_norm": 0.9138436747802899,
"learning_rate": 3.757639629512026e-05,
"loss": 1.2871,
"step": 539
},
{
"epoch": 2.8421052631578947,
"grad_norm": 1.2871243761882003,
"learning_rate": 3.74297079234612e-05,
"loss": 1.2797,
"step": 540
},
{
"epoch": 2.8473684210526313,
"grad_norm": 0.951777404285425,
"learning_rate": 3.7283054253402574e-05,
"loss": 1.2754,
"step": 541
},
{
"epoch": 2.8526315789473684,
"grad_norm": 0.9411344162055937,
"learning_rate": 3.713643726492053e-05,
"loss": 1.2721,
"step": 542
},
{
"epoch": 2.8578947368421055,
"grad_norm": 1.317846119765965,
"learning_rate": 3.698985893749599e-05,
"loss": 1.2887,
"step": 543
},
{
"epoch": 2.863157894736842,
"grad_norm": 0.8089940300375972,
"learning_rate": 3.6843321250087966e-05,
"loss": 1.2848,
"step": 544
},
{
"epoch": 2.8684210526315788,
"grad_norm": 0.5361830354797774,
"learning_rate": 3.669682618110671e-05,
"loss": 1.2657,
"step": 545
},
{
"epoch": 2.873684210526316,
"grad_norm": 0.8183317690070797,
"learning_rate": 3.655037570838711e-05,
"loss": 1.2866,
"step": 546
},
{
"epoch": 2.8789473684210525,
"grad_norm": 1.2353845817797051,
"learning_rate": 3.640397180916197e-05,
"loss": 1.2806,
"step": 547
},
{
"epoch": 2.8842105263157896,
"grad_norm": 1.275438903457986,
"learning_rate": 3.62576164600353e-05,
"loss": 1.3042,
"step": 548
},
{
"epoch": 2.889473684210526,
"grad_norm": 0.5384567882124249,
"learning_rate": 3.611131163695561e-05,
"loss": 1.2689,
"step": 549
},
{
"epoch": 2.8947368421052633,
"grad_norm": 0.7803333541473619,
"learning_rate": 3.5965059315189274e-05,
"loss": 1.2797,
"step": 550
},
{
"epoch": 2.9,
"grad_norm": 1.4689702016559818,
"learning_rate": 3.581886146929387e-05,
"loss": 1.2648,
"step": 551
},
{
"epoch": 2.905263157894737,
"grad_norm": 0.6546922206210783,
"learning_rate": 3.567272007309145e-05,
"loss": 1.279,
"step": 552
},
{
"epoch": 2.9105263157894736,
"grad_norm": 0.706490220466843,
"learning_rate": 3.552663709964194e-05,
"loss": 1.2735,
"step": 553
},
{
"epoch": 2.9157894736842103,
"grad_norm": 1.3658529786651115,
"learning_rate": 3.538061452121656e-05,
"loss": 1.2916,
"step": 554
},
{
"epoch": 2.9210526315789473,
"grad_norm": 0.9489706293231372,
"learning_rate": 3.523465430927106e-05,
"loss": 1.2918,
"step": 555
},
{
"epoch": 2.9263157894736844,
"grad_norm": 0.8764748345395458,
"learning_rate": 3.50887584344192e-05,
"loss": 1.3015,
"step": 556
},
{
"epoch": 2.931578947368421,
"grad_norm": 0.5355041821626928,
"learning_rate": 3.494292886640615e-05,
"loss": 1.2751,
"step": 557
},
{
"epoch": 2.9368421052631577,
"grad_norm": 0.9242505060104863,
"learning_rate": 3.479716757408185e-05,
"loss": 1.2819,
"step": 558
},
{
"epoch": 2.942105263157895,
"grad_norm": 1.2554507748275814,
"learning_rate": 3.465147652537443e-05,
"loss": 1.276,
"step": 559
},
{
"epoch": 2.9473684210526314,
"grad_norm": 0.9095910936651702,
"learning_rate": 3.4505857687263675e-05,
"loss": 1.2753,
"step": 560
},
{
"epoch": 2.9526315789473685,
"grad_norm": 0.6651860681009616,
"learning_rate": 3.4360313025754476e-05,
"loss": 1.2695,
"step": 561
},
{
"epoch": 2.957894736842105,
"grad_norm": 0.9291042715583687,
"learning_rate": 3.421484450585023e-05,
"loss": 1.2961,
"step": 562
},
{
"epoch": 2.963157894736842,
"grad_norm": 1.3279167632034623,
"learning_rate": 3.406945409152632e-05,
"loss": 1.2858,
"step": 563
},
{
"epoch": 2.968421052631579,
"grad_norm": 0.5900225146576717,
"learning_rate": 3.392414374570371e-05,
"loss": 1.2786,
"step": 564
},
{
"epoch": 2.973684210526316,
"grad_norm": 0.8721553828236943,
"learning_rate": 3.377891543022229e-05,
"loss": 1.2712,
"step": 565
},
{
"epoch": 2.9789473684210526,
"grad_norm": 1.0505950441437681,
"learning_rate": 3.363377110581442e-05,
"loss": 1.2719,
"step": 566
},
{
"epoch": 2.984210526315789,
"grad_norm": 1.3040980751717048,
"learning_rate": 3.348871273207861e-05,
"loss": 1.2961,
"step": 567
},
{
"epoch": 2.9894736842105263,
"grad_norm": 0.7750168697948207,
"learning_rate": 3.334374226745285e-05,
"loss": 1.287,
"step": 568
},
{
"epoch": 2.9947368421052634,
"grad_norm": 0.797465388668966,
"learning_rate": 3.319886166918829e-05,
"loss": 1.2798,
"step": 569
},
{
"epoch": 3.0,
"grad_norm": 0.7195120795363182,
"learning_rate": 3.305407289332279e-05,
"loss": 1.2516,
"step": 570
},
{
"epoch": 3.0052631578947366,
"grad_norm": 0.9683227012264453,
"learning_rate": 3.290937789465454e-05,
"loss": 1.245,
"step": 571
},
{
"epoch": 3.0105263157894737,
"grad_norm": 1.0758712880323096,
"learning_rate": 3.276477862671562e-05,
"loss": 1.2628,
"step": 572
},
{
"epoch": 3.0157894736842104,
"grad_norm": 0.9883859720793654,
"learning_rate": 3.262027704174561e-05,
"loss": 1.2509,
"step": 573
},
{
"epoch": 3.0210526315789474,
"grad_norm": 0.9615705195781193,
"learning_rate": 3.247587509066535e-05,
"loss": 1.264,
"step": 574
},
{
"epoch": 3.026315789473684,
"grad_norm": 0.7910888117572926,
"learning_rate": 3.2331574723050474e-05,
"loss": 1.2454,
"step": 575
},
{
"epoch": 3.031578947368421,
"grad_norm": 0.6384643157937262,
"learning_rate": 3.218737788710509e-05,
"loss": 1.2538,
"step": 576
},
{
"epoch": 3.036842105263158,
"grad_norm": 0.5368984975332023,
"learning_rate": 3.2043286529635614e-05,
"loss": 1.2587,
"step": 577
},
{
"epoch": 3.042105263157895,
"grad_norm": 0.6163486545813159,
"learning_rate": 3.189930259602433e-05,
"loss": 1.2452,
"step": 578
},
{
"epoch": 3.0473684210526315,
"grad_norm": 0.72931652275654,
"learning_rate": 3.175542803020319e-05,
"loss": 1.2414,
"step": 579
},
{
"epoch": 3.0526315789473686,
"grad_norm": 0.8975216647677429,
"learning_rate": 3.161166477462759e-05,
"loss": 1.2562,
"step": 580
},
{
"epoch": 3.057894736842105,
"grad_norm": 0.6779341882405682,
"learning_rate": 3.146801477025013e-05,
"loss": 1.259,
"step": 581
},
{
"epoch": 3.0631578947368423,
"grad_norm": 0.35876586554623435,
"learning_rate": 3.132447995649438e-05,
"loss": 1.2439,
"step": 582
},
{
"epoch": 3.068421052631579,
"grad_norm": 0.4633889396883284,
"learning_rate": 3.11810622712287e-05,
"loss": 1.2443,
"step": 583
},
{
"epoch": 3.0736842105263156,
"grad_norm": 0.37725222875354836,
"learning_rate": 3.103776365074017e-05,
"loss": 1.244,
"step": 584
},
{
"epoch": 3.0789473684210527,
"grad_norm": 0.48272932389464906,
"learning_rate": 3.089458602970828e-05,
"loss": 1.2509,
"step": 585
},
{
"epoch": 3.0842105263157893,
"grad_norm": 0.5490877544866288,
"learning_rate": 3.075153134117893e-05,
"loss": 1.264,
"step": 586
},
{
"epoch": 3.0894736842105264,
"grad_norm": 0.4207792008385385,
"learning_rate": 3.060860151653837e-05,
"loss": 1.2519,
"step": 587
},
{
"epoch": 3.094736842105263,
"grad_norm": 0.44337087568296857,
"learning_rate": 3.046579848548697e-05,
"loss": 1.2387,
"step": 588
},
{
"epoch": 3.1,
"grad_norm": 0.4345387632778484,
"learning_rate": 3.0323124176013297e-05,
"loss": 1.2471,
"step": 589
},
{
"epoch": 3.1052631578947367,
"grad_norm": 0.34646556904906206,
"learning_rate": 3.0180580514368037e-05,
"loss": 1.2574,
"step": 590
},
{
"epoch": 3.110526315789474,
"grad_norm": 0.5480997618564871,
"learning_rate": 3.0038169425038007e-05,
"loss": 1.2483,
"step": 591
},
{
"epoch": 3.1157894736842104,
"grad_norm": 0.3643135054525602,
"learning_rate": 2.9895892830720137e-05,
"loss": 1.2586,
"step": 592
},
{
"epoch": 3.1210526315789475,
"grad_norm": 0.3565417102254677,
"learning_rate": 2.9753752652295538e-05,
"loss": 1.2391,
"step": 593
},
{
"epoch": 3.126315789473684,
"grad_norm": 0.4591251447075665,
"learning_rate": 2.961175080880362e-05,
"loss": 1.2496,
"step": 594
},
{
"epoch": 3.1315789473684212,
"grad_norm": 0.46367424343953406,
"learning_rate": 2.9469889217416045e-05,
"loss": 1.2466,
"step": 595
},
{
"epoch": 3.136842105263158,
"grad_norm": 0.3997287611132656,
"learning_rate": 2.9328169793410954e-05,
"loss": 1.2458,
"step": 596
},
{
"epoch": 3.1421052631578945,
"grad_norm": 0.39902269987295985,
"learning_rate": 2.918659445014713e-05,
"loss": 1.2415,
"step": 597
},
{
"epoch": 3.1473684210526316,
"grad_norm": 0.3404609072909432,
"learning_rate": 2.9045165099038066e-05,
"loss": 1.2631,
"step": 598
},
{
"epoch": 3.1526315789473682,
"grad_norm": 0.3396481784449944,
"learning_rate": 2.890388364952623e-05,
"loss": 1.2548,
"step": 599
},
{
"epoch": 3.1578947368421053,
"grad_norm": 0.37782629450525207,
"learning_rate": 2.8762752009057232e-05,
"loss": 1.2617,
"step": 600
},
{
"epoch": 3.163157894736842,
"grad_norm": 0.5589592818613579,
"learning_rate": 2.8621772083054157e-05,
"loss": 1.2594,
"step": 601
},
{
"epoch": 3.168421052631579,
"grad_norm": 0.42966453281721634,
"learning_rate": 2.8480945774891764e-05,
"loss": 1.2413,
"step": 602
},
{
"epoch": 3.1736842105263157,
"grad_norm": 0.37959826731834306,
"learning_rate": 2.83402749858708e-05,
"loss": 1.2509,
"step": 603
},
{
"epoch": 3.1789473684210527,
"grad_norm": 0.4661717251353946,
"learning_rate": 2.819976161519236e-05,
"loss": 1.2629,
"step": 604
},
{
"epoch": 3.1842105263157894,
"grad_norm": 0.31707150511990617,
"learning_rate": 2.805940755993223e-05,
"loss": 1.2446,
"step": 605
},
{
"epoch": 3.1894736842105265,
"grad_norm": 0.3596061389333874,
"learning_rate": 2.7919214715015236e-05,
"loss": 1.2487,
"step": 606
},
{
"epoch": 3.194736842105263,
"grad_norm": 0.3125529192407293,
"learning_rate": 2.7779184973189773e-05,
"loss": 1.2575,
"step": 607
},
{
"epoch": 3.2,
"grad_norm": 0.39929761965783167,
"learning_rate": 2.7639320225002108e-05,
"loss": 1.2563,
"step": 608
},
{
"epoch": 3.205263157894737,
"grad_norm": 0.357481051645098,
"learning_rate": 2.7499622358770936e-05,
"loss": 1.2399,
"step": 609
},
{
"epoch": 3.2105263157894735,
"grad_norm": 0.3253036562346321,
"learning_rate": 2.7360093260561904e-05,
"loss": 1.2587,
"step": 610
},
{
"epoch": 3.2157894736842105,
"grad_norm": 1.0866422737509416,
"learning_rate": 2.722073481416208e-05,
"loss": 1.253,
"step": 611
},
{
"epoch": 3.221052631578947,
"grad_norm": 0.3704975946750915,
"learning_rate": 2.7081548901054574e-05,
"loss": 1.2449,
"step": 612
},
{
"epoch": 3.2263157894736842,
"grad_norm": 0.39681349064147786,
"learning_rate": 2.6942537400393117e-05,
"loss": 1.2393,
"step": 613
},
{
"epoch": 3.231578947368421,
"grad_norm": 0.38789399072411884,
"learning_rate": 2.680370218897669e-05,
"loss": 1.2476,
"step": 614
},
{
"epoch": 3.236842105263158,
"grad_norm": 0.6056318300360599,
"learning_rate": 2.6665045141224193e-05,
"loss": 1.2498,
"step": 615
},
{
"epoch": 3.2421052631578946,
"grad_norm": 0.5268591378002944,
"learning_rate": 2.6526568129149103e-05,
"loss": 1.2509,
"step": 616
},
{
"epoch": 3.2473684210526317,
"grad_norm": 0.5275312902783164,
"learning_rate": 2.638827302233428e-05,
"loss": 1.2581,
"step": 617
},
{
"epoch": 3.2526315789473683,
"grad_norm": 0.37709353602618134,
"learning_rate": 2.625016168790664e-05,
"loss": 1.2533,
"step": 618
},
{
"epoch": 3.2578947368421054,
"grad_norm": 0.3270640411740736,
"learning_rate": 2.611223599051198e-05,
"loss": 1.2743,
"step": 619
},
{
"epoch": 3.263157894736842,
"grad_norm": 0.32059620385654264,
"learning_rate": 2.597449779228983e-05,
"loss": 1.2568,
"step": 620
},
{
"epoch": 3.268421052631579,
"grad_norm": 0.39808261047072035,
"learning_rate": 2.5836948952848255e-05,
"loss": 1.2525,
"step": 621
},
{
"epoch": 3.2736842105263158,
"grad_norm": 0.34113291757836145,
"learning_rate": 2.5699591329238812e-05,
"loss": 1.268,
"step": 622
},
{
"epoch": 3.2789473684210524,
"grad_norm": 0.3042065217936969,
"learning_rate": 2.5562426775931418e-05,
"loss": 1.2483,
"step": 623
},
{
"epoch": 3.2842105263157895,
"grad_norm": 0.3974087061640213,
"learning_rate": 2.5425457144789364e-05,
"loss": 1.2609,
"step": 624
},
{
"epoch": 3.2894736842105265,
"grad_norm": 0.321409927169932,
"learning_rate": 2.5288684285044283e-05,
"loss": 1.255,
"step": 625
},
{
"epoch": 3.294736842105263,
"grad_norm": 0.385869925356024,
"learning_rate": 2.5152110043271166e-05,
"loss": 1.2576,
"step": 626
},
{
"epoch": 3.3,
"grad_norm": 0.3637124957498029,
"learning_rate": 2.501573626336352e-05,
"loss": 1.2411,
"step": 627
},
{
"epoch": 3.305263157894737,
"grad_norm": 0.3710385685313032,
"learning_rate": 2.4879564786508343e-05,
"loss": 1.2592,
"step": 628
},
{
"epoch": 3.3105263157894735,
"grad_norm": 0.4487560727745529,
"learning_rate": 2.474359745116136e-05,
"loss": 1.2404,
"step": 629
},
{
"epoch": 3.3157894736842106,
"grad_norm": 0.3231869771450256,
"learning_rate": 2.460783609302218e-05,
"loss": 1.2547,
"step": 630
},
{
"epoch": 3.3210526315789473,
"grad_norm": 0.4088431022056057,
"learning_rate": 2.4472282545009493e-05,
"loss": 1.2548,
"step": 631
},
{
"epoch": 3.3263157894736843,
"grad_norm": 0.29515450703495905,
"learning_rate": 2.4336938637236352e-05,
"loss": 1.2525,
"step": 632
},
{
"epoch": 3.331578947368421,
"grad_norm": 0.33297468568328076,
"learning_rate": 2.4201806196985426e-05,
"loss": 1.2737,
"step": 633
},
{
"epoch": 3.336842105263158,
"grad_norm": 0.3335294632136315,
"learning_rate": 2.4066887048684394e-05,
"loss": 1.2447,
"step": 634
},
{
"epoch": 3.3421052631578947,
"grad_norm": 0.2879112803644998,
"learning_rate": 2.393218301388123e-05,
"loss": 1.2715,
"step": 635
},
{
"epoch": 3.3473684210526318,
"grad_norm": 0.3133592323848536,
"learning_rate": 2.3797695911219668e-05,
"loss": 1.2561,
"step": 636
},
{
"epoch": 3.3526315789473684,
"grad_norm": 0.2430811283889928,
"learning_rate": 2.3663427556414664e-05,
"loss": 1.2601,
"step": 637
},
{
"epoch": 3.3578947368421055,
"grad_norm": 0.3579114126056535,
"learning_rate": 2.352937976222781e-05,
"loss": 1.253,
"step": 638
},
{
"epoch": 3.363157894736842,
"grad_norm": 0.26852343656836425,
"learning_rate": 2.3395554338442908e-05,
"loss": 1.245,
"step": 639
},
{
"epoch": 3.3684210526315788,
"grad_norm": 0.3011001164622397,
"learning_rate": 2.3261953091841553e-05,
"loss": 1.2546,
"step": 640
},
{
"epoch": 3.373684210526316,
"grad_norm": 0.26481840311987703,
"learning_rate": 2.3128577826178723e-05,
"loss": 1.2606,
"step": 641
},
{
"epoch": 3.3789473684210525,
"grad_norm": 0.3257272912007352,
"learning_rate": 2.2995430342158365e-05,
"loss": 1.2353,
"step": 642
},
{
"epoch": 3.3842105263157896,
"grad_norm": 0.38000488426555273,
"learning_rate": 2.2862512437409162e-05,
"loss": 1.2423,
"step": 643
},
{
"epoch": 3.389473684210526,
"grad_norm": 0.29866098174675637,
"learning_rate": 2.272982590646029e-05,
"loss": 1.2653,
"step": 644
},
{
"epoch": 3.3947368421052633,
"grad_norm": 0.7482169914063777,
"learning_rate": 2.2597372540717083e-05,
"loss": 1.2591,
"step": 645
},
{
"epoch": 3.4,
"grad_norm": 0.48442639740243737,
"learning_rate": 2.24651541284369e-05,
"loss": 1.2748,
"step": 646
},
{
"epoch": 3.405263157894737,
"grad_norm": 0.3933443985045218,
"learning_rate": 2.233317245470504e-05,
"loss": 1.2491,
"step": 647
},
{
"epoch": 3.4105263157894736,
"grad_norm": 0.4653265340743596,
"learning_rate": 2.220142930141054e-05,
"loss": 1.2592,
"step": 648
},
{
"epoch": 3.4157894736842107,
"grad_norm": 0.42673076337011967,
"learning_rate": 2.206992644722216e-05,
"loss": 1.2396,
"step": 649
},
{
"epoch": 3.4210526315789473,
"grad_norm": 0.33876188196334395,
"learning_rate": 2.1938665667564435e-05,
"loss": 1.2436,
"step": 650
},
{
"epoch": 3.4263157894736844,
"grad_norm": 0.3630588058950603,
"learning_rate": 2.1807648734593558e-05,
"loss": 1.2557,
"step": 651
},
{
"epoch": 3.431578947368421,
"grad_norm": 0.3529402619316208,
"learning_rate": 2.167687741717358e-05,
"loss": 1.2536,
"step": 652
},
{
"epoch": 3.4368421052631577,
"grad_norm": 0.3145009910486473,
"learning_rate": 2.1546353480852495e-05,
"loss": 1.2465,
"step": 653
},
{
"epoch": 3.442105263157895,
"grad_norm": 0.2825566028878834,
"learning_rate": 2.1416078687838403e-05,
"loss": 1.2543,
"step": 654
},
{
"epoch": 3.4473684210526314,
"grad_norm": 0.2872680469582709,
"learning_rate": 2.1286054796975696e-05,
"loss": 1.2637,
"step": 655
},
{
"epoch": 3.4526315789473685,
"grad_norm": 0.2802498708050248,
"learning_rate": 2.115628356372131e-05,
"loss": 1.245,
"step": 656
},
{
"epoch": 3.457894736842105,
"grad_norm": 0.2779169417503312,
"learning_rate": 2.1026766740121096e-05,
"loss": 1.2548,
"step": 657
},
{
"epoch": 3.463157894736842,
"grad_norm": 0.27790502165031583,
"learning_rate": 2.089750607478606e-05,
"loss": 1.2482,
"step": 658
},
{
"epoch": 3.468421052631579,
"grad_norm": 0.3106234637273863,
"learning_rate": 2.076850331286881e-05,
"loss": 1.2474,
"step": 659
},
{
"epoch": 3.473684210526316,
"grad_norm": 0.2460612966298966,
"learning_rate": 2.063976019604006e-05,
"loss": 1.2578,
"step": 660
},
{
"epoch": 3.4789473684210526,
"grad_norm": 0.4002624603687612,
"learning_rate": 2.0511278462464933e-05,
"loss": 1.2323,
"step": 661
},
{
"epoch": 3.4842105263157896,
"grad_norm": 0.3558072656216221,
"learning_rate": 2.038305984677969e-05,
"loss": 1.2513,
"step": 662
},
{
"epoch": 3.4894736842105263,
"grad_norm": 0.32674276214626774,
"learning_rate": 2.025510608006819e-05,
"loss": 1.248,
"step": 663
},
{
"epoch": 3.4947368421052634,
"grad_norm": 0.3685362965399088,
"learning_rate": 2.012741888983861e-05,
"loss": 1.2612,
"step": 664
},
{
"epoch": 3.5,
"grad_norm": 0.3851874388241183,
"learning_rate": 2.0000000000000012e-05,
"loss": 1.26,
"step": 665
},
{
"epoch": 3.5052631578947366,
"grad_norm": 0.2922093360847206,
"learning_rate": 1.9872851130839126e-05,
"loss": 1.2503,
"step": 666
},
{
"epoch": 3.5105263157894737,
"grad_norm": 0.2982128935849179,
"learning_rate": 1.9745973998997177e-05,
"loss": 1.2461,
"step": 667
},
{
"epoch": 3.515789473684211,
"grad_norm": 0.36881831273338744,
"learning_rate": 1.9619370317446612e-05,
"loss": 1.2627,
"step": 668
},
{
"epoch": 3.5210526315789474,
"grad_norm": 0.25559075127742553,
"learning_rate": 1.9493041795468018e-05,
"loss": 1.2474,
"step": 669
},
{
"epoch": 3.526315789473684,
"grad_norm": 0.6103223603779421,
"learning_rate": 1.9366990138627054e-05,
"loss": 1.2553,
"step": 670
},
{
"epoch": 3.531578947368421,
"grad_norm": 0.32053875904249984,
"learning_rate": 1.9241217048751406e-05,
"loss": 1.2716,
"step": 671
},
{
"epoch": 3.536842105263158,
"grad_norm": 0.32627511828160094,
"learning_rate": 1.911572422390783e-05,
"loss": 1.2509,
"step": 672
},
{
"epoch": 3.542105263157895,
"grad_norm": 0.31231339980121575,
"learning_rate": 1.899051335837919e-05,
"loss": 1.2542,
"step": 673
},
{
"epoch": 3.5473684210526315,
"grad_norm": 0.31642734990082777,
"learning_rate": 1.886558614264165e-05,
"loss": 1.2544,
"step": 674
},
{
"epoch": 3.5526315789473686,
"grad_norm": 0.41419322615420073,
"learning_rate": 1.8740944263341773e-05,
"loss": 1.2722,
"step": 675
},
{
"epoch": 3.557894736842105,
"grad_norm": 0.2575000448429207,
"learning_rate": 1.8616589403273776e-05,
"loss": 1.251,
"step": 676
},
{
"epoch": 3.5631578947368423,
"grad_norm": 0.45829370611833337,
"learning_rate": 1.8492523241356877e-05,
"loss": 1.2552,
"step": 677
},
{
"epoch": 3.568421052631579,
"grad_norm": 0.3876144668681015,
"learning_rate": 1.8368747452612504e-05,
"loss": 1.2756,
"step": 678
},
{
"epoch": 3.5736842105263156,
"grad_norm": 0.3605137220418223,
"learning_rate": 1.8245263708141782e-05,
"loss": 1.242,
"step": 679
},
{
"epoch": 3.5789473684210527,
"grad_norm": 0.3947355937612717,
"learning_rate": 1.8122073675102935e-05,
"loss": 1.2556,
"step": 680
},
{
"epoch": 3.5842105263157897,
"grad_norm": 0.29347916836402094,
"learning_rate": 1.7999179016688763e-05,
"loss": 1.26,
"step": 681
},
{
"epoch": 3.5894736842105264,
"grad_norm": 0.32495295214844105,
"learning_rate": 1.7876581392104225e-05,
"loss": 1.2496,
"step": 682
},
{
"epoch": 3.594736842105263,
"grad_norm": 0.2493724682619427,
"learning_rate": 1.7754282456543977e-05,
"loss": 1.2514,
"step": 683
},
{
"epoch": 3.6,
"grad_norm": 0.35605401548647925,
"learning_rate": 1.7632283861170135e-05,
"loss": 1.2539,
"step": 684
},
{
"epoch": 3.6052631578947367,
"grad_norm": 0.2630345804072707,
"learning_rate": 1.7510587253089842e-05,
"loss": 1.2579,
"step": 685
},
{
"epoch": 3.610526315789474,
"grad_norm": 0.2772719177300871,
"learning_rate": 1.7389194275333124e-05,
"loss": 1.2471,
"step": 686
},
{
"epoch": 3.6157894736842104,
"grad_norm": 0.3256551364716347,
"learning_rate": 1.7268106566830713e-05,
"loss": 1.2562,
"step": 687
},
{
"epoch": 3.6210526315789475,
"grad_norm": 0.2942105351769792,
"learning_rate": 1.7147325762391848e-05,
"loss": 1.2664,
"step": 688
},
{
"epoch": 3.626315789473684,
"grad_norm": 0.29601761914650015,
"learning_rate": 1.702685349268226e-05,
"loss": 1.2559,
"step": 689
},
{
"epoch": 3.6315789473684212,
"grad_norm": 0.2759560921461832,
"learning_rate": 1.690669138420215e-05,
"loss": 1.2591,
"step": 690
},
{
"epoch": 3.636842105263158,
"grad_norm": 0.2440653651529168,
"learning_rate": 1.6786841059264217e-05,
"loss": 1.2574,
"step": 691
},
{
"epoch": 3.6421052631578945,
"grad_norm": 0.303898127022955,
"learning_rate": 1.6667304135971756e-05,
"loss": 1.2547,
"step": 692
},
{
"epoch": 3.6473684210526316,
"grad_norm": 0.2481861381786453,
"learning_rate": 1.65480822281968e-05,
"loss": 1.2488,
"step": 693
},
{
"epoch": 3.6526315789473687,
"grad_norm": 0.2565499348104272,
"learning_rate": 1.6429176945558413e-05,
"loss": 1.2561,
"step": 694
},
{
"epoch": 3.6578947368421053,
"grad_norm": 0.3224687182653659,
"learning_rate": 1.6310589893400804e-05,
"loss": 1.247,
"step": 695
},
{
"epoch": 3.663157894736842,
"grad_norm": 0.25279520055905946,
"learning_rate": 1.6192322672771793e-05,
"loss": 1.2636,
"step": 696
},
{
"epoch": 3.668421052631579,
"grad_norm": 0.3239078414093973,
"learning_rate": 1.6074376880401147e-05,
"loss": 1.2431,
"step": 697
},
{
"epoch": 3.6736842105263157,
"grad_norm": 0.25211963429157525,
"learning_rate": 1.5956754108678996e-05,
"loss": 1.2489,
"step": 698
},
{
"epoch": 3.6789473684210527,
"grad_norm": 0.3288796816421695,
"learning_rate": 1.5839455945634372e-05,
"loss": 1.2433,
"step": 699
},
{
"epoch": 3.6842105263157894,
"grad_norm": 0.2570823868070139,
"learning_rate": 1.5722483974913737e-05,
"loss": 1.2437,
"step": 700
},
{
"epoch": 3.6894736842105265,
"grad_norm": 0.23205884441696503,
"learning_rate": 1.560583977575964e-05,
"loss": 1.2558,
"step": 701
},
{
"epoch": 3.694736842105263,
"grad_norm": 0.254586157360098,
"learning_rate": 1.5489524922989367e-05,
"loss": 1.2677,
"step": 702
},
{
"epoch": 3.7,
"grad_norm": 0.2528078741758432,
"learning_rate": 1.537354098697367e-05,
"loss": 1.2521,
"step": 703
},
{
"epoch": 3.705263157894737,
"grad_norm": 0.30438966904710707,
"learning_rate": 1.525788953361563e-05,
"loss": 1.2569,
"step": 704
},
{
"epoch": 3.7105263157894735,
"grad_norm": 0.24585215855274448,
"learning_rate": 1.5142572124329418e-05,
"loss": 1.2582,
"step": 705
},
{
"epoch": 3.7157894736842105,
"grad_norm": 0.23812179037555448,
"learning_rate": 1.5027590316019276e-05,
"loss": 1.2582,
"step": 706
},
{
"epoch": 3.7210526315789476,
"grad_norm": 0.2258598951803704,
"learning_rate": 1.491294566105852e-05,
"loss": 1.2398,
"step": 707
},
{
"epoch": 3.7263157894736842,
"grad_norm": 0.23820145432975506,
"learning_rate": 1.4798639707268509e-05,
"loss": 1.26,
"step": 708
},
{
"epoch": 3.731578947368421,
"grad_norm": 0.27098791758934043,
"learning_rate": 1.4684673997897795e-05,
"loss": 1.2467,
"step": 709
},
{
"epoch": 3.736842105263158,
"grad_norm": 0.1895081621315529,
"learning_rate": 1.457105007160129e-05,
"loss": 1.2469,
"step": 710
},
{
"epoch": 3.7421052631578946,
"grad_norm": 0.24431380487075854,
"learning_rate": 1.4457769462419461e-05,
"loss": 1.2505,
"step": 711
},
{
"epoch": 3.7473684210526317,
"grad_norm": 0.2598287894690381,
"learning_rate": 1.4344833699757662e-05,
"loss": 1.2733,
"step": 712
},
{
"epoch": 3.7526315789473683,
"grad_norm": 0.24173801356915325,
"learning_rate": 1.4232244308365437e-05,
"loss": 1.2515,
"step": 713
},
{
"epoch": 3.7578947368421054,
"grad_norm": 0.2744768545995936,
"learning_rate": 1.4120002808315999e-05,
"loss": 1.2446,
"step": 714
},
{
"epoch": 3.763157894736842,
"grad_norm": 0.29075680429359135,
"learning_rate": 1.4008110714985623e-05,
"loss": 1.2576,
"step": 715
},
{
"epoch": 3.768421052631579,
"grad_norm": 0.1679499052346039,
"learning_rate": 1.3896569539033253e-05,
"loss": 1.2434,
"step": 716
},
{
"epoch": 3.7736842105263158,
"grad_norm": 0.21354680460803685,
"learning_rate": 1.3785380786380103e-05,
"loss": 1.2642,
"step": 717
},
{
"epoch": 3.7789473684210524,
"grad_norm": 0.24355235079533985,
"learning_rate": 1.367454595818928e-05,
"loss": 1.2449,
"step": 718
},
{
"epoch": 3.7842105263157895,
"grad_norm": 0.17842505149174132,
"learning_rate": 1.3564066550845558e-05,
"loss": 1.2399,
"step": 719
},
{
"epoch": 3.7894736842105265,
"grad_norm": 0.2363958816949115,
"learning_rate": 1.3453944055935151e-05,
"loss": 1.2471,
"step": 720
},
{
"epoch": 3.794736842105263,
"grad_norm": 0.20243183669778259,
"learning_rate": 1.3344179960225603e-05,
"loss": 1.2535,
"step": 721
},
{
"epoch": 3.8,
"grad_norm": 0.2471952644451058,
"learning_rate": 1.3234775745645684e-05,
"loss": 1.2484,
"step": 722
},
{
"epoch": 3.805263157894737,
"grad_norm": 0.21944384742054443,
"learning_rate": 1.3125732889265393e-05,
"loss": 1.2444,
"step": 723
},
{
"epoch": 3.8105263157894735,
"grad_norm": 0.22384339943654685,
"learning_rate": 1.3017052863276054e-05,
"loss": 1.2544,
"step": 724
},
{
"epoch": 3.8157894736842106,
"grad_norm": 0.20643300200194556,
"learning_rate": 1.2908737134970367e-05,
"loss": 1.2455,
"step": 725
},
{
"epoch": 3.8210526315789473,
"grad_norm": 0.22387663232782792,
"learning_rate": 1.2800787166722634e-05,
"loss": 1.2415,
"step": 726
},
{
"epoch": 3.8263157894736843,
"grad_norm": 0.23601246798864953,
"learning_rate": 1.2693204415969068e-05,
"loss": 1.2488,
"step": 727
},
{
"epoch": 3.831578947368421,
"grad_norm": 0.21781387567237637,
"learning_rate": 1.2585990335188014e-05,
"loss": 1.2346,
"step": 728
},
{
"epoch": 3.836842105263158,
"grad_norm": 0.20954968812529903,
"learning_rate": 1.2479146371880408e-05,
"loss": 1.25,
"step": 729
},
{
"epoch": 3.8421052631578947,
"grad_norm": 0.2935272323831709,
"learning_rate": 1.2372673968550229e-05,
"loss": 1.2575,
"step": 730
},
{
"epoch": 3.8473684210526313,
"grad_norm": 0.23428135792560334,
"learning_rate": 1.2266574562684994e-05,
"loss": 1.2477,
"step": 731
},
{
"epoch": 3.8526315789473684,
"grad_norm": 0.18658016102303704,
"learning_rate": 1.2160849586736375e-05,
"loss": 1.256,
"step": 732
},
{
"epoch": 3.8578947368421055,
"grad_norm": 0.23105098493810466,
"learning_rate": 1.2055500468100849e-05,
"loss": 1.2399,
"step": 733
},
{
"epoch": 3.863157894736842,
"grad_norm": 0.1929616859489707,
"learning_rate": 1.1950528629100457e-05,
"loss": 1.2515,
"step": 734
},
{
"epoch": 3.8684210526315788,
"grad_norm": 0.218750003790284,
"learning_rate": 1.1845935486963546e-05,
"loss": 1.2489,
"step": 735
},
{
"epoch": 3.873684210526316,
"grad_norm": 0.19977098349774547,
"learning_rate": 1.1741722453805657e-05,
"loss": 1.2449,
"step": 736
},
{
"epoch": 3.8789473684210525,
"grad_norm": 0.23507506446012338,
"learning_rate": 1.163789093661051e-05,
"loss": 1.2562,
"step": 737
},
{
"epoch": 3.8842105263157896,
"grad_norm": 0.19034197687876206,
"learning_rate": 1.1534442337210919e-05,
"loss": 1.2528,
"step": 738
},
{
"epoch": 3.889473684210526,
"grad_norm": 0.25267159420496116,
"learning_rate": 1.1431378052269934e-05,
"loss": 1.2571,
"step": 739
},
{
"epoch": 3.8947368421052633,
"grad_norm": 0.21369948030483346,
"learning_rate": 1.1328699473261957e-05,
"loss": 1.241,
"step": 740
},
{
"epoch": 3.9,
"grad_norm": 0.23307740618119258,
"learning_rate": 1.1226407986453963e-05,
"loss": 1.2557,
"step": 741
},
{
"epoch": 3.905263157894737,
"grad_norm": 0.19115969783367653,
"learning_rate": 1.1124504972886782e-05,
"loss": 1.2525,
"step": 742
},
{
"epoch": 3.9105263157894736,
"grad_norm": 0.2681117091243346,
"learning_rate": 1.1022991808356442e-05,
"loss": 1.248,
"step": 743
},
{
"epoch": 3.9157894736842103,
"grad_norm": 0.1651284103554666,
"learning_rate": 1.0921869863395642e-05,
"loss": 1.242,
"step": 744
},
{
"epoch": 3.9210526315789473,
"grad_norm": 0.24161510420189317,
"learning_rate": 1.0821140503255174e-05,
"loss": 1.2555,
"step": 745
},
{
"epoch": 3.9263157894736844,
"grad_norm": 0.20280135248319278,
"learning_rate": 1.0720805087885533e-05,
"loss": 1.2578,
"step": 746
},
{
"epoch": 3.931578947368421,
"grad_norm": 0.3284807144434915,
"learning_rate": 1.0620864971918579e-05,
"loss": 1.259,
"step": 747
},
{
"epoch": 3.9368421052631577,
"grad_norm": 0.22538990852777954,
"learning_rate": 1.05213215046492e-05,
"loss": 1.2597,
"step": 748
},
{
"epoch": 3.942105263157895,
"grad_norm": 0.19055951323654136,
"learning_rate": 1.0422176030017117e-05,
"loss": 1.2443,
"step": 749
},
{
"epoch": 3.9473684210526314,
"grad_norm": 0.18646041833135787,
"learning_rate": 1.0323429886588743e-05,
"loss": 1.2388,
"step": 750
},
{
"epoch": 3.9526315789473685,
"grad_norm": 0.19285379546461523,
"learning_rate": 1.0225084407539109e-05,
"loss": 1.2335,
"step": 751
},
{
"epoch": 3.957894736842105,
"grad_norm": 0.1997818414436052,
"learning_rate": 1.0127140920633857e-05,
"loss": 1.2439,
"step": 752
},
{
"epoch": 3.963157894736842,
"grad_norm": 0.20149581036707856,
"learning_rate": 1.0029600748211314e-05,
"loss": 1.2415,
"step": 753
},
{
"epoch": 3.968421052631579,
"grad_norm": 0.19260248911961064,
"learning_rate": 9.932465207164675e-06,
"loss": 1.2633,
"step": 754
},
{
"epoch": 3.973684210526316,
"grad_norm": 0.21099578591151794,
"learning_rate": 9.835735608924155e-06,
"loss": 1.231,
"step": 755
},
{
"epoch": 3.9789473684210526,
"grad_norm": 0.17132739675169845,
"learning_rate": 9.739413259439337e-06,
"loss": 1.2451,
"step": 756
},
{
"epoch": 3.984210526315789,
"grad_norm": 0.21904215059223633,
"learning_rate": 9.643499459161538e-06,
"loss": 1.2523,
"step": 757
},
{
"epoch": 3.9894736842105263,
"grad_norm": 0.224551193557602,
"learning_rate": 9.547995503026217e-06,
"loss": 1.2478,
"step": 758
},
{
"epoch": 3.9947368421052634,
"grad_norm": 0.19238609932248696,
"learning_rate": 9.452902680435527e-06,
"loss": 1.249,
"step": 759
},
{
"epoch": 4.0,
"grad_norm": 0.22055223309269914,
"learning_rate": 9.358222275240884e-06,
"loss": 1.2167,
"step": 760
},
{
"epoch": 4.005263157894737,
"grad_norm": 0.20140128214778716,
"learning_rate": 9.263955565725648e-06,
"loss": 1.2391,
"step": 761
},
{
"epoch": 4.010526315789473,
"grad_norm": 0.2068373210972995,
"learning_rate": 9.170103824587855e-06,
"loss": 1.2331,
"step": 762
},
{
"epoch": 4.015789473684211,
"grad_norm": 0.18232115316386402,
"learning_rate": 9.07666831892304e-06,
"loss": 1.2121,
"step": 763
},
{
"epoch": 4.021052631578947,
"grad_norm": 0.2188152260857773,
"learning_rate": 8.983650310207142e-06,
"loss": 1.2232,
"step": 764
},
{
"epoch": 4.026315789473684,
"grad_norm": 0.1880274936269495,
"learning_rate": 8.89105105427945e-06,
"loss": 1.2272,
"step": 765
},
{
"epoch": 4.031578947368421,
"grad_norm": 0.17030491611623289,
"learning_rate": 8.798871801325632e-06,
"loss": 1.2284,
"step": 766
},
{
"epoch": 4.036842105263158,
"grad_norm": 0.1887119280020856,
"learning_rate": 8.707113795860938e-06,
"loss": 1.2364,
"step": 767
},
{
"epoch": 4.042105263157895,
"grad_norm": 0.18907111180220373,
"learning_rate": 8.615778276713293e-06,
"loss": 1.2277,
"step": 768
},
{
"epoch": 4.0473684210526315,
"grad_norm": 0.17028701794910334,
"learning_rate": 8.524866477006637e-06,
"loss": 1.2268,
"step": 769
},
{
"epoch": 4.052631578947368,
"grad_norm": 0.1927239082270522,
"learning_rate": 8.434379624144261e-06,
"loss": 1.2202,
"step": 770
},
{
"epoch": 4.057894736842106,
"grad_norm": 0.18231681740661396,
"learning_rate": 8.344318939792232e-06,
"loss": 1.2103,
"step": 771
},
{
"epoch": 4.063157894736842,
"grad_norm": 0.2108141165888399,
"learning_rate": 8.254685639862896e-06,
"loss": 1.2289,
"step": 772
},
{
"epoch": 4.068421052631579,
"grad_norm": 0.21501105777435195,
"learning_rate": 8.165480934498462e-06,
"loss": 1.2304,
"step": 773
},
{
"epoch": 4.073684210526316,
"grad_norm": 0.22014095135466175,
"learning_rate": 8.076706028054709e-06,
"loss": 1.2395,
"step": 774
},
{
"epoch": 4.078947368421052,
"grad_norm": 0.18281510166398557,
"learning_rate": 7.988362119084642e-06,
"loss": 1.232,
"step": 775
},
{
"epoch": 4.08421052631579,
"grad_norm": 0.21712131045816194,
"learning_rate": 7.90045040032236e-06,
"loss": 1.2423,
"step": 776
},
{
"epoch": 4.089473684210526,
"grad_norm": 0.19226805462323326,
"learning_rate": 7.812972058666974e-06,
"loss": 1.2295,
"step": 777
},
{
"epoch": 4.094736842105263,
"grad_norm": 0.175015352113717,
"learning_rate": 7.725928275166534e-06,
"loss": 1.2282,
"step": 778
},
{
"epoch": 4.1,
"grad_norm": 0.2095750364202842,
"learning_rate": 7.639320225002106e-06,
"loss": 1.2244,
"step": 779
},
{
"epoch": 4.105263157894737,
"grad_norm": 0.19644672306841843,
"learning_rate": 7.553149077471915e-06,
"loss": 1.2217,
"step": 780
},
{
"epoch": 4.110526315789474,
"grad_norm": 0.2000635414888708,
"learning_rate": 7.46741599597554e-06,
"loss": 1.2319,
"step": 781
},
{
"epoch": 4.11578947368421,
"grad_norm": 0.1746543551783459,
"learning_rate": 7.382122137998209e-06,
"loss": 1.2282,
"step": 782
},
{
"epoch": 4.121052631578947,
"grad_norm": 0.17481980918717463,
"learning_rate": 7.297268655095213e-06,
"loss": 1.2395,
"step": 783
},
{
"epoch": 4.126315789473685,
"grad_norm": 0.17610089627569894,
"learning_rate": 7.212856692876289e-06,
"loss": 1.2319,
"step": 784
},
{
"epoch": 4.131578947368421,
"grad_norm": 0.17566117386443802,
"learning_rate": 7.128887390990198e-06,
"loss": 1.2245,
"step": 785
},
{
"epoch": 4.136842105263158,
"grad_norm": 0.18888285977402394,
"learning_rate": 7.045361883109318e-06,
"loss": 1.2363,
"step": 786
},
{
"epoch": 4.1421052631578945,
"grad_norm": 0.1679963465599155,
"learning_rate": 6.962281296914386e-06,
"loss": 1.2319,
"step": 787
},
{
"epoch": 4.147368421052631,
"grad_norm": 0.17232128719198106,
"learning_rate": 6.8796467540791986e-06,
"loss": 1.2312,
"step": 788
},
{
"epoch": 4.152631578947369,
"grad_norm": 0.19685528274227304,
"learning_rate": 6.797459370255519e-06,
"loss": 1.2324,
"step": 789
},
{
"epoch": 4.157894736842105,
"grad_norm": 0.1583456150516079,
"learning_rate": 6.715720255058e-06,
"loss": 1.24,
"step": 790
},
{
"epoch": 4.163157894736842,
"grad_norm": 0.172328795648275,
"learning_rate": 6.634430512049213e-06,
"loss": 1.2513,
"step": 791
},
{
"epoch": 4.168421052631579,
"grad_norm": 0.16257107292586506,
"learning_rate": 6.553591238724712e-06,
"loss": 1.2275,
"step": 792
},
{
"epoch": 4.173684210526316,
"grad_norm": 0.14389724088218966,
"learning_rate": 6.4732035264982904e-06,
"loss": 1.2348,
"step": 793
},
{
"epoch": 4.178947368421053,
"grad_norm": 0.15689066100797078,
"learning_rate": 6.39326846068717e-06,
"loss": 1.2179,
"step": 794
},
{
"epoch": 4.184210526315789,
"grad_norm": 0.18533318047509703,
"learning_rate": 6.313787120497376e-06,
"loss": 1.236,
"step": 795
},
{
"epoch": 4.189473684210526,
"grad_norm": 0.1459277700590749,
"learning_rate": 6.234760579009167e-06,
"loss": 1.2435,
"step": 796
},
{
"epoch": 4.1947368421052635,
"grad_norm": 0.155103015306397,
"learning_rate": 6.1561899031625794e-06,
"loss": 1.2282,
"step": 797
},
{
"epoch": 4.2,
"grad_norm": 0.1477347804716696,
"learning_rate": 6.078076153742962e-06,
"loss": 1.2249,
"step": 798
},
{
"epoch": 4.205263157894737,
"grad_norm": 0.15276423763995617,
"learning_rate": 6.000420385366687e-06,
"loss": 1.2297,
"step": 799
},
{
"epoch": 4.2105263157894735,
"grad_norm": 0.15126290143221918,
"learning_rate": 5.923223646466923e-06,
"loss": 1.2387,
"step": 800
},
{
"epoch": 4.215789473684211,
"grad_norm": 0.15492928616201465,
"learning_rate": 5.846486979279449e-06,
"loss": 1.2367,
"step": 801
},
{
"epoch": 4.221052631578948,
"grad_norm": 0.17188412280549703,
"learning_rate": 5.770211419828604e-06,
"loss": 1.2322,
"step": 802
},
{
"epoch": 4.226315789473684,
"grad_norm": 0.15097184444197026,
"learning_rate": 5.694397997913319e-06,
"loss": 1.2321,
"step": 803
},
{
"epoch": 4.231578947368421,
"grad_norm": 0.1453328152503722,
"learning_rate": 5.619047737093164e-06,
"loss": 1.2384,
"step": 804
},
{
"epoch": 4.2368421052631575,
"grad_norm": 0.18220366542871314,
"learning_rate": 5.5441616546745646e-06,
"loss": 1.2383,
"step": 805
},
{
"epoch": 4.242105263157895,
"grad_norm": 0.167450785630923,
"learning_rate": 5.469740761697044e-06,
"loss": 1.2426,
"step": 806
},
{
"epoch": 4.247368421052632,
"grad_norm": 0.14931148609570408,
"learning_rate": 5.395786062919622e-06,
"loss": 1.2333,
"step": 807
},
{
"epoch": 4.252631578947368,
"grad_norm": 0.16803901696022852,
"learning_rate": 5.322298556807179e-06,
"loss": 1.2417,
"step": 808
},
{
"epoch": 4.257894736842105,
"grad_norm": 0.16226281008273294,
"learning_rate": 5.249279235517031e-06,
"loss": 1.2329,
"step": 809
},
{
"epoch": 4.2631578947368425,
"grad_norm": 0.13999210516589425,
"learning_rate": 5.176729084885508e-06,
"loss": 1.2412,
"step": 810
},
{
"epoch": 4.268421052631579,
"grad_norm": 0.18759320208384703,
"learning_rate": 5.10464908441465e-06,
"loss": 1.2357,
"step": 811
},
{
"epoch": 4.273684210526316,
"grad_norm": 0.17340200354001228,
"learning_rate": 5.033040207258979e-06,
"loss": 1.2271,
"step": 812
},
{
"epoch": 4.278947368421052,
"grad_norm": 0.15950936103038027,
"learning_rate": 4.9619034202123884e-06,
"loss": 1.2151,
"step": 813
},
{
"epoch": 4.284210526315789,
"grad_norm": 0.15140529416594803,
"learning_rate": 4.891239683695044e-06,
"loss": 1.232,
"step": 814
},
{
"epoch": 4.2894736842105265,
"grad_norm": 0.13998854140642578,
"learning_rate": 4.821049951740442e-06,
"loss": 1.2255,
"step": 815
},
{
"epoch": 4.294736842105263,
"grad_norm": 0.1499375273785916,
"learning_rate": 4.751335171982527e-06,
"loss": 1.2314,
"step": 816
},
{
"epoch": 4.3,
"grad_norm": 0.158191412279702,
"learning_rate": 4.6820962856429205e-06,
"loss": 1.234,
"step": 817
},
{
"epoch": 4.3052631578947365,
"grad_norm": 0.14057999268017907,
"learning_rate": 4.613334227518165e-06,
"loss": 1.2427,
"step": 818
},
{
"epoch": 4.310526315789474,
"grad_norm": 0.1433236145981049,
"learning_rate": 4.545049925967137e-06,
"loss": 1.2313,
"step": 819
},
{
"epoch": 4.315789473684211,
"grad_norm": 0.1303171009707853,
"learning_rate": 4.4772443028985004e-06,
"loss": 1.2297,
"step": 820
},
{
"epoch": 4.321052631578947,
"grad_norm": 0.15617718436585057,
"learning_rate": 4.409918273758278e-06,
"loss": 1.2412,
"step": 821
},
{
"epoch": 4.326315789473684,
"grad_norm": 0.1476554630303936,
"learning_rate": 4.343072747517459e-06,
"loss": 1.2387,
"step": 822
},
{
"epoch": 4.331578947368421,
"grad_norm": 0.1362192280798835,
"learning_rate": 4.276708626659778e-06,
"loss": 1.2349,
"step": 823
},
{
"epoch": 4.336842105263158,
"grad_norm": 0.15051183831923126,
"learning_rate": 4.2108268071694616e-06,
"loss": 1.2122,
"step": 824
},
{
"epoch": 4.342105263157895,
"grad_norm": 0.1445207500529269,
"learning_rate": 4.1454281785191995e-06,
"loss": 1.2224,
"step": 825
},
{
"epoch": 4.347368421052631,
"grad_norm": 0.14362316701732558,
"learning_rate": 4.080513623658075e-06,
"loss": 1.2186,
"step": 826
},
{
"epoch": 4.352631578947369,
"grad_norm": 0.1479016471804495,
"learning_rate": 4.0160840189997155e-06,
"loss": 1.2324,
"step": 827
},
{
"epoch": 4.3578947368421055,
"grad_norm": 0.1397296336400901,
"learning_rate": 3.952140234410396e-06,
"loss": 1.2309,
"step": 828
},
{
"epoch": 4.363157894736842,
"grad_norm": 0.12213103880797943,
"learning_rate": 3.888683133197293e-06,
"loss": 1.2231,
"step": 829
},
{
"epoch": 4.368421052631579,
"grad_norm": 0.13169031113809618,
"learning_rate": 3.825713572096903e-06,
"loss": 1.2264,
"step": 830
},
{
"epoch": 4.373684210526315,
"grad_norm": 0.1415624842501799,
"learning_rate": 3.7632324012633992e-06,
"loss": 1.2444,
"step": 831
},
{
"epoch": 4.378947368421053,
"grad_norm": 0.14671744800493622,
"learning_rate": 3.701240464257181e-06,
"loss": 1.2183,
"step": 832
},
{
"epoch": 4.38421052631579,
"grad_norm": 0.13323253635868224,
"learning_rate": 3.6397385980335e-06,
"loss": 1.2156,
"step": 833
},
{
"epoch": 4.389473684210526,
"grad_norm": 0.13127420581089705,
"learning_rate": 3.5787276329311315e-06,
"loss": 1.2231,
"step": 834
},
{
"epoch": 4.394736842105263,
"grad_norm": 0.133896287855281,
"learning_rate": 3.518208392661184e-06,
"loss": 1.2293,
"step": 835
},
{
"epoch": 4.4,
"grad_norm": 0.13026051571456285,
"learning_rate": 3.458181694295961e-06,
"loss": 1.2395,
"step": 836
},
{
"epoch": 4.405263157894737,
"grad_norm": 0.15010438499441964,
"learning_rate": 3.398648348257969e-06,
"loss": 1.2323,
"step": 837
},
{
"epoch": 4.410526315789474,
"grad_norm": 0.1501949658490909,
"learning_rate": 3.3396091583089275e-06,
"loss": 1.2186,
"step": 838
},
{
"epoch": 4.41578947368421,
"grad_norm": 0.1433763217867749,
"learning_rate": 3.281064921538919e-06,
"loss": 1.2379,
"step": 839
},
{
"epoch": 4.421052631578947,
"grad_norm": 0.1323412042853926,
"learning_rate": 3.2230164283556918e-06,
"loss": 1.2231,
"step": 840
},
{
"epoch": 4.426315789473684,
"grad_norm": 0.13635885589343097,
"learning_rate": 3.1654644624739082e-06,
"loss": 1.2297,
"step": 841
},
{
"epoch": 4.431578947368421,
"grad_norm": 0.1323736124785357,
"learning_rate": 3.1084098009046106e-06,
"loss": 1.235,
"step": 842
},
{
"epoch": 4.436842105263158,
"grad_norm": 0.13102236402292022,
"learning_rate": 3.0518532139447267e-06,
"loss": 1.2307,
"step": 843
},
{
"epoch": 4.442105263157894,
"grad_norm": 0.15428668766385725,
"learning_rate": 2.995795465166644e-06,
"loss": 1.226,
"step": 844
},
{
"epoch": 4.447368421052632,
"grad_norm": 0.13778512976226498,
"learning_rate": 2.9402373114079295e-06,
"loss": 1.2276,
"step": 845
},
{
"epoch": 4.4526315789473685,
"grad_norm": 0.13474950636883365,
"learning_rate": 2.8851795027610997e-06,
"loss": 1.2228,
"step": 846
},
{
"epoch": 4.457894736842105,
"grad_norm": 0.1353883744809194,
"learning_rate": 2.83062278256351e-06,
"loss": 1.2339,
"step": 847
},
{
"epoch": 4.463157894736842,
"grad_norm": 0.13137189130014673,
"learning_rate": 2.776567887387267e-06,
"loss": 1.2301,
"step": 848
},
{
"epoch": 4.468421052631579,
"grad_norm": 0.13126591401950521,
"learning_rate": 2.723015547029344e-06,
"loss": 1.2468,
"step": 849
},
{
"epoch": 4.473684210526316,
"grad_norm": 0.1415673262181535,
"learning_rate": 2.669966484501716e-06,
"loss": 1.2245,
"step": 850
},
{
"epoch": 4.478947368421053,
"grad_norm": 0.1320404723499411,
"learning_rate": 2.6174214160215704e-06,
"loss": 1.2352,
"step": 851
},
{
"epoch": 4.484210526315789,
"grad_norm": 0.12633771710003897,
"learning_rate": 2.5653810510016454e-06,
"loss": 1.2339,
"step": 852
},
{
"epoch": 4.489473684210527,
"grad_norm": 0.12316620532344269,
"learning_rate": 2.5138460920406884e-06,
"loss": 1.2317,
"step": 853
},
{
"epoch": 4.494736842105263,
"grad_norm": 0.13602160694846396,
"learning_rate": 2.462817234913919e-06,
"loss": 1.2273,
"step": 854
},
{
"epoch": 4.5,
"grad_norm": 0.1415982613618782,
"learning_rate": 2.4122951685636674e-06,
"loss": 1.2243,
"step": 855
},
{
"epoch": 4.505263157894737,
"grad_norm": 0.3127501030193754,
"learning_rate": 2.3622805750900567e-06,
"loss": 1.2222,
"step": 856
},
{
"epoch": 4.510526315789473,
"grad_norm": 0.13107864455151064,
"learning_rate": 2.3127741297418283e-06,
"loss": 1.2366,
"step": 857
},
{
"epoch": 4.515789473684211,
"grad_norm": 0.13657403397982118,
"learning_rate": 2.2637765009071576e-06,
"loss": 1.2337,
"step": 858
},
{
"epoch": 4.521052631578947,
"grad_norm": 0.1302788270408855,
"learning_rate": 2.215288350104694e-06,
"loss": 1.2253,
"step": 859
},
{
"epoch": 4.526315789473684,
"grad_norm": 0.12664051031696197,
"learning_rate": 2.1673103319746146e-06,
"loss": 1.225,
"step": 860
},
{
"epoch": 4.531578947368421,
"grad_norm": 0.14352455601262662,
"learning_rate": 2.1198430942697625e-06,
"loss": 1.2251,
"step": 861
},
{
"epoch": 4.536842105263158,
"grad_norm": 0.13649018750914618,
"learning_rate": 2.0728872778469224e-06,
"loss": 1.2407,
"step": 862
},
{
"epoch": 4.542105263157895,
"grad_norm": 0.20719895993192947,
"learning_rate": 2.026443516658163e-06,
"loss": 1.2272,
"step": 863
},
{
"epoch": 4.5473684210526315,
"grad_norm": 0.13901759037964037,
"learning_rate": 1.9805124377422834e-06,
"loss": 1.2368,
"step": 864
},
{
"epoch": 4.552631578947368,
"grad_norm": 0.1307517829866866,
"learning_rate": 1.93509466121633e-06,
"loss": 1.2318,
"step": 865
},
{
"epoch": 4.557894736842105,
"grad_norm": 0.12095060465165465,
"learning_rate": 1.8901908002672442e-06,
"loss": 1.2359,
"step": 866
},
{
"epoch": 4.563157894736842,
"grad_norm": 0.11915812322895941,
"learning_rate": 1.8458014611435705e-06,
"loss": 1.2426,
"step": 867
},
{
"epoch": 4.568421052631579,
"grad_norm": 0.1240067541225263,
"learning_rate": 1.80192724314729e-06,
"loss": 1.2163,
"step": 868
},
{
"epoch": 4.573684210526316,
"grad_norm": 0.12397138277266245,
"learning_rate": 1.7585687386256944e-06,
"loss": 1.2428,
"step": 869
},
{
"epoch": 4.578947368421053,
"grad_norm": 0.13123863782173215,
"learning_rate": 1.7157265329634354e-06,
"loss": 1.2413,
"step": 870
},
{
"epoch": 4.58421052631579,
"grad_norm": 0.13114076151140514,
"learning_rate": 1.6734012045745762e-06,
"loss": 1.2255,
"step": 871
},
{
"epoch": 4.589473684210526,
"grad_norm": 0.11880095160971275,
"learning_rate": 1.6315933248948068e-06,
"loss": 1.2325,
"step": 872
},
{
"epoch": 4.594736842105263,
"grad_norm": 0.133304713863376,
"learning_rate": 1.5903034583737343e-06,
"loss": 1.2406,
"step": 873
},
{
"epoch": 4.6,
"grad_norm": 0.12445121300617833,
"learning_rate": 1.5495321624672443e-06,
"loss": 1.2323,
"step": 874
},
{
"epoch": 4.605263157894737,
"grad_norm": 0.11989093911414492,
"learning_rate": 1.5092799876299835e-06,
"loss": 1.2152,
"step": 875
},
{
"epoch": 4.610526315789474,
"grad_norm": 0.12318779958969978,
"learning_rate": 1.4695474773079287e-06,
"loss": 1.2274,
"step": 876
},
{
"epoch": 4.61578947368421,
"grad_norm": 0.11755163812164948,
"learning_rate": 1.4303351679310473e-06,
"loss": 1.2323,
"step": 877
},
{
"epoch": 4.621052631578947,
"grad_norm": 0.1271239652597123,
"learning_rate": 1.3916435889060575e-06,
"loss": 1.2281,
"step": 878
},
{
"epoch": 4.626315789473685,
"grad_norm": 0.12826240424195157,
"learning_rate": 1.353473262609275e-06,
"loss": 1.2273,
"step": 879
},
{
"epoch": 4.631578947368421,
"grad_norm": 0.12460164768857226,
"learning_rate": 1.3158247043795735e-06,
"loss": 1.2264,
"step": 880
},
{
"epoch": 4.636842105263158,
"grad_norm": 0.11779769435490604,
"learning_rate": 1.278698422511413e-06,
"loss": 1.2243,
"step": 881
},
{
"epoch": 4.6421052631578945,
"grad_norm": 0.11403097697307746,
"learning_rate": 1.242094918247978e-06,
"loss": 1.2283,
"step": 882
},
{
"epoch": 4.647368421052631,
"grad_norm": 0.12118016084867007,
"learning_rate": 1.2060146857744282e-06,
"loss": 1.2392,
"step": 883
},
{
"epoch": 4.652631578947369,
"grad_norm": 0.12319740930061163,
"learning_rate": 1.1704582122112008e-06,
"loss": 1.2088,
"step": 884
},
{
"epoch": 4.657894736842105,
"grad_norm": 0.11386564708274247,
"learning_rate": 1.1354259776074472e-06,
"loss": 1.233,
"step": 885
},
{
"epoch": 4.663157894736842,
"grad_norm": 0.11374999316034942,
"learning_rate": 1.1009184549345632e-06,
"loss": 1.2386,
"step": 886
},
{
"epoch": 4.668421052631579,
"grad_norm": 0.12522042587937965,
"learning_rate": 1.0669361100797704e-06,
"loss": 1.2418,
"step": 887
},
{
"epoch": 4.673684210526316,
"grad_norm": 0.11429258921626788,
"learning_rate": 1.0334794018398652e-06,
"loss": 1.2178,
"step": 888
},
{
"epoch": 4.678947368421053,
"grad_norm": 0.34812757148076545,
"learning_rate": 1.0005487819149917e-06,
"loss": 1.2272,
"step": 889
},
{
"epoch": 4.684210526315789,
"grad_norm": 0.1182967297844485,
"learning_rate": 9.681446949025752e-07,
"loss": 1.2191,
"step": 890
},
{
"epoch": 4.689473684210526,
"grad_norm": 0.1272033760667648,
"learning_rate": 9.362675782912923e-07,
"loss": 1.2356,
"step": 891
},
{
"epoch": 4.6947368421052635,
"grad_norm": 0.12672455306432165,
"learning_rate": 9.049178624551635e-07,
"loss": 1.2285,
"step": 892
},
{
"epoch": 4.7,
"grad_norm": 0.3617879606840202,
"learning_rate": 8.740959706477725e-07,
"loss": 1.2656,
"step": 893
},
{
"epoch": 4.705263157894737,
"grad_norm": 0.10997692184574041,
"learning_rate": 8.438023189965272e-07,
"loss": 1.2358,
"step": 894
},
{
"epoch": 4.7105263157894735,
"grad_norm": 0.12136967224479166,
"learning_rate": 8.140373164970428e-07,
"loss": 1.2146,
"step": 895
},
{
"epoch": 4.715789473684211,
"grad_norm": 0.2009841140710602,
"learning_rate": 7.848013650076258e-07,
"loss": 1.2284,
"step": 896
},
{
"epoch": 4.721052631578948,
"grad_norm": 0.11466884057407387,
"learning_rate": 7.560948592438521e-07,
"loss": 1.241,
"step": 897
},
{
"epoch": 4.726315789473684,
"grad_norm": 0.11496880440793267,
"learning_rate": 7.279181867732199e-07,
"loss": 1.2151,
"step": 898
},
{
"epoch": 4.731578947368421,
"grad_norm": 0.12172797181082162,
"learning_rate": 7.002717280099403e-07,
"loss": 1.2227,
"step": 899
},
{
"epoch": 4.7368421052631575,
"grad_norm": 0.12443319588453902,
"learning_rate": 6.731558562097995e-07,
"loss": 1.2329,
"step": 900
},
{
"epoch": 4.742105263157895,
"grad_norm": 0.12280105376114048,
"learning_rate": 6.465709374650964e-07,
"loss": 1.2343,
"step": 901
},
{
"epoch": 4.747368421052632,
"grad_norm": 0.11762303018802715,
"learning_rate": 6.205173306997125e-07,
"loss": 1.2267,
"step": 902
},
{
"epoch": 4.752631578947368,
"grad_norm": 0.11816128091190285,
"learning_rate": 5.949953876642855e-07,
"loss": 1.2293,
"step": 903
},
{
"epoch": 4.757894736842105,
"grad_norm": 0.1156560558591028,
"learning_rate": 5.700054529314347e-07,
"loss": 1.2315,
"step": 904
},
{
"epoch": 4.7631578947368425,
"grad_norm": 0.11137701754866611,
"learning_rate": 5.455478638911071e-07,
"loss": 1.2394,
"step": 905
},
{
"epoch": 4.768421052631579,
"grad_norm": 0.11181750038905715,
"learning_rate": 5.216229507460435e-07,
"loss": 1.2208,
"step": 906
},
{
"epoch": 4.773684210526316,
"grad_norm": 0.12036980268449626,
"learning_rate": 4.982310365073107e-07,
"loss": 1.2235,
"step": 907
},
{
"epoch": 4.778947368421052,
"grad_norm": 0.12359942605818125,
"learning_rate": 4.75372436989936e-07,
"loss": 1.2308,
"step": 908
},
{
"epoch": 4.784210526315789,
"grad_norm": 0.13220645490519645,
"learning_rate": 4.530474608086355e-07,
"loss": 1.214,
"step": 909
},
{
"epoch": 4.7894736842105265,
"grad_norm": 0.12206816510347139,
"learning_rate": 4.3125640937368373e-07,
"loss": 1.2194,
"step": 910
},
{
"epoch": 4.794736842105263,
"grad_norm": 0.11617994515280962,
"learning_rate": 4.0999957688679706e-07,
"loss": 1.2241,
"step": 911
},
{
"epoch": 4.8,
"grad_norm": 0.1148058679734953,
"learning_rate": 3.8927725033718553e-07,
"loss": 1.2223,
"step": 912
},
{
"epoch": 4.8052631578947365,
"grad_norm": 0.11823614340464102,
"learning_rate": 3.690897094976942e-07,
"loss": 1.2238,
"step": 913
},
{
"epoch": 4.810526315789474,
"grad_norm": 0.11790591732140702,
"learning_rate": 3.4943722692099224e-07,
"loss": 1.2153,
"step": 914
},
{
"epoch": 4.815789473684211,
"grad_norm": 0.11877977952867706,
"learning_rate": 3.3032006793590977e-07,
"loss": 1.2334,
"step": 915
},
{
"epoch": 4.821052631578947,
"grad_norm": 0.12246828468344964,
"learning_rate": 3.117384906438581e-07,
"loss": 1.2386,
"step": 916
},
{
"epoch": 4.826315789473684,
"grad_norm": 0.10958575864964563,
"learning_rate": 2.936927459153438e-07,
"loss": 1.2392,
"step": 917
},
{
"epoch": 4.831578947368421,
"grad_norm": 0.11159223936915229,
"learning_rate": 2.761830773865759e-07,
"loss": 1.225,
"step": 918
},
{
"epoch": 4.836842105263158,
"grad_norm": 0.11067027350647266,
"learning_rate": 2.5920972145618394e-07,
"loss": 1.2182,
"step": 919
},
{
"epoch": 4.842105263157895,
"grad_norm": 0.11845597460367807,
"learning_rate": 2.4277290728202063e-07,
"loss": 1.2303,
"step": 920
},
{
"epoch": 4.847368421052631,
"grad_norm": 0.11321338292881286,
"learning_rate": 2.2687285677807536e-07,
"loss": 1.2286,
"step": 921
},
{
"epoch": 4.852631578947369,
"grad_norm": 0.10918511827532087,
"learning_rate": 2.1150978461146332e-07,
"loss": 1.2303,
"step": 922
},
{
"epoch": 4.8578947368421055,
"grad_norm": 0.11331613848290951,
"learning_rate": 1.9668389819954338e-07,
"loss": 1.2238,
"step": 923
},
{
"epoch": 4.863157894736842,
"grad_norm": 0.11227809077874316,
"learning_rate": 1.8239539770711133e-07,
"loss": 1.229,
"step": 924
},
{
"epoch": 4.868421052631579,
"grad_norm": 0.11188390803054302,
"learning_rate": 1.6864447604370004e-07,
"loss": 1.2315,
"step": 925
},
{
"epoch": 4.873684210526315,
"grad_norm": 0.1126954574845899,
"learning_rate": 1.5543131886096352e-07,
"loss": 1.2281,
"step": 926
},
{
"epoch": 4.878947368421053,
"grad_norm": 0.1154664007961282,
"learning_rate": 1.427561045501902e-07,
"loss": 1.2372,
"step": 927
},
{
"epoch": 4.88421052631579,
"grad_norm": 0.11176844779105831,
"learning_rate": 1.3061900423986917e-07,
"loss": 1.2268,
"step": 928
},
{
"epoch": 4.889473684210526,
"grad_norm": 0.11214271981901136,
"learning_rate": 1.1902018179340779e-07,
"loss": 1.2211,
"step": 929
},
{
"epoch": 4.894736842105263,
"grad_norm": 0.11806437367689042,
"learning_rate": 1.0795979380690657e-07,
"loss": 1.2232,
"step": 930
},
{
"epoch": 4.9,
"grad_norm": 0.12131946872074126,
"learning_rate": 9.74379896070321e-08,
"loss": 1.2392,
"step": 931
},
{
"epoch": 4.905263157894737,
"grad_norm": 0.11758661501722971,
"learning_rate": 8.745491124901861e-08,
"loss": 1.2215,
"step": 932
},
{
"epoch": 4.910526315789474,
"grad_norm": 0.10980377112088192,
"learning_rate": 7.80106935147451e-08,
"loss": 1.2412,
"step": 933
},
{
"epoch": 4.91578947368421,
"grad_norm": 0.11037951117364361,
"learning_rate": 6.910546391092343e-08,
"loss": 1.2198,
"step": 934
},
{
"epoch": 4.921052631578947,
"grad_norm": 0.11711041285423687,
"learning_rate": 6.073934266735303e-08,
"loss": 1.2256,
"step": 935
},
{
"epoch": 4.926315789473684,
"grad_norm": 0.11213122469542446,
"learning_rate": 5.291244273531782e-08,
"loss": 1.2389,
"step": 936
},
{
"epoch": 4.931578947368421,
"grad_norm": 0.11158223482551854,
"learning_rate": 4.562486978606728e-08,
"loss": 1.2358,
"step": 937
},
{
"epoch": 4.936842105263158,
"grad_norm": 0.10743288484662021,
"learning_rate": 3.887672220936445e-08,
"loss": 1.2142,
"step": 938
},
{
"epoch": 4.942105263157895,
"grad_norm": 0.11480044753648233,
"learning_rate": 3.266809111218017e-08,
"loss": 1.2304,
"step": 939
},
{
"epoch": 4.947368421052632,
"grad_norm": 0.12600905075056,
"learning_rate": 2.699906031745414e-08,
"loss": 1.2348,
"step": 940
},
{
"epoch": 4.9526315789473685,
"grad_norm": 0.10693815172707843,
"learning_rate": 2.1869706362958044e-08,
"loss": 1.2329,
"step": 941
},
{
"epoch": 4.957894736842105,
"grad_norm": 0.11368842959943799,
"learning_rate": 1.7280098500283005e-08,
"loss": 1.2461,
"step": 942
},
{
"epoch": 4.963157894736842,
"grad_norm": 0.11074973929231093,
"learning_rate": 1.3230298693871491e-08,
"loss": 1.2364,
"step": 943
},
{
"epoch": 4.968421052631579,
"grad_norm": 0.11094251507004392,
"learning_rate": 9.720361620217943e-09,
"loss": 1.2314,
"step": 944
},
{
"epoch": 4.973684210526316,
"grad_norm": 0.11419040432886776,
"learning_rate": 6.750334667091629e-09,
"loss": 1.23,
"step": 945
},
{
"epoch": 4.978947368421053,
"grad_norm": 0.2986955255592173,
"learning_rate": 4.320257932928229e-09,
"loss": 1.2347,
"step": 946
},
{
"epoch": 4.984210526315789,
"grad_norm": 0.10907490264263059,
"learning_rate": 2.4301642262791748e-09,
"loss": 1.2327,
"step": 947
},
{
"epoch": 4.989473684210527,
"grad_norm": 0.11799915395997997,
"learning_rate": 1.0800790653675564e-09,
"loss": 1.2269,
"step": 948
},
{
"epoch": 4.994736842105263,
"grad_norm": 0.11484395305342408,
"learning_rate": 2.700206777328518e-10,
"loss": 1.2454,
"step": 949
},
{
"epoch": 5.0,
"grad_norm": 0.11735650459931829,
"learning_rate": 0.0,
"loss": 1.2331,
"step": 950
},
{
"epoch": 5.0,
"step": 950,
"total_flos": 1.59373351452672e+16,
"train_loss": 1.3301890049482648,
"train_runtime": 16504.5508,
"train_samples_per_second": 29.374,
"train_steps_per_second": 0.058
}
],
"logging_steps": 1,
"max_steps": 950,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.59373351452672e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}