{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005263157894736842, "grad_norm": 7.792475124460211, "learning_rate": 8.421052631578948e-07, "loss": 1.844, "step": 1 }, { "epoch": 0.010526315789473684, "grad_norm": 7.77107258632735, "learning_rate": 1.6842105263157895e-06, "loss": 1.8342, "step": 2 }, { "epoch": 0.015789473684210527, "grad_norm": 7.734735147966475, "learning_rate": 2.5263157894736844e-06, "loss": 1.8324, "step": 3 }, { "epoch": 0.021052631578947368, "grad_norm": 7.248228363662717, "learning_rate": 3.368421052631579e-06, "loss": 1.816, "step": 4 }, { "epoch": 0.02631578947368421, "grad_norm": 5.778539169912221, "learning_rate": 4.210526315789474e-06, "loss": 1.7849, "step": 5 }, { "epoch": 0.031578947368421054, "grad_norm": 3.239108511490633, "learning_rate": 5.052631578947369e-06, "loss": 1.7388, "step": 6 }, { "epoch": 0.03684210526315789, "grad_norm": 2.659157940796987, "learning_rate": 5.8947368421052634e-06, "loss": 1.7195, "step": 7 }, { "epoch": 0.042105263157894736, "grad_norm": 5.788438535248494, "learning_rate": 6.736842105263158e-06, "loss": 1.716, "step": 8 }, { "epoch": 0.04736842105263158, "grad_norm": 5.78966752434045, "learning_rate": 7.578947368421054e-06, "loss": 1.7132, "step": 9 }, { "epoch": 0.05263157894736842, "grad_norm": 5.662263812934936, "learning_rate": 8.421052631578948e-06, "loss": 1.7155, "step": 10 }, { "epoch": 0.05789473684210526, "grad_norm": 4.3906745364224955, "learning_rate": 9.263157894736842e-06, "loss": 1.6617, "step": 11 }, { "epoch": 0.06315789473684211, "grad_norm": 3.7326640381933247, "learning_rate": 1.0105263157894738e-05, "loss": 1.6634, "step": 12 }, { "epoch": 0.06842105263157895, "grad_norm": 2.669371108068819, "learning_rate": 1.0947368421052633e-05, "loss": 1.6463, "step": 13 }, { "epoch": 0.07368421052631578, "grad_norm": 2.080061860580032, "learning_rate": 1.1789473684210527e-05, "loss": 1.5925, "step": 14 }, { "epoch": 0.07894736842105263, "grad_norm": 2.245619823013294, "learning_rate": 1.263157894736842e-05, "loss": 1.5797, "step": 15 }, { "epoch": 0.08421052631578947, "grad_norm": 2.0053863294690806, "learning_rate": 1.3473684210526316e-05, "loss": 1.5613, "step": 16 }, { "epoch": 0.08947368421052632, "grad_norm": 1.9625639332300135, "learning_rate": 1.4315789473684212e-05, "loss": 1.5638, "step": 17 }, { "epoch": 0.09473684210526316, "grad_norm": 1.5893875551469467, "learning_rate": 1.5157894736842107e-05, "loss": 1.5522, "step": 18 }, { "epoch": 0.1, "grad_norm": 1.5906140921890974, "learning_rate": 1.6000000000000003e-05, "loss": 1.5209, "step": 19 }, { "epoch": 0.10526315789473684, "grad_norm": 1.559169647702599, "learning_rate": 1.6842105263157896e-05, "loss": 1.4911, "step": 20 }, { "epoch": 0.11052631578947368, "grad_norm": 1.2982321447862488, "learning_rate": 1.768421052631579e-05, "loss": 1.4991, "step": 21 }, { "epoch": 0.11578947368421053, "grad_norm": 1.3143336518371307, "learning_rate": 1.8526315789473684e-05, "loss": 1.5012, "step": 22 }, { "epoch": 0.12105263157894737, "grad_norm": 1.279000313318411, "learning_rate": 1.936842105263158e-05, "loss": 1.5011, "step": 23 }, { "epoch": 0.12631578947368421, "grad_norm": 0.922398830661329, "learning_rate": 2.0210526315789475e-05, "loss": 1.4624, "step": 24 }, { "epoch": 0.13157894736842105, "grad_norm": 1.3274580990016782, "learning_rate": 2.105263157894737e-05, "loss": 1.4645, "step": 25 }, { "epoch": 0.1368421052631579, "grad_norm": 1.1341340679878056, "learning_rate": 2.1894736842105266e-05, "loss": 1.4715, "step": 26 }, { "epoch": 0.14210526315789473, "grad_norm": 1.2628910185228979, "learning_rate": 2.273684210526316e-05, "loss": 1.4584, "step": 27 }, { "epoch": 0.14736842105263157, "grad_norm": 1.57433223809749, "learning_rate": 2.3578947368421054e-05, "loss": 1.448, "step": 28 }, { "epoch": 0.15263157894736842, "grad_norm": 0.8438420059614518, "learning_rate": 2.442105263157895e-05, "loss": 1.4442, "step": 29 }, { "epoch": 0.15789473684210525, "grad_norm": 1.280060736418443, "learning_rate": 2.526315789473684e-05, "loss": 1.4577, "step": 30 }, { "epoch": 0.1631578947368421, "grad_norm": 1.8388712899282178, "learning_rate": 2.610526315789474e-05, "loss": 1.4354, "step": 31 }, { "epoch": 0.16842105263157894, "grad_norm": 1.1283525934214154, "learning_rate": 2.6947368421052632e-05, "loss": 1.4458, "step": 32 }, { "epoch": 0.1736842105263158, "grad_norm": 1.559904631956879, "learning_rate": 2.778947368421053e-05, "loss": 1.4337, "step": 33 }, { "epoch": 0.17894736842105263, "grad_norm": 1.433428530147804, "learning_rate": 2.8631578947368423e-05, "loss": 1.4311, "step": 34 }, { "epoch": 0.18421052631578946, "grad_norm": 1.4475127262626666, "learning_rate": 2.9473684210526317e-05, "loss": 1.4296, "step": 35 }, { "epoch": 0.18947368421052632, "grad_norm": 1.6258635588736965, "learning_rate": 3.0315789473684214e-05, "loss": 1.403, "step": 36 }, { "epoch": 0.19473684210526315, "grad_norm": 0.9140879296838869, "learning_rate": 3.1157894736842105e-05, "loss": 1.4149, "step": 37 }, { "epoch": 0.2, "grad_norm": 1.355462077600805, "learning_rate": 3.2000000000000005e-05, "loss": 1.4241, "step": 38 }, { "epoch": 0.20526315789473684, "grad_norm": 2.025270623377536, "learning_rate": 3.28421052631579e-05, "loss": 1.4269, "step": 39 }, { "epoch": 0.21052631578947367, "grad_norm": 1.5911005582582893, "learning_rate": 3.368421052631579e-05, "loss": 1.4261, "step": 40 }, { "epoch": 0.21578947368421053, "grad_norm": 1.7736374990816877, "learning_rate": 3.452631578947369e-05, "loss": 1.4248, "step": 41 }, { "epoch": 0.22105263157894736, "grad_norm": 2.020103172778917, "learning_rate": 3.536842105263158e-05, "loss": 1.4019, "step": 42 }, { "epoch": 0.22631578947368422, "grad_norm": 2.1032904325246693, "learning_rate": 3.621052631578948e-05, "loss": 1.4138, "step": 43 }, { "epoch": 0.23157894736842105, "grad_norm": 2.034839240989353, "learning_rate": 3.705263157894737e-05, "loss": 1.4099, "step": 44 }, { "epoch": 0.23684210526315788, "grad_norm": 1.5310564952941104, "learning_rate": 3.789473684210526e-05, "loss": 1.419, "step": 45 }, { "epoch": 0.24210526315789474, "grad_norm": 2.3582192444588594, "learning_rate": 3.873684210526316e-05, "loss": 1.3989, "step": 46 }, { "epoch": 0.24736842105263157, "grad_norm": 1.2404229618115798, "learning_rate": 3.9578947368421056e-05, "loss": 1.4034, "step": 47 }, { "epoch": 0.25263157894736843, "grad_norm": 2.631335015977353, "learning_rate": 4.042105263157895e-05, "loss": 1.4067, "step": 48 }, { "epoch": 0.2578947368421053, "grad_norm": 1.7741169104229972, "learning_rate": 4.126315789473685e-05, "loss": 1.3842, "step": 49 }, { "epoch": 0.2631578947368421, "grad_norm": 2.7958288897467813, "learning_rate": 4.210526315789474e-05, "loss": 1.4165, "step": 50 }, { "epoch": 0.26842105263157895, "grad_norm": 2.106018978323054, "learning_rate": 4.294736842105264e-05, "loss": 1.408, "step": 51 }, { "epoch": 0.2736842105263158, "grad_norm": 2.1854136752650284, "learning_rate": 4.378947368421053e-05, "loss": 1.4039, "step": 52 }, { "epoch": 0.2789473684210526, "grad_norm": 2.1320541034792897, "learning_rate": 4.463157894736842e-05, "loss": 1.3792, "step": 53 }, { "epoch": 0.28421052631578947, "grad_norm": 2.1929241850886183, "learning_rate": 4.547368421052632e-05, "loss": 1.405, "step": 54 }, { "epoch": 0.2894736842105263, "grad_norm": 2.4083834163686406, "learning_rate": 4.6315789473684214e-05, "loss": 1.397, "step": 55 }, { "epoch": 0.29473684210526313, "grad_norm": 1.8900388617787558, "learning_rate": 4.715789473684211e-05, "loss": 1.398, "step": 56 }, { "epoch": 0.3, "grad_norm": 1.8869758461876107, "learning_rate": 4.8e-05, "loss": 1.399, "step": 57 }, { "epoch": 0.30526315789473685, "grad_norm": 1.9387939885602292, "learning_rate": 4.88421052631579e-05, "loss": 1.4029, "step": 58 }, { "epoch": 0.3105263157894737, "grad_norm": 2.8691231917123, "learning_rate": 4.9684210526315796e-05, "loss": 1.3774, "step": 59 }, { "epoch": 0.3157894736842105, "grad_norm": 2.023114295041414, "learning_rate": 5.052631578947368e-05, "loss": 1.3925, "step": 60 }, { "epoch": 0.32105263157894737, "grad_norm": 1.8266981690923911, "learning_rate": 5.136842105263158e-05, "loss": 1.3752, "step": 61 }, { "epoch": 0.3263157894736842, "grad_norm": 2.749570229085121, "learning_rate": 5.221052631578948e-05, "loss": 1.3846, "step": 62 }, { "epoch": 0.33157894736842103, "grad_norm": 1.577733284649954, "learning_rate": 5.305263157894737e-05, "loss": 1.3983, "step": 63 }, { "epoch": 0.3368421052631579, "grad_norm": 3.4308320005068897, "learning_rate": 5.3894736842105265e-05, "loss": 1.3898, "step": 64 }, { "epoch": 0.34210526315789475, "grad_norm": 2.7293556039435583, "learning_rate": 5.4736842105263165e-05, "loss": 1.3801, "step": 65 }, { "epoch": 0.3473684210526316, "grad_norm": 2.7986318090963036, "learning_rate": 5.557894736842106e-05, "loss": 1.3929, "step": 66 }, { "epoch": 0.3526315789473684, "grad_norm": 2.5730126104338407, "learning_rate": 5.642105263157895e-05, "loss": 1.382, "step": 67 }, { "epoch": 0.35789473684210527, "grad_norm": 2.467033189056739, "learning_rate": 5.726315789473685e-05, "loss": 1.3738, "step": 68 }, { "epoch": 0.3631578947368421, "grad_norm": 1.5266025457633567, "learning_rate": 5.810526315789475e-05, "loss": 1.394, "step": 69 }, { "epoch": 0.3684210526315789, "grad_norm": 2.1446942030427567, "learning_rate": 5.8947368421052634e-05, "loss": 1.3845, "step": 70 }, { "epoch": 0.3736842105263158, "grad_norm": 2.3417890559923986, "learning_rate": 5.978947368421053e-05, "loss": 1.3841, "step": 71 }, { "epoch": 0.37894736842105264, "grad_norm": 1.9604854202915063, "learning_rate": 6.063157894736843e-05, "loss": 1.3834, "step": 72 }, { "epoch": 0.38421052631578945, "grad_norm": 2.4507109733612578, "learning_rate": 6.147368421052632e-05, "loss": 1.4071, "step": 73 }, { "epoch": 0.3894736842105263, "grad_norm": 3.5995720583445063, "learning_rate": 6.231578947368421e-05, "loss": 1.3704, "step": 74 }, { "epoch": 0.39473684210526316, "grad_norm": 2.0182422302151797, "learning_rate": 6.315789473684212e-05, "loss": 1.379, "step": 75 }, { "epoch": 0.4, "grad_norm": 4.1134652051924, "learning_rate": 6.400000000000001e-05, "loss": 1.3788, "step": 76 }, { "epoch": 0.4052631578947368, "grad_norm": 2.929511327190635, "learning_rate": 6.484210526315789e-05, "loss": 1.3914, "step": 77 }, { "epoch": 0.4105263157894737, "grad_norm": 3.5649183111031406, "learning_rate": 6.56842105263158e-05, "loss": 1.3715, "step": 78 }, { "epoch": 0.41578947368421054, "grad_norm": 3.365859410878661, "learning_rate": 6.652631578947369e-05, "loss": 1.3986, "step": 79 }, { "epoch": 0.42105263157894735, "grad_norm": 3.304005403851428, "learning_rate": 6.736842105263159e-05, "loss": 1.3783, "step": 80 }, { "epoch": 0.4263157894736842, "grad_norm": 2.894847638309193, "learning_rate": 6.821052631578948e-05, "loss": 1.3802, "step": 81 }, { "epoch": 0.43157894736842106, "grad_norm": 2.698808992917365, "learning_rate": 6.905263157894737e-05, "loss": 1.3761, "step": 82 }, { "epoch": 0.4368421052631579, "grad_norm": 2.163408526778829, "learning_rate": 6.989473684210527e-05, "loss": 1.3911, "step": 83 }, { "epoch": 0.4421052631578947, "grad_norm": 3.2099232827707014, "learning_rate": 7.073684210526316e-05, "loss": 1.3926, "step": 84 }, { "epoch": 0.4473684210526316, "grad_norm": 2.475621296000252, "learning_rate": 7.157894736842105e-05, "loss": 1.3932, "step": 85 }, { "epoch": 0.45263157894736844, "grad_norm": 2.9203370802467936, "learning_rate": 7.242105263157896e-05, "loss": 1.3822, "step": 86 }, { "epoch": 0.45789473684210524, "grad_norm": 2.3862423279450393, "learning_rate": 7.326315789473684e-05, "loss": 1.3721, "step": 87 }, { "epoch": 0.4631578947368421, "grad_norm": 3.275399501836966, "learning_rate": 7.410526315789474e-05, "loss": 1.4002, "step": 88 }, { "epoch": 0.46842105263157896, "grad_norm": 2.5587038178412533, "learning_rate": 7.494736842105264e-05, "loss": 1.3812, "step": 89 }, { "epoch": 0.47368421052631576, "grad_norm": 3.0292582788342375, "learning_rate": 7.578947368421052e-05, "loss": 1.3679, "step": 90 }, { "epoch": 0.4789473684210526, "grad_norm": 2.4725567771769366, "learning_rate": 7.663157894736843e-05, "loss": 1.3825, "step": 91 }, { "epoch": 0.4842105263157895, "grad_norm": 2.983972110527992, "learning_rate": 7.747368421052633e-05, "loss": 1.3727, "step": 92 }, { "epoch": 0.48947368421052634, "grad_norm": 2.498982942091222, "learning_rate": 7.831578947368422e-05, "loss": 1.3769, "step": 93 }, { "epoch": 0.49473684210526314, "grad_norm": 3.526109045872143, "learning_rate": 7.915789473684211e-05, "loss": 1.3788, "step": 94 }, { "epoch": 0.5, "grad_norm": 3.1483584936822284, "learning_rate": 8e-05, "loss": 1.3643, "step": 95 }, { "epoch": 0.5052631578947369, "grad_norm": 2.0080980217269517, "learning_rate": 7.999972997932227e-05, "loss": 1.3846, "step": 96 }, { "epoch": 0.5105263157894737, "grad_norm": 4.353229694894079, "learning_rate": 7.999891992093464e-05, "loss": 1.3787, "step": 97 }, { "epoch": 0.5157894736842106, "grad_norm": 2.878217785277169, "learning_rate": 7.999756983577373e-05, "loss": 1.3695, "step": 98 }, { "epoch": 0.5210526315789473, "grad_norm": 1.962122819070823, "learning_rate": 7.999567974206707e-05, "loss": 1.364, "step": 99 }, { "epoch": 0.5263157894736842, "grad_norm": 3.9822305254995887, "learning_rate": 7.999324966533291e-05, "loss": 1.3928, "step": 100 }, { "epoch": 0.531578947368421, "grad_norm": 2.4880539516369713, "learning_rate": 7.999027963837979e-05, "loss": 1.3656, "step": 101 }, { "epoch": 0.5368421052631579, "grad_norm": 4.910309768750227, "learning_rate": 7.998676970130614e-05, "loss": 1.3802, "step": 102 }, { "epoch": 0.5421052631578948, "grad_norm": 2.905093024244942, "learning_rate": 7.998271990149972e-05, "loss": 1.3731, "step": 103 }, { "epoch": 0.5473684210526316, "grad_norm": 3.8609393179697284, "learning_rate": 7.997813029363704e-05, "loss": 1.4037, "step": 104 }, { "epoch": 0.5526315789473685, "grad_norm": 3.6672144152714865, "learning_rate": 7.997300093968255e-05, "loss": 1.3739, "step": 105 }, { "epoch": 0.5578947368421052, "grad_norm": 3.0897791527691, "learning_rate": 7.996733190888783e-05, "loss": 1.3729, "step": 106 }, { "epoch": 0.5631578947368421, "grad_norm": 3.0709819435052794, "learning_rate": 7.996112327779065e-05, "loss": 1.3735, "step": 107 }, { "epoch": 0.5684210526315789, "grad_norm": 2.4110812616502053, "learning_rate": 7.995437513021393e-05, "loss": 1.3625, "step": 108 }, { "epoch": 0.5736842105263158, "grad_norm": 3.20735512285491, "learning_rate": 7.994708755726469e-05, "loss": 1.3646, "step": 109 }, { "epoch": 0.5789473684210527, "grad_norm": 3.410126968058674, "learning_rate": 7.993926065733265e-05, "loss": 1.3828, "step": 110 }, { "epoch": 0.5842105263157895, "grad_norm": 1.9981274214741556, "learning_rate": 7.993089453608908e-05, "loss": 1.3614, "step": 111 }, { "epoch": 0.5894736842105263, "grad_norm": 3.8348881514308104, "learning_rate": 7.992198930648527e-05, "loss": 1.366, "step": 112 }, { "epoch": 0.5947368421052631, "grad_norm": 3.6103553587856188, "learning_rate": 7.991254508875098e-05, "loss": 1.3797, "step": 113 }, { "epoch": 0.6, "grad_norm": 2.545996054998508, "learning_rate": 7.990256201039297e-05, "loss": 1.37, "step": 114 }, { "epoch": 0.6052631578947368, "grad_norm": 2.9285662609146597, "learning_rate": 7.98920402061931e-05, "loss": 1.3691, "step": 115 }, { "epoch": 0.6105263157894737, "grad_norm": 3.497818597459857, "learning_rate": 7.988097981820659e-05, "loss": 1.3724, "step": 116 }, { "epoch": 0.6157894736842106, "grad_norm": 1.955436154771224, "learning_rate": 7.986938099576015e-05, "loss": 1.3553, "step": 117 }, { "epoch": 0.6210526315789474, "grad_norm": 2.024911219916847, "learning_rate": 7.985724389544982e-05, "loss": 1.3736, "step": 118 }, { "epoch": 0.6263157894736842, "grad_norm": 2.726230215863742, "learning_rate": 7.984456868113905e-05, "loss": 1.3666, "step": 119 }, { "epoch": 0.631578947368421, "grad_norm": 1.8902008783219175, "learning_rate": 7.98313555239563e-05, "loss": 1.358, "step": 120 }, { "epoch": 0.6368421052631579, "grad_norm": 3.3343114876725215, "learning_rate": 7.98176046022929e-05, "loss": 1.3674, "step": 121 }, { "epoch": 0.6421052631578947, "grad_norm": 2.8760888936249716, "learning_rate": 7.980331610180046e-05, "loss": 1.3598, "step": 122 }, { "epoch": 0.6473684210526316, "grad_norm": 1.821327982649168, "learning_rate": 7.978849021538855e-05, "loss": 1.3559, "step": 123 }, { "epoch": 0.6526315789473685, "grad_norm": 2.1330475395816038, "learning_rate": 7.977312714322193e-05, "loss": 1.3529, "step": 124 }, { "epoch": 0.6578947368421053, "grad_norm": 2.6018688330295032, "learning_rate": 7.975722709271799e-05, "loss": 1.3537, "step": 125 }, { "epoch": 0.6631578947368421, "grad_norm": 2.8283770858028143, "learning_rate": 7.974079027854382e-05, "loss": 1.3591, "step": 126 }, { "epoch": 0.6684210526315789, "grad_norm": 2.0562890332720625, "learning_rate": 7.972381692261343e-05, "loss": 1.3523, "step": 127 }, { "epoch": 0.6736842105263158, "grad_norm": 2.677519447523674, "learning_rate": 7.970630725408467e-05, "loss": 1.3588, "step": 128 }, { "epoch": 0.6789473684210526, "grad_norm": 3.39520049259372, "learning_rate": 7.968826150935615e-05, "loss": 1.357, "step": 129 }, { "epoch": 0.6842105263157895, "grad_norm": 1.326798562942608, "learning_rate": 7.96696799320641e-05, "loss": 1.3547, "step": 130 }, { "epoch": 0.6894736842105263, "grad_norm": 5.713227145762471, "learning_rate": 7.965056277307902e-05, "loss": 1.405, "step": 131 }, { "epoch": 0.6947368421052632, "grad_norm": 4.708892369834835, "learning_rate": 7.963091029050231e-05, "loss": 1.4096, "step": 132 }, { "epoch": 0.7, "grad_norm": 3.526071821361426, "learning_rate": 7.961072274966282e-05, "loss": 1.3766, "step": 133 }, { "epoch": 0.7052631578947368, "grad_norm": 3.421098464992638, "learning_rate": 7.95900004231132e-05, "loss": 1.372, "step": 134 }, { "epoch": 0.7105263157894737, "grad_norm": 3.1868582792498468, "learning_rate": 7.956874359062632e-05, "loss": 1.3742, "step": 135 }, { "epoch": 0.7157894736842105, "grad_norm": 2.5369692116526354, "learning_rate": 7.954695253919138e-05, "loss": 1.38, "step": 136 }, { "epoch": 0.7210526315789474, "grad_norm": 4.029612203074803, "learning_rate": 7.952462756301007e-05, "loss": 1.3789, "step": 137 }, { "epoch": 0.7263157894736842, "grad_norm": 3.071634072769698, "learning_rate": 7.95017689634927e-05, "loss": 1.3692, "step": 138 }, { "epoch": 0.7315789473684211, "grad_norm": 3.38738695405503, "learning_rate": 7.947837704925396e-05, "loss": 1.3692, "step": 139 }, { "epoch": 0.7368421052631579, "grad_norm": 2.900109575705123, "learning_rate": 7.94544521361089e-05, "loss": 1.3851, "step": 140 }, { "epoch": 0.7421052631578947, "grad_norm": 2.7866463709429903, "learning_rate": 7.942999454706858e-05, "loss": 1.3797, "step": 141 }, { "epoch": 0.7473684210526316, "grad_norm": 2.053753694637562, "learning_rate": 7.940500461233572e-05, "loss": 1.3697, "step": 142 }, { "epoch": 0.7526315789473684, "grad_norm": 2.9697850807629464, "learning_rate": 7.93794826693003e-05, "loss": 1.349, "step": 143 }, { "epoch": 0.7578947368421053, "grad_norm": 2.370919157592963, "learning_rate": 7.935342906253492e-05, "loss": 1.3556, "step": 144 }, { "epoch": 0.7631578947368421, "grad_norm": 3.3074201358634125, "learning_rate": 7.932684414379021e-05, "loss": 1.3656, "step": 145 }, { "epoch": 0.7684210526315789, "grad_norm": 2.2648377858473605, "learning_rate": 7.929972827199006e-05, "loss": 1.3704, "step": 146 }, { "epoch": 0.7736842105263158, "grad_norm": 2.835496161570207, "learning_rate": 7.927208181322679e-05, "loss": 1.3466, "step": 147 }, { "epoch": 0.7789473684210526, "grad_norm": 2.343363991518181, "learning_rate": 7.924390514075616e-05, "loss": 1.3726, "step": 148 }, { "epoch": 0.7842105263157895, "grad_norm": 2.7560515706025455, "learning_rate": 7.921519863499239e-05, "loss": 1.3626, "step": 149 }, { "epoch": 0.7894736842105263, "grad_norm": 2.4007554609918564, "learning_rate": 7.918596268350296e-05, "loss": 1.3587, "step": 150 }, { "epoch": 0.7947368421052632, "grad_norm": 2.5965741897469896, "learning_rate": 7.915619768100348e-05, "loss": 1.3813, "step": 151 }, { "epoch": 0.8, "grad_norm": 2.0691402568946606, "learning_rate": 7.912590402935223e-05, "loss": 1.3466, "step": 152 }, { "epoch": 0.8052631578947368, "grad_norm": 2.682134570279511, "learning_rate": 7.909508213754484e-05, "loss": 1.3484, "step": 153 }, { "epoch": 0.8105263157894737, "grad_norm": 2.961951846828786, "learning_rate": 7.906373242170872e-05, "loss": 1.356, "step": 154 }, { "epoch": 0.8157894736842105, "grad_norm": 1.647563803925644, "learning_rate": 7.903185530509743e-05, "loss": 1.3314, "step": 155 }, { "epoch": 0.8210526315789474, "grad_norm": 1.7184813388266553, "learning_rate": 7.899945121808501e-05, "loss": 1.3521, "step": 156 }, { "epoch": 0.8263157894736842, "grad_norm": 2.795936841854527, "learning_rate": 7.896652059816015e-05, "loss": 1.3635, "step": 157 }, { "epoch": 0.8315789473684211, "grad_norm": 3.0399857654385034, "learning_rate": 7.893306388992023e-05, "loss": 1.3619, "step": 158 }, { "epoch": 0.8368421052631579, "grad_norm": 1.5156478386100931, "learning_rate": 7.889908154506545e-05, "loss": 1.332, "step": 159 }, { "epoch": 0.8421052631578947, "grad_norm": 2.299123403411113, "learning_rate": 7.886457402239256e-05, "loss": 1.351, "step": 160 }, { "epoch": 0.8473684210526315, "grad_norm": 2.307814996765452, "learning_rate": 7.88295417877888e-05, "loss": 1.3565, "step": 161 }, { "epoch": 0.8526315789473684, "grad_norm": 3.2987572063667643, "learning_rate": 7.879398531422558e-05, "loss": 1.3719, "step": 162 }, { "epoch": 0.8578947368421053, "grad_norm": 1.845792873065677, "learning_rate": 7.875790508175202e-05, "loss": 1.3384, "step": 163 }, { "epoch": 0.8631578947368421, "grad_norm": 2.4609311250378942, "learning_rate": 7.87213015774886e-05, "loss": 1.3633, "step": 164 }, { "epoch": 0.868421052631579, "grad_norm": 2.693256637820666, "learning_rate": 7.868417529562043e-05, "loss": 1.3639, "step": 165 }, { "epoch": 0.8736842105263158, "grad_norm": 1.5086514518899445, "learning_rate": 7.864652673739073e-05, "loss": 1.3615, "step": 166 }, { "epoch": 0.8789473684210526, "grad_norm": 3.1066465037527147, "learning_rate": 7.860835641109395e-05, "loss": 1.3507, "step": 167 }, { "epoch": 0.8842105263157894, "grad_norm": 2.327552820376472, "learning_rate": 7.856966483206897e-05, "loss": 1.3458, "step": 168 }, { "epoch": 0.8894736842105263, "grad_norm": 2.808477876459289, "learning_rate": 7.853045252269208e-05, "loss": 1.3601, "step": 169 }, { "epoch": 0.8947368421052632, "grad_norm": 2.544696040692953, "learning_rate": 7.849072001237001e-05, "loss": 1.3529, "step": 170 }, { "epoch": 0.9, "grad_norm": 2.9946344095632575, "learning_rate": 7.845046783753276e-05, "loss": 1.3612, "step": 171 }, { "epoch": 0.9052631578947369, "grad_norm": 2.1670615109125646, "learning_rate": 7.840969654162627e-05, "loss": 1.3403, "step": 172 }, { "epoch": 0.9105263157894737, "grad_norm": 2.4274632796911324, "learning_rate": 7.83684066751052e-05, "loss": 1.3492, "step": 173 }, { "epoch": 0.9157894736842105, "grad_norm": 2.2078510708150416, "learning_rate": 7.832659879542544e-05, "loss": 1.3322, "step": 174 }, { "epoch": 0.9210526315789473, "grad_norm": 2.834266147883312, "learning_rate": 7.828427346703657e-05, "loss": 1.3658, "step": 175 }, { "epoch": 0.9263157894736842, "grad_norm": 2.5098411818246156, "learning_rate": 7.824143126137431e-05, "loss": 1.3343, "step": 176 }, { "epoch": 0.9315789473684211, "grad_norm": 1.9856802860400529, "learning_rate": 7.819807275685272e-05, "loss": 1.3408, "step": 177 }, { "epoch": 0.9368421052631579, "grad_norm": 2.4068321545997717, "learning_rate": 7.815419853885644e-05, "loss": 1.3482, "step": 178 }, { "epoch": 0.9421052631578948, "grad_norm": 2.0424531148819507, "learning_rate": 7.810980919973277e-05, "loss": 1.3492, "step": 179 }, { "epoch": 0.9473684210526315, "grad_norm": 2.88855449179049, "learning_rate": 7.806490533878368e-05, "loss": 1.3409, "step": 180 }, { "epoch": 0.9526315789473684, "grad_norm": 2.2025647302444593, "learning_rate": 7.801948756225772e-05, "loss": 1.3552, "step": 181 }, { "epoch": 0.9578947368421052, "grad_norm": 1.705774295065343, "learning_rate": 7.797355648334185e-05, "loss": 1.3298, "step": 182 }, { "epoch": 0.9631578947368421, "grad_norm": 2.0591090166453907, "learning_rate": 7.792711272215308e-05, "loss": 1.3234, "step": 183 }, { "epoch": 0.968421052631579, "grad_norm": 2.543058246007598, "learning_rate": 7.788015690573025e-05, "loss": 1.3454, "step": 184 }, { "epoch": 0.9736842105263158, "grad_norm": 2.7945291673690273, "learning_rate": 7.783268966802539e-05, "loss": 1.3623, "step": 185 }, { "epoch": 0.9789473684210527, "grad_norm": 1.2623547103194999, "learning_rate": 7.778471164989532e-05, "loss": 1.3253, "step": 186 }, { "epoch": 0.9842105263157894, "grad_norm": 2.8671346135920555, "learning_rate": 7.773622349909285e-05, "loss": 1.3516, "step": 187 }, { "epoch": 0.9894736842105263, "grad_norm": 2.37666359514784, "learning_rate": 7.768722587025818e-05, "loss": 1.333, "step": 188 }, { "epoch": 0.9947368421052631, "grad_norm": 2.0642846376852697, "learning_rate": 7.763771942490995e-05, "loss": 1.3514, "step": 189 }, { "epoch": 1.0, "grad_norm": 1.7892483033365818, "learning_rate": 7.758770483143634e-05, "loss": 1.3383, "step": 190 }, { "epoch": 1.0052631578947369, "grad_norm": 2.3383926476340875, "learning_rate": 7.753718276508609e-05, "loss": 1.3296, "step": 191 }, { "epoch": 1.0105263157894737, "grad_norm": 2.4843645129343686, "learning_rate": 7.748615390795932e-05, "loss": 1.3271, "step": 192 }, { "epoch": 1.0157894736842106, "grad_norm": 2.418044856395934, "learning_rate": 7.743461894899837e-05, "loss": 1.3272, "step": 193 }, { "epoch": 1.0210526315789474, "grad_norm": 1.5738686856678197, "learning_rate": 7.738257858397844e-05, "loss": 1.3345, "step": 194 }, { "epoch": 1.0263157894736843, "grad_norm": 2.648547362628862, "learning_rate": 7.733003351549829e-05, "loss": 1.3334, "step": 195 }, { "epoch": 1.0315789473684212, "grad_norm": 1.882461821114559, "learning_rate": 7.727698445297066e-05, "loss": 1.3129, "step": 196 }, { "epoch": 1.0368421052631578, "grad_norm": 2.4800191556167586, "learning_rate": 7.722343211261274e-05, "loss": 1.3254, "step": 197 }, { "epoch": 1.0421052631578946, "grad_norm": 2.23843270259424, "learning_rate": 7.71693772174365e-05, "loss": 1.3273, "step": 198 }, { "epoch": 1.0473684210526315, "grad_norm": 1.9793326670930025, "learning_rate": 7.71148204972389e-05, "loss": 1.3286, "step": 199 }, { "epoch": 1.0526315789473684, "grad_norm": 2.1730809011656502, "learning_rate": 7.705976268859207e-05, "loss": 1.3245, "step": 200 }, { "epoch": 1.0578947368421052, "grad_norm": 2.185445170389663, "learning_rate": 7.700420453483336e-05, "loss": 1.3222, "step": 201 }, { "epoch": 1.063157894736842, "grad_norm": 2.3909717751217903, "learning_rate": 7.694814678605528e-05, "loss": 1.325, "step": 202 }, { "epoch": 1.068421052631579, "grad_norm": 1.923075293942803, "learning_rate": 7.68915901990954e-05, "loss": 1.3107, "step": 203 }, { "epoch": 1.0736842105263158, "grad_norm": 1.7281637350526677, "learning_rate": 7.683453553752611e-05, "loss": 1.3252, "step": 204 }, { "epoch": 1.0789473684210527, "grad_norm": 3.2565617242431055, "learning_rate": 7.677698357164431e-05, "loss": 1.3269, "step": 205 }, { "epoch": 1.0842105263157895, "grad_norm": 1.2312572256380077, "learning_rate": 7.671893507846109e-05, "loss": 1.3208, "step": 206 }, { "epoch": 1.0894736842105264, "grad_norm": 2.928938984187919, "learning_rate": 7.66603908416911e-05, "loss": 1.3313, "step": 207 }, { "epoch": 1.0947368421052632, "grad_norm": 2.9307098888263026, "learning_rate": 7.660135165174205e-05, "loss": 1.3455, "step": 208 }, { "epoch": 1.1, "grad_norm": 1.593997332291012, "learning_rate": 7.654181830570404e-05, "loss": 1.3103, "step": 209 }, { "epoch": 1.1052631578947367, "grad_norm": 2.6269588682109952, "learning_rate": 7.648179160733883e-05, "loss": 1.3167, "step": 210 }, { "epoch": 1.1105263157894736, "grad_norm": 2.4790660783022043, "learning_rate": 7.642127236706887e-05, "loss": 1.3164, "step": 211 }, { "epoch": 1.1157894736842104, "grad_norm": 2.1512908725199544, "learning_rate": 7.636026140196651e-05, "loss": 1.3067, "step": 212 }, { "epoch": 1.1210526315789473, "grad_norm": 1.5786680257092611, "learning_rate": 7.629875953574282e-05, "loss": 1.3248, "step": 213 }, { "epoch": 1.1263157894736842, "grad_norm": 1.8684394631658845, "learning_rate": 7.623676759873661e-05, "loss": 1.3356, "step": 214 }, { "epoch": 1.131578947368421, "grad_norm": 1.83011908562494, "learning_rate": 7.61742864279031e-05, "loss": 1.3243, "step": 215 }, { "epoch": 1.1368421052631579, "grad_norm": 2.5453116567442686, "learning_rate": 7.611131686680272e-05, "loss": 1.3202, "step": 216 }, { "epoch": 1.1421052631578947, "grad_norm": 1.899610653373559, "learning_rate": 7.604785976558961e-05, "loss": 1.3196, "step": 217 }, { "epoch": 1.1473684210526316, "grad_norm": 2.7819319499079143, "learning_rate": 7.598391598100029e-05, "loss": 1.3223, "step": 218 }, { "epoch": 1.1526315789473685, "grad_norm": 2.119997221986913, "learning_rate": 7.591948637634193e-05, "loss": 1.3304, "step": 219 }, { "epoch": 1.1578947368421053, "grad_norm": 2.4856497917141254, "learning_rate": 7.585457182148081e-05, "loss": 1.3036, "step": 220 }, { "epoch": 1.1631578947368422, "grad_norm": 2.1055551992495847, "learning_rate": 7.578917319283055e-05, "loss": 1.3269, "step": 221 }, { "epoch": 1.168421052631579, "grad_norm": 2.2379057425917424, "learning_rate": 7.572329137334023e-05, "loss": 1.3084, "step": 222 }, { "epoch": 1.1736842105263159, "grad_norm": 2.0190272934745055, "learning_rate": 7.565692725248254e-05, "loss": 1.3251, "step": 223 }, { "epoch": 1.1789473684210527, "grad_norm": 1.2213400585349068, "learning_rate": 7.559008172624174e-05, "loss": 1.3089, "step": 224 }, { "epoch": 1.1842105263157894, "grad_norm": 3.4238358307196375, "learning_rate": 7.552275569710152e-05, "loss": 1.3188, "step": 225 }, { "epoch": 1.1894736842105262, "grad_norm": 1.9434637558797097, "learning_rate": 7.545495007403287e-05, "loss": 1.3197, "step": 226 }, { "epoch": 1.194736842105263, "grad_norm": 3.2480882471479164, "learning_rate": 7.538666577248184e-05, "loss": 1.3248, "step": 227 }, { "epoch": 1.2, "grad_norm": 2.686792838642632, "learning_rate": 7.531790371435709e-05, "loss": 1.3166, "step": 228 }, { "epoch": 1.2052631578947368, "grad_norm": 2.667702689555652, "learning_rate": 7.524866482801748e-05, "loss": 1.3118, "step": 229 }, { "epoch": 1.2105263157894737, "grad_norm": 2.0267106721992003, "learning_rate": 7.517895004825956e-05, "loss": 1.3311, "step": 230 }, { "epoch": 1.2157894736842105, "grad_norm": 3.195120176439168, "learning_rate": 7.510876031630496e-05, "loss": 1.322, "step": 231 }, { "epoch": 1.2210526315789474, "grad_norm": 1.9034234117765794, "learning_rate": 7.503809657978762e-05, "loss": 1.3226, "step": 232 }, { "epoch": 1.2263157894736842, "grad_norm": 3.690905599022198, "learning_rate": 7.496695979274103e-05, "loss": 1.3255, "step": 233 }, { "epoch": 1.231578947368421, "grad_norm": 3.145636629896195, "learning_rate": 7.489535091558536e-05, "loss": 1.3381, "step": 234 }, { "epoch": 1.236842105263158, "grad_norm": 2.5072433312959843, "learning_rate": 7.48232709151145e-05, "loss": 1.3219, "step": 235 }, { "epoch": 1.2421052631578948, "grad_norm": 3.2107352113754186, "learning_rate": 7.475072076448298e-05, "loss": 1.3227, "step": 236 }, { "epoch": 1.2473684210526317, "grad_norm": 1.58975425995912, "learning_rate": 7.467770144319283e-05, "loss": 1.3333, "step": 237 }, { "epoch": 1.2526315789473683, "grad_norm": 3.7587617169131082, "learning_rate": 7.460421393708039e-05, "loss": 1.3509, "step": 238 }, { "epoch": 1.2578947368421054, "grad_norm": 2.5861078342959614, "learning_rate": 7.453025923830296e-05, "loss": 1.3361, "step": 239 }, { "epoch": 1.263157894736842, "grad_norm": 3.398411462187095, "learning_rate": 7.445583834532546e-05, "loss": 1.3309, "step": 240 }, { "epoch": 1.268421052631579, "grad_norm": 2.4423289107004664, "learning_rate": 7.438095226290685e-05, "loss": 1.337, "step": 241 }, { "epoch": 1.2736842105263158, "grad_norm": 2.4746167081096324, "learning_rate": 7.430560200208669e-05, "loss": 1.3105, "step": 242 }, { "epoch": 1.2789473684210526, "grad_norm": 2.7651610722472353, "learning_rate": 7.42297885801714e-05, "loss": 1.3243, "step": 243 }, { "epoch": 1.2842105263157895, "grad_norm": 1.6511969920414749, "learning_rate": 7.415351302072056e-05, "loss": 1.3105, "step": 244 }, { "epoch": 1.2894736842105263, "grad_norm": 3.104346761016083, "learning_rate": 7.407677635353308e-05, "loss": 1.3298, "step": 245 }, { "epoch": 1.2947368421052632, "grad_norm": 2.3550214994148235, "learning_rate": 7.399957961463332e-05, "loss": 1.3649, "step": 246 }, { "epoch": 1.3, "grad_norm": 2.266967394498034, "learning_rate": 7.392192384625704e-05, "loss": 1.3363, "step": 247 }, { "epoch": 1.305263157894737, "grad_norm": 3.1011314193494104, "learning_rate": 7.384381009683742e-05, "loss": 1.3252, "step": 248 }, { "epoch": 1.3105263157894738, "grad_norm": 2.133334459450928, "learning_rate": 7.376523942099084e-05, "loss": 1.3307, "step": 249 }, { "epoch": 1.3157894736842106, "grad_norm": 4.326087201648726, "learning_rate": 7.368621287950264e-05, "loss": 1.4045, "step": 250 }, { "epoch": 1.3210526315789473, "grad_norm": 22.674882226571523, "learning_rate": 7.360673153931285e-05, "loss": 1.3348, "step": 251 }, { "epoch": 1.3263157894736843, "grad_norm": 3.064941459260646, "learning_rate": 7.352679647350172e-05, "loss": 1.3425, "step": 252 }, { "epoch": 1.331578947368421, "grad_norm": 2.6376251183297055, "learning_rate": 7.344640876127529e-05, "loss": 1.3389, "step": 253 }, { "epoch": 1.3368421052631578, "grad_norm": 3.518986507081023, "learning_rate": 7.33655694879508e-05, "loss": 1.325, "step": 254 }, { "epoch": 1.3421052631578947, "grad_norm": 6.772119772438152, "learning_rate": 7.328427974494201e-05, "loss": 1.3435, "step": 255 }, { "epoch": 1.3473684210526315, "grad_norm": 3.1439267189525966, "learning_rate": 7.32025406297445e-05, "loss": 1.3482, "step": 256 }, { "epoch": 1.3526315789473684, "grad_norm": 2.9757174788747442, "learning_rate": 7.312035324592081e-05, "loss": 1.4253, "step": 257 }, { "epoch": 1.3578947368421053, "grad_norm": 19.234550469937634, "learning_rate": 7.303771870308561e-05, "loss": 1.5748, "step": 258 }, { "epoch": 1.3631578947368421, "grad_norm": 166.65509126340316, "learning_rate": 7.295463811689069e-05, "loss": 7.3386, "step": 259 }, { "epoch": 1.368421052631579, "grad_norm": 37.296808447795385, "learning_rate": 7.28711126090098e-05, "loss": 7.6292, "step": 260 }, { "epoch": 1.3736842105263158, "grad_norm": 206.77559967714524, "learning_rate": 7.278714330712372e-05, "loss": 5.9669, "step": 261 }, { "epoch": 1.3789473684210527, "grad_norm": 31.10081965111831, "learning_rate": 7.27027313449048e-05, "loss": 1.9804, "step": 262 }, { "epoch": 1.3842105263157896, "grad_norm": 273.4851676662734, "learning_rate": 7.261787786200179e-05, "loss": 4.0434, "step": 263 }, { "epoch": 1.3894736842105262, "grad_norm": 21.94907796550795, "learning_rate": 7.253258400402448e-05, "loss": 2.3785, "step": 264 }, { "epoch": 1.3947368421052633, "grad_norm": 87.34643583499242, "learning_rate": 7.24468509225281e-05, "loss": 3.2623, "step": 265 }, { "epoch": 1.4, "grad_norm": 14.043694608373665, "learning_rate": 7.236067977499791e-05, "loss": 2.0359, "step": 266 }, { "epoch": 1.4052631578947368, "grad_norm": 204.7560211103692, "learning_rate": 7.227407172483348e-05, "loss": 2.6066, "step": 267 }, { "epoch": 1.4105263157894736, "grad_norm": 8.50429112485772, "learning_rate": 7.218702794133304e-05, "loss": 1.8554, "step": 268 }, { "epoch": 1.4157894736842105, "grad_norm": 7.0947931831701805, "learning_rate": 7.209954959967765e-05, "loss": 1.7393, "step": 269 }, { "epoch": 1.4210526315789473, "grad_norm": 3.181643344667253, "learning_rate": 7.201163788091536e-05, "loss": 1.5682, "step": 270 }, { "epoch": 1.4263157894736842, "grad_norm": 2.0126327538765865, "learning_rate": 7.192329397194529e-05, "loss": 1.4786, "step": 271 }, { "epoch": 1.431578947368421, "grad_norm": 2.7578093818109175, "learning_rate": 7.183451906550155e-05, "loss": 1.4642, "step": 272 }, { "epoch": 1.436842105263158, "grad_norm": 1.9145504095924126, "learning_rate": 7.174531436013712e-05, "loss": 1.4291, "step": 273 }, { "epoch": 1.4421052631578948, "grad_norm": 2.830916672053224, "learning_rate": 7.165568106020779e-05, "loss": 1.4538, "step": 274 }, { "epoch": 1.4473684210526316, "grad_norm": 3.3750230003094464, "learning_rate": 7.156562037585576e-05, "loss": 1.4218, "step": 275 }, { "epoch": 1.4526315789473685, "grad_norm": 1.886048219133163, "learning_rate": 7.147513352299336e-05, "loss": 1.4005, "step": 276 }, { "epoch": 1.4578947368421051, "grad_norm": 4.612254413421206, "learning_rate": 7.138422172328671e-05, "loss": 1.4112, "step": 277 }, { "epoch": 1.4631578947368422, "grad_norm": 3.475637100160211, "learning_rate": 7.129288620413907e-05, "loss": 1.388, "step": 278 }, { "epoch": 1.4684210526315788, "grad_norm": 3.762823770210365, "learning_rate": 7.120112819867437e-05, "loss": 1.3941, "step": 279 }, { "epoch": 1.4736842105263157, "grad_norm": 2.965363344704181, "learning_rate": 7.110894894572056e-05, "loss": 1.3815, "step": 280 }, { "epoch": 1.4789473684210526, "grad_norm": 3.003148787485473, "learning_rate": 7.101634968979287e-05, "loss": 1.3805, "step": 281 }, { "epoch": 1.4842105263157894, "grad_norm": 1.9778398666652557, "learning_rate": 7.092333168107697e-05, "loss": 1.3752, "step": 282 }, { "epoch": 1.4894736842105263, "grad_norm": 3.5530639325816114, "learning_rate": 7.082989617541217e-05, "loss": 1.3919, "step": 283 }, { "epoch": 1.4947368421052631, "grad_norm": 2.4964012979013144, "learning_rate": 7.073604443427437e-05, "loss": 1.3752, "step": 284 }, { "epoch": 1.5, "grad_norm": 3.2586608061353224, "learning_rate": 7.064177772475912e-05, "loss": 1.3537, "step": 285 }, { "epoch": 1.5052631578947369, "grad_norm": 2.7382891584432576, "learning_rate": 7.054709731956449e-05, "loss": 1.3548, "step": 286 }, { "epoch": 1.5105263157894737, "grad_norm": 2.7983043107255714, "learning_rate": 7.045200449697379e-05, "loss": 1.355, "step": 287 }, { "epoch": 1.5157894736842106, "grad_norm": 2.1575170098628207, "learning_rate": 7.035650054083847e-05, "loss": 1.3666, "step": 288 }, { "epoch": 1.5210526315789474, "grad_norm": 2.280552356804481, "learning_rate": 7.026058674056067e-05, "loss": 1.3729, "step": 289 }, { "epoch": 1.526315789473684, "grad_norm": 1.8204200442034197, "learning_rate": 7.016426439107586e-05, "loss": 1.3285, "step": 290 }, { "epoch": 1.5315789473684212, "grad_norm": 2.2692718684429805, "learning_rate": 7.006753479283535e-05, "loss": 1.3432, "step": 291 }, { "epoch": 1.5368421052631578, "grad_norm": 1.608298273784726, "learning_rate": 6.99703992517887e-05, "loss": 1.3457, "step": 292 }, { "epoch": 1.5421052631578949, "grad_norm": 2.291066728931036, "learning_rate": 6.987285907936617e-05, "loss": 1.3489, "step": 293 }, { "epoch": 1.5473684210526315, "grad_norm": 1.799873956016166, "learning_rate": 6.977491559246091e-05, "loss": 1.3538, "step": 294 }, { "epoch": 1.5526315789473686, "grad_norm": 2.1836156231488144, "learning_rate": 6.967657011341126e-05, "loss": 1.3393, "step": 295 }, { "epoch": 1.5578947368421052, "grad_norm": 1.656082168753184, "learning_rate": 6.957782396998289e-05, "loss": 1.3487, "step": 296 }, { "epoch": 1.563157894736842, "grad_norm": 2.237518228348859, "learning_rate": 6.94786784953508e-05, "loss": 1.3431, "step": 297 }, { "epoch": 1.568421052631579, "grad_norm": 1.8074440576933803, "learning_rate": 6.937913502808142e-05, "loss": 1.3338, "step": 298 }, { "epoch": 1.5736842105263158, "grad_norm": 2.1852538797514134, "learning_rate": 6.927919491211447e-05, "loss": 1.3408, "step": 299 }, { "epoch": 1.5789473684210527, "grad_norm": 1.686931533274294, "learning_rate": 6.917885949674483e-05, "loss": 1.337, "step": 300 }, { "epoch": 1.5842105263157895, "grad_norm": 2.484073100527864, "learning_rate": 6.907813013660437e-05, "loss": 1.3315, "step": 301 }, { "epoch": 1.5894736842105264, "grad_norm": 1.9730996981374016, "learning_rate": 6.897700819164357e-05, "loss": 1.3383, "step": 302 }, { "epoch": 1.594736842105263, "grad_norm": 1.4953094506502813, "learning_rate": 6.887549502711323e-05, "loss": 1.3316, "step": 303 }, { "epoch": 1.6, "grad_norm": 1.7333934091879961, "learning_rate": 6.877359201354606e-05, "loss": 1.3338, "step": 304 }, { "epoch": 1.6052631578947367, "grad_norm": 1.701314271187227, "learning_rate": 6.867130052673806e-05, "loss": 1.3233, "step": 305 }, { "epoch": 1.6105263157894738, "grad_norm": 2.5170048810500365, "learning_rate": 6.856862194773008e-05, "loss": 1.3418, "step": 306 }, { "epoch": 1.6157894736842104, "grad_norm": 1.1423723633356422, "learning_rate": 6.846555766278909e-05, "loss": 1.3456, "step": 307 }, { "epoch": 1.6210526315789475, "grad_norm": 2.1226546892123688, "learning_rate": 6.83621090633895e-05, "loss": 1.3199, "step": 308 }, { "epoch": 1.6263157894736842, "grad_norm": 2.120207603951501, "learning_rate": 6.825827754619434e-05, "loss": 1.3252, "step": 309 }, { "epoch": 1.631578947368421, "grad_norm": 1.3158750566710444, "learning_rate": 6.815406451303647e-05, "loss": 1.3213, "step": 310 }, { "epoch": 1.6368421052631579, "grad_norm": 2.597320776495221, "learning_rate": 6.804947137089955e-05, "loss": 1.3112, "step": 311 }, { "epoch": 1.6421052631578947, "grad_norm": 1.6685217693160599, "learning_rate": 6.794449953189916e-05, "loss": 1.3074, "step": 312 }, { "epoch": 1.6473684210526316, "grad_norm": 2.5188932447525283, "learning_rate": 6.783915041326364e-05, "loss": 1.331, "step": 313 }, { "epoch": 1.6526315789473685, "grad_norm": 2.071056305940592, "learning_rate": 6.773342543731503e-05, "loss": 1.3173, "step": 314 }, { "epoch": 1.6578947368421053, "grad_norm": 2.427656153267591, "learning_rate": 6.762732603144978e-05, "loss": 1.3329, "step": 315 }, { "epoch": 1.663157894736842, "grad_norm": 1.6471906450412725, "learning_rate": 6.75208536281196e-05, "loss": 1.311, "step": 316 }, { "epoch": 1.668421052631579, "grad_norm": 2.3066742827555022, "learning_rate": 6.7414009664812e-05, "loss": 1.3349, "step": 317 }, { "epoch": 1.6736842105263157, "grad_norm": 1.8458843126503317, "learning_rate": 6.730679558403093e-05, "loss": 1.3236, "step": 318 }, { "epoch": 1.6789473684210527, "grad_norm": 2.1861813200392843, "learning_rate": 6.719921283327736e-05, "loss": 1.3268, "step": 319 }, { "epoch": 1.6842105263157894, "grad_norm": 2.1870673388161124, "learning_rate": 6.709126286502965e-05, "loss": 1.3019, "step": 320 }, { "epoch": 1.6894736842105265, "grad_norm": 1.4274808199921123, "learning_rate": 6.698294713672395e-05, "loss": 1.3255, "step": 321 }, { "epoch": 1.694736842105263, "grad_norm": 1.5694017468203492, "learning_rate": 6.687426711073462e-05, "loss": 1.3048, "step": 322 }, { "epoch": 1.7, "grad_norm": 1.1078521144544478, "learning_rate": 6.676522425435433e-05, "loss": 1.3087, "step": 323 }, { "epoch": 1.7052631578947368, "grad_norm": 2.4538742138976386, "learning_rate": 6.665582003977441e-05, "loss": 1.3244, "step": 324 }, { "epoch": 1.7105263157894737, "grad_norm": 1.7323261915367696, "learning_rate": 6.654605594406486e-05, "loss": 1.3093, "step": 325 }, { "epoch": 1.7157894736842105, "grad_norm": 1.7174315153551183, "learning_rate": 6.643593344915445e-05, "loss": 1.3141, "step": 326 }, { "epoch": 1.7210526315789474, "grad_norm": 1.4498322250506395, "learning_rate": 6.632545404181074e-05, "loss": 1.3251, "step": 327 }, { "epoch": 1.7263157894736842, "grad_norm": 2.978144373846546, "learning_rate": 6.62146192136199e-05, "loss": 1.3117, "step": 328 }, { "epoch": 1.731578947368421, "grad_norm": 1.8288925620523002, "learning_rate": 6.610343046096674e-05, "loss": 1.311, "step": 329 }, { "epoch": 1.736842105263158, "grad_norm": 2.8409045314260255, "learning_rate": 6.59918892850144e-05, "loss": 1.3263, "step": 330 }, { "epoch": 1.7421052631578946, "grad_norm": 1.9806940703831386, "learning_rate": 6.587999719168401e-05, "loss": 1.3179, "step": 331 }, { "epoch": 1.7473684210526317, "grad_norm": 2.231645147378468, "learning_rate": 6.576775569163458e-05, "loss": 1.3216, "step": 332 }, { "epoch": 1.7526315789473683, "grad_norm": 2.242436351469589, "learning_rate": 6.565516630024236e-05, "loss": 1.3263, "step": 333 }, { "epoch": 1.7578947368421054, "grad_norm": 1.418447898215289, "learning_rate": 6.554223053758055e-05, "loss": 1.317, "step": 334 }, { "epoch": 1.763157894736842, "grad_norm": 2.1049377231565036, "learning_rate": 6.542894992839873e-05, "loss": 1.3278, "step": 335 }, { "epoch": 1.768421052631579, "grad_norm": 1.9649271286389844, "learning_rate": 6.531532600210222e-05, "loss": 1.3309, "step": 336 }, { "epoch": 1.7736842105263158, "grad_norm": 1.3002559482544591, "learning_rate": 6.520136029273151e-05, "loss": 1.3003, "step": 337 }, { "epoch": 1.7789473684210526, "grad_norm": 2.4684068562448136, "learning_rate": 6.508705433894149e-05, "loss": 1.32, "step": 338 }, { "epoch": 1.7842105263157895, "grad_norm": 1.5743465960197915, "learning_rate": 6.497240968398072e-05, "loss": 1.3006, "step": 339 }, { "epoch": 1.7894736842105263, "grad_norm": 2.59770911193071, "learning_rate": 6.48574278756706e-05, "loss": 1.3222, "step": 340 }, { "epoch": 1.7947368421052632, "grad_norm": 1.7842505149097647, "learning_rate": 6.474211046638438e-05, "loss": 1.3161, "step": 341 }, { "epoch": 1.8, "grad_norm": 2.7074992552378805, "learning_rate": 6.462645901302633e-05, "loss": 1.3281, "step": 342 }, { "epoch": 1.805263157894737, "grad_norm": 1.8215475542278567, "learning_rate": 6.451047507701065e-05, "loss": 1.3282, "step": 343 }, { "epoch": 1.8105263157894735, "grad_norm": 3.304881983512875, "learning_rate": 6.439416022424036e-05, "loss": 1.3391, "step": 344 }, { "epoch": 1.8157894736842106, "grad_norm": 3.2476105816930954, "learning_rate": 6.427751602508628e-05, "loss": 1.3348, "step": 345 }, { "epoch": 1.8210526315789473, "grad_norm": 1.5869549786160873, "learning_rate": 6.416054405436564e-05, "loss": 1.3201, "step": 346 }, { "epoch": 1.8263157894736843, "grad_norm": 2.2683887128326723, "learning_rate": 6.404324589132101e-05, "loss": 1.3204, "step": 347 }, { "epoch": 1.831578947368421, "grad_norm": 1.9620950054062172, "learning_rate": 6.392562311959886e-05, "loss": 1.3158, "step": 348 }, { "epoch": 1.836842105263158, "grad_norm": 1.8047773439892525, "learning_rate": 6.380767732722821e-05, "loss": 1.3181, "step": 349 }, { "epoch": 1.8421052631578947, "grad_norm": 1.911531036771628, "learning_rate": 6.368941010659921e-05, "loss": 1.3292, "step": 350 }, { "epoch": 1.8473684210526315, "grad_norm": 1.636720765605733, "learning_rate": 6.35708230544416e-05, "loss": 1.3091, "step": 351 }, { "epoch": 1.8526315789473684, "grad_norm": 1.424304904246396, "learning_rate": 6.34519177718032e-05, "loss": 1.3207, "step": 352 }, { "epoch": 1.8578947368421053, "grad_norm": 1.3525184453889394, "learning_rate": 6.333269586402827e-05, "loss": 1.3125, "step": 353 }, { "epoch": 1.8631578947368421, "grad_norm": 1.7281390731184902, "learning_rate": 6.321315894073581e-05, "loss": 1.3231, "step": 354 }, { "epoch": 1.868421052631579, "grad_norm": 1.1089366431889842, "learning_rate": 6.309330861579786e-05, "loss": 1.3238, "step": 355 }, { "epoch": 1.8736842105263158, "grad_norm": 1.876591117688095, "learning_rate": 6.297314650731775e-05, "loss": 1.3118, "step": 356 }, { "epoch": 1.8789473684210525, "grad_norm": 1.61640922760774, "learning_rate": 6.285267423760817e-05, "loss": 1.3263, "step": 357 }, { "epoch": 1.8842105263157896, "grad_norm": 1.4451990798983758, "learning_rate": 6.273189343316929e-05, "loss": 1.325, "step": 358 }, { "epoch": 1.8894736842105262, "grad_norm": 1.3409307869705591, "learning_rate": 6.261080572466688e-05, "loss": 1.3057, "step": 359 }, { "epoch": 1.8947368421052633, "grad_norm": 1.6052273256057499, "learning_rate": 6.248941274691017e-05, "loss": 1.3252, "step": 360 }, { "epoch": 1.9, "grad_norm": 2.366978019871103, "learning_rate": 6.236771613882987e-05, "loss": 1.3179, "step": 361 }, { "epoch": 1.905263157894737, "grad_norm": 1.1868922713453152, "learning_rate": 6.224571754345602e-05, "loss": 1.3082, "step": 362 }, { "epoch": 1.9105263157894736, "grad_norm": 2.2556197419222612, "learning_rate": 6.21234186078958e-05, "loss": 1.3115, "step": 363 }, { "epoch": 1.9157894736842105, "grad_norm": 1.7410078285379156, "learning_rate": 6.200082098331126e-05, "loss": 1.3281, "step": 364 }, { "epoch": 1.9210526315789473, "grad_norm": 1.7950505417159182, "learning_rate": 6.18779263248971e-05, "loss": 1.3162, "step": 365 }, { "epoch": 1.9263157894736842, "grad_norm": 1.8544654429983962, "learning_rate": 6.175473629185822e-05, "loss": 1.3205, "step": 366 }, { "epoch": 1.931578947368421, "grad_norm": 1.7372962100479836, "learning_rate": 6.163125254738751e-05, "loss": 1.3065, "step": 367 }, { "epoch": 1.936842105263158, "grad_norm": 2.242141298655648, "learning_rate": 6.150747675864314e-05, "loss": 1.2985, "step": 368 }, { "epoch": 1.9421052631578948, "grad_norm": 1.481088212600443, "learning_rate": 6.138341059672622e-05, "loss": 1.3136, "step": 369 }, { "epoch": 1.9473684210526314, "grad_norm": 2.1356181680202253, "learning_rate": 6.125905573665824e-05, "loss": 1.3282, "step": 370 }, { "epoch": 1.9526315789473685, "grad_norm": 1.6259642009051207, "learning_rate": 6.113441385735836e-05, "loss": 1.3131, "step": 371 }, { "epoch": 1.9578947368421051, "grad_norm": 1.973896595209029, "learning_rate": 6.100948664162081e-05, "loss": 1.3182, "step": 372 }, { "epoch": 1.9631578947368422, "grad_norm": 1.616074252415091, "learning_rate": 6.088427577609219e-05, "loss": 1.3037, "step": 373 }, { "epoch": 1.9684210526315788, "grad_norm": 1.777657189903051, "learning_rate": 6.075878295124861e-05, "loss": 1.3096, "step": 374 }, { "epoch": 1.973684210526316, "grad_norm": 1.5325372367258376, "learning_rate": 6.063300986137297e-05, "loss": 1.3092, "step": 375 }, { "epoch": 1.9789473684210526, "grad_norm": 1.7425893453777117, "learning_rate": 6.0506958204531996e-05, "loss": 1.3094, "step": 376 }, { "epoch": 1.9842105263157894, "grad_norm": 1.2678177296707205, "learning_rate": 6.0380629682553395e-05, "loss": 1.2995, "step": 377 }, { "epoch": 1.9894736842105263, "grad_norm": 2.083473793091378, "learning_rate": 6.025402600100283e-05, "loss": 1.3133, "step": 378 }, { "epoch": 1.9947368421052631, "grad_norm": 1.5761354608098717, "learning_rate": 6.012714886916088e-05, "loss": 1.3232, "step": 379 }, { "epoch": 2.0, "grad_norm": 1.6759654932294628, "learning_rate": 6.000000000000001e-05, "loss": 1.295, "step": 380 }, { "epoch": 2.0052631578947366, "grad_norm": 1.6566803747791274, "learning_rate": 5.987258111016139e-05, "loss": 1.269, "step": 381 }, { "epoch": 2.0105263157894737, "grad_norm": 1.7773404022104615, "learning_rate": 5.974489391993182e-05, "loss": 1.2756, "step": 382 }, { "epoch": 2.0157894736842104, "grad_norm": 1.7734820944361986, "learning_rate": 5.9616940153220336e-05, "loss": 1.3024, "step": 383 }, { "epoch": 2.0210526315789474, "grad_norm": 1.3832861885005663, "learning_rate": 5.948872153753509e-05, "loss": 1.292, "step": 384 }, { "epoch": 2.026315789473684, "grad_norm": 1.9267936406726134, "learning_rate": 5.936023980395997e-05, "loss": 1.2974, "step": 385 }, { "epoch": 2.031578947368421, "grad_norm": 1.1634482135657338, "learning_rate": 5.923149668713118e-05, "loss": 1.2864, "step": 386 }, { "epoch": 2.036842105263158, "grad_norm": 1.4709242560357587, "learning_rate": 5.9102493925213946e-05, "loss": 1.2719, "step": 387 }, { "epoch": 2.042105263157895, "grad_norm": 1.3716802116661737, "learning_rate": 5.8973233259878914e-05, "loss": 1.2688, "step": 388 }, { "epoch": 2.0473684210526315, "grad_norm": 1.8540565451199285, "learning_rate": 5.8843716436278696e-05, "loss": 1.292, "step": 389 }, { "epoch": 2.0526315789473686, "grad_norm": 1.6569753347566705, "learning_rate": 5.871394520302432e-05, "loss": 1.2923, "step": 390 }, { "epoch": 2.057894736842105, "grad_norm": 1.3344056097550805, "learning_rate": 5.85839213121616e-05, "loss": 1.2783, "step": 391 }, { "epoch": 2.0631578947368423, "grad_norm": 1.0756154226095422, "learning_rate": 5.845364651914752e-05, "loss": 1.2823, "step": 392 }, { "epoch": 2.068421052631579, "grad_norm": 1.5938788684018976, "learning_rate": 5.832312258282645e-05, "loss": 1.2872, "step": 393 }, { "epoch": 2.0736842105263156, "grad_norm": 1.9960858248177353, "learning_rate": 5.8192351265406466e-05, "loss": 1.2819, "step": 394 }, { "epoch": 2.0789473684210527, "grad_norm": 1.500794950504469, "learning_rate": 5.806133433243558e-05, "loss": 1.3018, "step": 395 }, { "epoch": 2.0842105263157893, "grad_norm": 1.9776756621227738, "learning_rate": 5.793007355277783e-05, "loss": 1.2947, "step": 396 }, { "epoch": 2.0894736842105264, "grad_norm": 1.5136499333830533, "learning_rate": 5.7798570698589465e-05, "loss": 1.2847, "step": 397 }, { "epoch": 2.094736842105263, "grad_norm": 2.7888675821510467, "learning_rate": 5.7666827545294965e-05, "loss": 1.2803, "step": 398 }, { "epoch": 2.1, "grad_norm": 1.997214443213332, "learning_rate": 5.75348458715631e-05, "loss": 1.2889, "step": 399 }, { "epoch": 2.1052631578947367, "grad_norm": 2.636705031350177, "learning_rate": 5.740262745928293e-05, "loss": 1.2964, "step": 400 }, { "epoch": 2.110526315789474, "grad_norm": 2.1100662062402775, "learning_rate": 5.727017409353971e-05, "loss": 1.2878, "step": 401 }, { "epoch": 2.1157894736842104, "grad_norm": 1.9907403756995032, "learning_rate": 5.713748756259085e-05, "loss": 1.2942, "step": 402 }, { "epoch": 2.1210526315789475, "grad_norm": 2.0618250265894433, "learning_rate": 5.700456965784167e-05, "loss": 1.2857, "step": 403 }, { "epoch": 2.126315789473684, "grad_norm": 0.9319289242731835, "learning_rate": 5.687142217382129e-05, "loss": 1.2708, "step": 404 }, { "epoch": 2.1315789473684212, "grad_norm": 2.992807432876805, "learning_rate": 5.673804690815845e-05, "loss": 1.309, "step": 405 }, { "epoch": 2.136842105263158, "grad_norm": 1.864224068302743, "learning_rate": 5.660444566155709e-05, "loss": 1.2854, "step": 406 }, { "epoch": 2.1421052631578945, "grad_norm": 3.4214971672572596, "learning_rate": 5.647062023777221e-05, "loss": 1.2927, "step": 407 }, { "epoch": 2.1473684210526316, "grad_norm": 2.508109225516717, "learning_rate": 5.633657244358535e-05, "loss": 1.2829, "step": 408 }, { "epoch": 2.1526315789473682, "grad_norm": 3.929215425642367, "learning_rate": 5.6202304088780335e-05, "loss": 1.2946, "step": 409 }, { "epoch": 2.1578947368421053, "grad_norm": 3.798085324745515, "learning_rate": 5.606781698611879e-05, "loss": 1.3013, "step": 410 }, { "epoch": 2.163157894736842, "grad_norm": 2.0206510011506222, "learning_rate": 5.593311295131562e-05, "loss": 1.2917, "step": 411 }, { "epoch": 2.168421052631579, "grad_norm": 3.092904855930784, "learning_rate": 5.579819380301458e-05, "loss": 1.2795, "step": 412 }, { "epoch": 2.1736842105263157, "grad_norm": 2.2386119816911623, "learning_rate": 5.5663061362763665e-05, "loss": 1.2964, "step": 413 }, { "epoch": 2.1789473684210527, "grad_norm": 2.8845075274191223, "learning_rate": 5.552771745499051e-05, "loss": 1.29, "step": 414 }, { "epoch": 2.1842105263157894, "grad_norm": 2.355680412049654, "learning_rate": 5.5392163906977835e-05, "loss": 1.2802, "step": 415 }, { "epoch": 2.1894736842105265, "grad_norm": 2.577365756174829, "learning_rate": 5.525640254883865e-05, "loss": 1.2894, "step": 416 }, { "epoch": 2.194736842105263, "grad_norm": 2.0046660703267576, "learning_rate": 5.512043521349166e-05, "loss": 1.2873, "step": 417 }, { "epoch": 2.2, "grad_norm": 2.576417069207704, "learning_rate": 5.4984263736636494e-05, "loss": 1.2759, "step": 418 }, { "epoch": 2.205263157894737, "grad_norm": 1.9511016222282593, "learning_rate": 5.4847889956728834e-05, "loss": 1.298, "step": 419 }, { "epoch": 2.2105263157894735, "grad_norm": 2.6670388356828285, "learning_rate": 5.471131571495574e-05, "loss": 1.2951, "step": 420 }, { "epoch": 2.2157894736842105, "grad_norm": 2.126207826369636, "learning_rate": 5.457454285521064e-05, "loss": 1.2812, "step": 421 }, { "epoch": 2.221052631578947, "grad_norm": 2.400820316806507, "learning_rate": 5.4437573224068595e-05, "loss": 1.2948, "step": 422 }, { "epoch": 2.2263157894736842, "grad_norm": 1.7795504525185426, "learning_rate": 5.4300408670761204e-05, "loss": 1.2959, "step": 423 }, { "epoch": 2.231578947368421, "grad_norm": 2.8829533810849663, "learning_rate": 5.416305104715175e-05, "loss": 1.3074, "step": 424 }, { "epoch": 2.236842105263158, "grad_norm": 2.159110693359624, "learning_rate": 5.4025502207710184e-05, "loss": 1.2797, "step": 425 }, { "epoch": 2.2421052631578946, "grad_norm": 3.0783424431722666, "learning_rate": 5.388776400948803e-05, "loss": 1.2864, "step": 426 }, { "epoch": 2.2473684210526317, "grad_norm": 2.640451536165918, "learning_rate": 5.3749838312093364e-05, "loss": 1.2987, "step": 427 }, { "epoch": 2.2526315789473683, "grad_norm": 2.413356511304884, "learning_rate": 5.361172697766573e-05, "loss": 1.2775, "step": 428 }, { "epoch": 2.2578947368421054, "grad_norm": 2.334518355071201, "learning_rate": 5.3473431870850904e-05, "loss": 1.275, "step": 429 }, { "epoch": 2.263157894736842, "grad_norm": 2.4122426514527984, "learning_rate": 5.333495485877583e-05, "loss": 1.2961, "step": 430 }, { "epoch": 2.268421052631579, "grad_norm": 2.2362176845592208, "learning_rate": 5.3196297811023316e-05, "loss": 1.2937, "step": 431 }, { "epoch": 2.2736842105263158, "grad_norm": 2.293429615790018, "learning_rate": 5.305746259960689e-05, "loss": 1.2852, "step": 432 }, { "epoch": 2.2789473684210524, "grad_norm": 1.786254087945556, "learning_rate": 5.291845109894544e-05, "loss": 1.2799, "step": 433 }, { "epoch": 2.2842105263157895, "grad_norm": 2.6945565744818407, "learning_rate": 5.277926518583793e-05, "loss": 1.2921, "step": 434 }, { "epoch": 2.2894736842105265, "grad_norm": 2.2194659113523962, "learning_rate": 5.263990673943811e-05, "loss": 1.3046, "step": 435 }, { "epoch": 2.294736842105263, "grad_norm": 2.547332541805207, "learning_rate": 5.250037764122907e-05, "loss": 1.2842, "step": 436 }, { "epoch": 2.3, "grad_norm": 2.2417529148390325, "learning_rate": 5.23606797749979e-05, "loss": 1.2737, "step": 437 }, { "epoch": 2.305263157894737, "grad_norm": 2.297738957213229, "learning_rate": 5.2220815026810234e-05, "loss": 1.2964, "step": 438 }, { "epoch": 2.3105263157894735, "grad_norm": 2.0161964482231416, "learning_rate": 5.208078528498476e-05, "loss": 1.2734, "step": 439 }, { "epoch": 2.3157894736842106, "grad_norm": 2.5589773064425896, "learning_rate": 5.194059244006779e-05, "loss": 1.3239, "step": 440 }, { "epoch": 2.3210526315789473, "grad_norm": 2.5534996515172246, "learning_rate": 5.180023838480765e-05, "loss": 1.2839, "step": 441 }, { "epoch": 2.3263157894736843, "grad_norm": 1.6213064440253335, "learning_rate": 5.165972501412921e-05, "loss": 1.2804, "step": 442 }, { "epoch": 2.331578947368421, "grad_norm": 1.3934815609685396, "learning_rate": 5.151905422510825e-05, "loss": 1.2733, "step": 443 }, { "epoch": 2.336842105263158, "grad_norm": 2.6675021957708447, "learning_rate": 5.137822791694585e-05, "loss": 1.2847, "step": 444 }, { "epoch": 2.3421052631578947, "grad_norm": 2.0956189405862027, "learning_rate": 5.123724799094279e-05, "loss": 1.2705, "step": 445 }, { "epoch": 2.3473684210526318, "grad_norm": 2.419629528561346, "learning_rate": 5.109611635047379e-05, "loss": 1.2879, "step": 446 }, { "epoch": 2.3526315789473684, "grad_norm": 2.4104356876557755, "learning_rate": 5.095483490096194e-05, "loss": 1.2935, "step": 447 }, { "epoch": 2.3578947368421055, "grad_norm": 1.8777710470149278, "learning_rate": 5.081340554985287e-05, "loss": 1.2775, "step": 448 }, { "epoch": 2.363157894736842, "grad_norm": 1.487851978844643, "learning_rate": 5.067183020658905e-05, "loss": 1.2761, "step": 449 }, { "epoch": 2.3684210526315788, "grad_norm": 2.4549351462483586, "learning_rate": 5.053011078258397e-05, "loss": 1.2692, "step": 450 }, { "epoch": 2.373684210526316, "grad_norm": 1.8656568337670418, "learning_rate": 5.03882491911964e-05, "loss": 1.2911, "step": 451 }, { "epoch": 2.3789473684210525, "grad_norm": 2.7641450741813265, "learning_rate": 5.024624734770446e-05, "loss": 1.2735, "step": 452 }, { "epoch": 2.3842105263157896, "grad_norm": 2.722572050222999, "learning_rate": 5.010410716927988e-05, "loss": 1.2737, "step": 453 }, { "epoch": 2.389473684210526, "grad_norm": 1.428940654329024, "learning_rate": 4.9961830574962e-05, "loss": 1.2888, "step": 454 }, { "epoch": 2.3947368421052633, "grad_norm": 1.5780962302368826, "learning_rate": 4.981941948563197e-05, "loss": 1.2812, "step": 455 }, { "epoch": 2.4, "grad_norm": 2.0344826746474283, "learning_rate": 4.967687582398671e-05, "loss": 1.2864, "step": 456 }, { "epoch": 2.405263157894737, "grad_norm": 1.4319688356833826, "learning_rate": 4.953420151451304e-05, "loss": 1.2834, "step": 457 }, { "epoch": 2.4105263157894736, "grad_norm": 2.601310717014097, "learning_rate": 4.939139848346164e-05, "loss": 1.2823, "step": 458 }, { "epoch": 2.4157894736842107, "grad_norm": 2.348235653569354, "learning_rate": 4.924846865882107e-05, "loss": 1.2846, "step": 459 }, { "epoch": 2.4210526315789473, "grad_norm": 1.726129892949758, "learning_rate": 4.9105413970291747e-05, "loss": 1.3011, "step": 460 }, { "epoch": 2.4263157894736844, "grad_norm": 1.7111299716712474, "learning_rate": 4.896223634925984e-05, "loss": 1.3116, "step": 461 }, { "epoch": 2.431578947368421, "grad_norm": 1.803266671516624, "learning_rate": 4.8818937728771294e-05, "loss": 1.272, "step": 462 }, { "epoch": 2.4368421052631577, "grad_norm": 1.3759594624212466, "learning_rate": 4.867552004350564e-05, "loss": 1.289, "step": 463 }, { "epoch": 2.442105263157895, "grad_norm": 2.3296183382561253, "learning_rate": 4.853198522974988e-05, "loss": 1.2911, "step": 464 }, { "epoch": 2.4473684210526314, "grad_norm": 1.9742731162641471, "learning_rate": 4.8388335225372416e-05, "loss": 1.2656, "step": 465 }, { "epoch": 2.4526315789473685, "grad_norm": 1.6842871202377092, "learning_rate": 4.8244571969796817e-05, "loss": 1.2891, "step": 466 }, { "epoch": 2.457894736842105, "grad_norm": 1.5497120892994825, "learning_rate": 4.810069740397569e-05, "loss": 1.2844, "step": 467 }, { "epoch": 2.463157894736842, "grad_norm": 1.824870478700358, "learning_rate": 4.795671347036439e-05, "loss": 1.2902, "step": 468 }, { "epoch": 2.468421052631579, "grad_norm": 1.3990180325007069, "learning_rate": 4.781262211289491e-05, "loss": 1.281, "step": 469 }, { "epoch": 2.473684210526316, "grad_norm": 2.320520504849803, "learning_rate": 4.7668425276949546e-05, "loss": 1.2838, "step": 470 }, { "epoch": 2.4789473684210526, "grad_norm": 2.1234854667075758, "learning_rate": 4.7524124909334653e-05, "loss": 1.2797, "step": 471 }, { "epoch": 2.4842105263157896, "grad_norm": 1.422390194322296, "learning_rate": 4.7379722958254394e-05, "loss": 1.2896, "step": 472 }, { "epoch": 2.4894736842105263, "grad_norm": 1.3196019422752756, "learning_rate": 4.7235221373284407e-05, "loss": 1.2744, "step": 473 }, { "epoch": 2.4947368421052634, "grad_norm": 1.7461193994213873, "learning_rate": 4.709062210534547e-05, "loss": 1.2887, "step": 474 }, { "epoch": 2.5, "grad_norm": 1.3331786938242027, "learning_rate": 4.694592710667723e-05, "loss": 1.281, "step": 475 }, { "epoch": 2.5052631578947366, "grad_norm": 2.0104890922006464, "learning_rate": 4.680113833081173e-05, "loss": 1.2786, "step": 476 }, { "epoch": 2.5105263157894737, "grad_norm": 1.92734853764442, "learning_rate": 4.665625773254716e-05, "loss": 1.2844, "step": 477 }, { "epoch": 2.515789473684211, "grad_norm": 1.3368646117339327, "learning_rate": 4.6511287267921394e-05, "loss": 1.2944, "step": 478 }, { "epoch": 2.5210526315789474, "grad_norm": 1.158860061239002, "learning_rate": 4.636622889418558e-05, "loss": 1.2728, "step": 479 }, { "epoch": 2.526315789473684, "grad_norm": 1.7510317771973813, "learning_rate": 4.622108456977773e-05, "loss": 1.2752, "step": 480 }, { "epoch": 2.531578947368421, "grad_norm": 1.3943091910623553, "learning_rate": 4.60758562542963e-05, "loss": 1.3005, "step": 481 }, { "epoch": 2.536842105263158, "grad_norm": 1.8549674666458555, "learning_rate": 4.593054590847368e-05, "loss": 1.281, "step": 482 }, { "epoch": 2.542105263157895, "grad_norm": 1.7188026241177852, "learning_rate": 4.57851554941498e-05, "loss": 1.3061, "step": 483 }, { "epoch": 2.5473684210526315, "grad_norm": 1.2685249463251793, "learning_rate": 4.563968697424553e-05, "loss": 1.2822, "step": 484 }, { "epoch": 2.5526315789473686, "grad_norm": 1.549673702485011, "learning_rate": 4.549414231273633e-05, "loss": 1.2958, "step": 485 }, { "epoch": 2.557894736842105, "grad_norm": 1.0822272735204688, "learning_rate": 4.534852347462559e-05, "loss": 1.2829, "step": 486 }, { "epoch": 2.5631578947368423, "grad_norm": 0.9555789360037702, "learning_rate": 4.5202832425918166e-05, "loss": 1.3051, "step": 487 }, { "epoch": 2.568421052631579, "grad_norm": 1.067488479666183, "learning_rate": 4.5057071133593853e-05, "loss": 1.275, "step": 488 }, { "epoch": 2.5736842105263156, "grad_norm": 1.3214859289777758, "learning_rate": 4.4911241565580796e-05, "loss": 1.2887, "step": 489 }, { "epoch": 2.5789473684210527, "grad_norm": 1.309909912707582, "learning_rate": 4.476534569072895e-05, "loss": 1.2933, "step": 490 }, { "epoch": 2.5842105263157897, "grad_norm": 1.103182457932209, "learning_rate": 4.4619385478783456e-05, "loss": 1.2785, "step": 491 }, { "epoch": 2.5894736842105264, "grad_norm": 1.050641547852913, "learning_rate": 4.4473362900358065e-05, "loss": 1.2877, "step": 492 }, { "epoch": 2.594736842105263, "grad_norm": 1.1472680394377797, "learning_rate": 4.432727992690857e-05, "loss": 1.285, "step": 493 }, { "epoch": 2.6, "grad_norm": 1.362629738278887, "learning_rate": 4.418113853070614e-05, "loss": 1.2774, "step": 494 }, { "epoch": 2.6052631578947367, "grad_norm": 0.9212638160971107, "learning_rate": 4.403494068481074e-05, "loss": 1.2956, "step": 495 }, { "epoch": 2.610526315789474, "grad_norm": 1.3060601473810125, "learning_rate": 4.388868836304442e-05, "loss": 1.2864, "step": 496 }, { "epoch": 2.6157894736842104, "grad_norm": 0.9964781716303204, "learning_rate": 4.374238353996472e-05, "loss": 1.2846, "step": 497 }, { "epoch": 2.6210526315789475, "grad_norm": 1.0334496502405375, "learning_rate": 4.3596028190838045e-05, "loss": 1.2751, "step": 498 }, { "epoch": 2.626315789473684, "grad_norm": 0.8374244946961393, "learning_rate": 4.3449624291612895e-05, "loss": 1.2846, "step": 499 }, { "epoch": 2.6315789473684212, "grad_norm": 1.271324423070232, "learning_rate": 4.33031738188933e-05, "loss": 1.2893, "step": 500 }, { "epoch": 2.636842105263158, "grad_norm": 1.0290842796861808, "learning_rate": 4.315667874991205e-05, "loss": 1.2769, "step": 501 }, { "epoch": 2.6421052631578945, "grad_norm": 1.6287387587682205, "learning_rate": 4.3010141062504e-05, "loss": 1.2808, "step": 502 }, { "epoch": 2.6473684210526316, "grad_norm": 1.05633292251089, "learning_rate": 4.286356273507949e-05, "loss": 1.2752, "step": 503 }, { "epoch": 2.6526315789473687, "grad_norm": 1.2639851855275497, "learning_rate": 4.271694574659744e-05, "loss": 1.2673, "step": 504 }, { "epoch": 2.6578947368421053, "grad_norm": 1.3942797511952854, "learning_rate": 4.257029207653881e-05, "loss": 1.2725, "step": 505 }, { "epoch": 2.663157894736842, "grad_norm": 0.9618496184388876, "learning_rate": 4.242360370487976e-05, "loss": 1.2747, "step": 506 }, { "epoch": 2.668421052631579, "grad_norm": 1.141131756163478, "learning_rate": 4.2276882612064936e-05, "loss": 1.3005, "step": 507 }, { "epoch": 2.6736842105263157, "grad_norm": 0.9471533928078693, "learning_rate": 4.213013077898084e-05, "loss": 1.2726, "step": 508 }, { "epoch": 2.6789473684210527, "grad_norm": 0.9237490499511763, "learning_rate": 4.1983350186928894e-05, "loss": 1.2801, "step": 509 }, { "epoch": 2.6842105263157894, "grad_norm": 1.1110718726310678, "learning_rate": 4.183654281759888e-05, "loss": 1.2674, "step": 510 }, { "epoch": 2.6894736842105265, "grad_norm": 1.2581508468500637, "learning_rate": 4.168971065304205e-05, "loss": 1.2809, "step": 511 }, { "epoch": 2.694736842105263, "grad_norm": 0.9468779696627782, "learning_rate": 4.154285567564442e-05, "loss": 1.2796, "step": 512 }, { "epoch": 2.7, "grad_norm": 1.3134333293058689, "learning_rate": 4.139597986810005e-05, "loss": 1.2698, "step": 513 }, { "epoch": 2.705263157894737, "grad_norm": 0.9243484356050305, "learning_rate": 4.124908521338416e-05, "loss": 1.2745, "step": 514 }, { "epoch": 2.7105263157894735, "grad_norm": 0.9728548533723144, "learning_rate": 4.110217369472649e-05, "loss": 1.2925, "step": 515 }, { "epoch": 2.7157894736842105, "grad_norm": 0.7367530477112044, "learning_rate": 4.095524729558441e-05, "loss": 1.2677, "step": 516 }, { "epoch": 2.7210526315789476, "grad_norm": 0.9279276228473495, "learning_rate": 4.080830799961622e-05, "loss": 1.2802, "step": 517 }, { "epoch": 2.7263157894736842, "grad_norm": 1.3931410811444014, "learning_rate": 4.0661357790654345e-05, "loss": 1.262, "step": 518 }, { "epoch": 2.731578947368421, "grad_norm": 0.9883528178224094, "learning_rate": 4.0514398652678514e-05, "loss": 1.2964, "step": 519 }, { "epoch": 2.736842105263158, "grad_norm": 1.3163227077320665, "learning_rate": 4.0367432569789065e-05, "loss": 1.2805, "step": 520 }, { "epoch": 2.7421052631578946, "grad_norm": 0.7397578201219466, "learning_rate": 4.0220461526180023e-05, "loss": 1.2773, "step": 521 }, { "epoch": 2.7473684210526317, "grad_norm": 0.9990315603511356, "learning_rate": 4.007348750611245e-05, "loss": 1.292, "step": 522 }, { "epoch": 2.7526315789473683, "grad_norm": 1.1681928253101022, "learning_rate": 3.9926512493887555e-05, "loss": 1.2893, "step": 523 }, { "epoch": 2.7578947368421054, "grad_norm": 1.1014263843169803, "learning_rate": 3.977953847381998e-05, "loss": 1.2715, "step": 524 }, { "epoch": 2.763157894736842, "grad_norm": 1.1159951506001466, "learning_rate": 3.963256743021095e-05, "loss": 1.2785, "step": 525 }, { "epoch": 2.768421052631579, "grad_norm": 1.2507849560008404, "learning_rate": 3.9485601347321486e-05, "loss": 1.2906, "step": 526 }, { "epoch": 2.7736842105263158, "grad_norm": 0.8936481542314029, "learning_rate": 3.933864220934566e-05, "loss": 1.2669, "step": 527 }, { "epoch": 2.7789473684210524, "grad_norm": 0.9849739951015418, "learning_rate": 3.919169200038379e-05, "loss": 1.2771, "step": 528 }, { "epoch": 2.7842105263157895, "grad_norm": 0.837609493415301, "learning_rate": 3.904475270441561e-05, "loss": 1.266, "step": 529 }, { "epoch": 2.7894736842105265, "grad_norm": 0.7219038051900546, "learning_rate": 3.889782630527353e-05, "loss": 1.2726, "step": 530 }, { "epoch": 2.794736842105263, "grad_norm": 0.7505423536800923, "learning_rate": 3.875091478661585e-05, "loss": 1.2703, "step": 531 }, { "epoch": 2.8, "grad_norm": 0.7301064067667716, "learning_rate": 3.860402013189998e-05, "loss": 1.2812, "step": 532 }, { "epoch": 2.805263157894737, "grad_norm": 0.822332511103024, "learning_rate": 3.845714432435558e-05, "loss": 1.2718, "step": 533 }, { "epoch": 2.8105263157894735, "grad_norm": 0.7783007350783687, "learning_rate": 3.8310289346957965e-05, "loss": 1.2574, "step": 534 }, { "epoch": 2.8157894736842106, "grad_norm": 0.957233585531184, "learning_rate": 3.816345718240113e-05, "loss": 1.2805, "step": 535 }, { "epoch": 2.8210526315789473, "grad_norm": 1.1226403614971794, "learning_rate": 3.8016649813071106e-05, "loss": 1.2983, "step": 536 }, { "epoch": 2.8263157894736843, "grad_norm": 1.306496898793406, "learning_rate": 3.7869869221019177e-05, "loss": 1.2727, "step": 537 }, { "epoch": 2.831578947368421, "grad_norm": 0.5971018924485769, "learning_rate": 3.772311738793507e-05, "loss": 1.2834, "step": 538 }, { "epoch": 2.836842105263158, "grad_norm": 0.9138436747802899, "learning_rate": 3.757639629512026e-05, "loss": 1.2871, "step": 539 }, { "epoch": 2.8421052631578947, "grad_norm": 1.2871243761882003, "learning_rate": 3.74297079234612e-05, "loss": 1.2797, "step": 540 }, { "epoch": 2.8473684210526313, "grad_norm": 0.951777404285425, "learning_rate": 3.7283054253402574e-05, "loss": 1.2754, "step": 541 }, { "epoch": 2.8526315789473684, "grad_norm": 0.9411344162055937, "learning_rate": 3.713643726492053e-05, "loss": 1.2721, "step": 542 }, { "epoch": 2.8578947368421055, "grad_norm": 1.317846119765965, "learning_rate": 3.698985893749599e-05, "loss": 1.2887, "step": 543 }, { "epoch": 2.863157894736842, "grad_norm": 0.8089940300375972, "learning_rate": 3.6843321250087966e-05, "loss": 1.2848, "step": 544 }, { "epoch": 2.8684210526315788, "grad_norm": 0.5361830354797774, "learning_rate": 3.669682618110671e-05, "loss": 1.2657, "step": 545 }, { "epoch": 2.873684210526316, "grad_norm": 0.8183317690070797, "learning_rate": 3.655037570838711e-05, "loss": 1.2866, "step": 546 }, { "epoch": 2.8789473684210525, "grad_norm": 1.2353845817797051, "learning_rate": 3.640397180916197e-05, "loss": 1.2806, "step": 547 }, { "epoch": 2.8842105263157896, "grad_norm": 1.275438903457986, "learning_rate": 3.62576164600353e-05, "loss": 1.3042, "step": 548 }, { "epoch": 2.889473684210526, "grad_norm": 0.5384567882124249, "learning_rate": 3.611131163695561e-05, "loss": 1.2689, "step": 549 }, { "epoch": 2.8947368421052633, "grad_norm": 0.7803333541473619, "learning_rate": 3.5965059315189274e-05, "loss": 1.2797, "step": 550 }, { "epoch": 2.9, "grad_norm": 1.4689702016559818, "learning_rate": 3.581886146929387e-05, "loss": 1.2648, "step": 551 }, { "epoch": 2.905263157894737, "grad_norm": 0.6546922206210783, "learning_rate": 3.567272007309145e-05, "loss": 1.279, "step": 552 }, { "epoch": 2.9105263157894736, "grad_norm": 0.706490220466843, "learning_rate": 3.552663709964194e-05, "loss": 1.2735, "step": 553 }, { "epoch": 2.9157894736842103, "grad_norm": 1.3658529786651115, "learning_rate": 3.538061452121656e-05, "loss": 1.2916, "step": 554 }, { "epoch": 2.9210526315789473, "grad_norm": 0.9489706293231372, "learning_rate": 3.523465430927106e-05, "loss": 1.2918, "step": 555 }, { "epoch": 2.9263157894736844, "grad_norm": 0.8764748345395458, "learning_rate": 3.50887584344192e-05, "loss": 1.3015, "step": 556 }, { "epoch": 2.931578947368421, "grad_norm": 0.5355041821626928, "learning_rate": 3.494292886640615e-05, "loss": 1.2751, "step": 557 }, { "epoch": 2.9368421052631577, "grad_norm": 0.9242505060104863, "learning_rate": 3.479716757408185e-05, "loss": 1.2819, "step": 558 }, { "epoch": 2.942105263157895, "grad_norm": 1.2554507748275814, "learning_rate": 3.465147652537443e-05, "loss": 1.276, "step": 559 }, { "epoch": 2.9473684210526314, "grad_norm": 0.9095910936651702, "learning_rate": 3.4505857687263675e-05, "loss": 1.2753, "step": 560 }, { "epoch": 2.9526315789473685, "grad_norm": 0.6651860681009616, "learning_rate": 3.4360313025754476e-05, "loss": 1.2695, "step": 561 }, { "epoch": 2.957894736842105, "grad_norm": 0.9291042715583687, "learning_rate": 3.421484450585023e-05, "loss": 1.2961, "step": 562 }, { "epoch": 2.963157894736842, "grad_norm": 1.3279167632034623, "learning_rate": 3.406945409152632e-05, "loss": 1.2858, "step": 563 }, { "epoch": 2.968421052631579, "grad_norm": 0.5900225146576717, "learning_rate": 3.392414374570371e-05, "loss": 1.2786, "step": 564 }, { "epoch": 2.973684210526316, "grad_norm": 0.8721553828236943, "learning_rate": 3.377891543022229e-05, "loss": 1.2712, "step": 565 }, { "epoch": 2.9789473684210526, "grad_norm": 1.0505950441437681, "learning_rate": 3.363377110581442e-05, "loss": 1.2719, "step": 566 }, { "epoch": 2.984210526315789, "grad_norm": 1.3040980751717048, "learning_rate": 3.348871273207861e-05, "loss": 1.2961, "step": 567 }, { "epoch": 2.9894736842105263, "grad_norm": 0.7750168697948207, "learning_rate": 3.334374226745285e-05, "loss": 1.287, "step": 568 }, { "epoch": 2.9947368421052634, "grad_norm": 0.797465388668966, "learning_rate": 3.319886166918829e-05, "loss": 1.2798, "step": 569 }, { "epoch": 3.0, "grad_norm": 0.7195120795363182, "learning_rate": 3.305407289332279e-05, "loss": 1.2516, "step": 570 }, { "epoch": 3.0052631578947366, "grad_norm": 0.9683227012264453, "learning_rate": 3.290937789465454e-05, "loss": 1.245, "step": 571 }, { "epoch": 3.0105263157894737, "grad_norm": 1.0758712880323096, "learning_rate": 3.276477862671562e-05, "loss": 1.2628, "step": 572 }, { "epoch": 3.0157894736842104, "grad_norm": 0.9883859720793654, "learning_rate": 3.262027704174561e-05, "loss": 1.2509, "step": 573 }, { "epoch": 3.0210526315789474, "grad_norm": 0.9615705195781193, "learning_rate": 3.247587509066535e-05, "loss": 1.264, "step": 574 }, { "epoch": 3.026315789473684, "grad_norm": 0.7910888117572926, "learning_rate": 3.2331574723050474e-05, "loss": 1.2454, "step": 575 }, { "epoch": 3.031578947368421, "grad_norm": 0.6384643157937262, "learning_rate": 3.218737788710509e-05, "loss": 1.2538, "step": 576 }, { "epoch": 3.036842105263158, "grad_norm": 0.5368984975332023, "learning_rate": 3.2043286529635614e-05, "loss": 1.2587, "step": 577 }, { "epoch": 3.042105263157895, "grad_norm": 0.6163486545813159, "learning_rate": 3.189930259602433e-05, "loss": 1.2452, "step": 578 }, { "epoch": 3.0473684210526315, "grad_norm": 0.72931652275654, "learning_rate": 3.175542803020319e-05, "loss": 1.2414, "step": 579 }, { "epoch": 3.0526315789473686, "grad_norm": 0.8975216647677429, "learning_rate": 3.161166477462759e-05, "loss": 1.2562, "step": 580 }, { "epoch": 3.057894736842105, "grad_norm": 0.6779341882405682, "learning_rate": 3.146801477025013e-05, "loss": 1.259, "step": 581 }, { "epoch": 3.0631578947368423, "grad_norm": 0.35876586554623435, "learning_rate": 3.132447995649438e-05, "loss": 1.2439, "step": 582 }, { "epoch": 3.068421052631579, "grad_norm": 0.4633889396883284, "learning_rate": 3.11810622712287e-05, "loss": 1.2443, "step": 583 }, { "epoch": 3.0736842105263156, "grad_norm": 0.37725222875354836, "learning_rate": 3.103776365074017e-05, "loss": 1.244, "step": 584 }, { "epoch": 3.0789473684210527, "grad_norm": 0.48272932389464906, "learning_rate": 3.089458602970828e-05, "loss": 1.2509, "step": 585 }, { "epoch": 3.0842105263157893, "grad_norm": 0.5490877544866288, "learning_rate": 3.075153134117893e-05, "loss": 1.264, "step": 586 }, { "epoch": 3.0894736842105264, "grad_norm": 0.4207792008385385, "learning_rate": 3.060860151653837e-05, "loss": 1.2519, "step": 587 }, { "epoch": 3.094736842105263, "grad_norm": 0.44337087568296857, "learning_rate": 3.046579848548697e-05, "loss": 1.2387, "step": 588 }, { "epoch": 3.1, "grad_norm": 0.4345387632778484, "learning_rate": 3.0323124176013297e-05, "loss": 1.2471, "step": 589 }, { "epoch": 3.1052631578947367, "grad_norm": 0.34646556904906206, "learning_rate": 3.0180580514368037e-05, "loss": 1.2574, "step": 590 }, { "epoch": 3.110526315789474, "grad_norm": 0.5480997618564871, "learning_rate": 3.0038169425038007e-05, "loss": 1.2483, "step": 591 }, { "epoch": 3.1157894736842104, "grad_norm": 0.3643135054525602, "learning_rate": 2.9895892830720137e-05, "loss": 1.2586, "step": 592 }, { "epoch": 3.1210526315789475, "grad_norm": 0.3565417102254677, "learning_rate": 2.9753752652295538e-05, "loss": 1.2391, "step": 593 }, { "epoch": 3.126315789473684, "grad_norm": 0.4591251447075665, "learning_rate": 2.961175080880362e-05, "loss": 1.2496, "step": 594 }, { "epoch": 3.1315789473684212, "grad_norm": 0.46367424343953406, "learning_rate": 2.9469889217416045e-05, "loss": 1.2466, "step": 595 }, { "epoch": 3.136842105263158, "grad_norm": 0.3997287611132656, "learning_rate": 2.9328169793410954e-05, "loss": 1.2458, "step": 596 }, { "epoch": 3.1421052631578945, "grad_norm": 0.39902269987295985, "learning_rate": 2.918659445014713e-05, "loss": 1.2415, "step": 597 }, { "epoch": 3.1473684210526316, "grad_norm": 0.3404609072909432, "learning_rate": 2.9045165099038066e-05, "loss": 1.2631, "step": 598 }, { "epoch": 3.1526315789473682, "grad_norm": 0.3396481784449944, "learning_rate": 2.890388364952623e-05, "loss": 1.2548, "step": 599 }, { "epoch": 3.1578947368421053, "grad_norm": 0.37782629450525207, "learning_rate": 2.8762752009057232e-05, "loss": 1.2617, "step": 600 }, { "epoch": 3.163157894736842, "grad_norm": 0.5589592818613579, "learning_rate": 2.8621772083054157e-05, "loss": 1.2594, "step": 601 }, { "epoch": 3.168421052631579, "grad_norm": 0.42966453281721634, "learning_rate": 2.8480945774891764e-05, "loss": 1.2413, "step": 602 }, { "epoch": 3.1736842105263157, "grad_norm": 0.37959826731834306, "learning_rate": 2.83402749858708e-05, "loss": 1.2509, "step": 603 }, { "epoch": 3.1789473684210527, "grad_norm": 0.4661717251353946, "learning_rate": 2.819976161519236e-05, "loss": 1.2629, "step": 604 }, { "epoch": 3.1842105263157894, "grad_norm": 0.31707150511990617, "learning_rate": 2.805940755993223e-05, "loss": 1.2446, "step": 605 }, { "epoch": 3.1894736842105265, "grad_norm": 0.3596061389333874, "learning_rate": 2.7919214715015236e-05, "loss": 1.2487, "step": 606 }, { "epoch": 3.194736842105263, "grad_norm": 0.3125529192407293, "learning_rate": 2.7779184973189773e-05, "loss": 1.2575, "step": 607 }, { "epoch": 3.2, "grad_norm": 0.39929761965783167, "learning_rate": 2.7639320225002108e-05, "loss": 1.2563, "step": 608 }, { "epoch": 3.205263157894737, "grad_norm": 0.357481051645098, "learning_rate": 2.7499622358770936e-05, "loss": 1.2399, "step": 609 }, { "epoch": 3.2105263157894735, "grad_norm": 0.3253036562346321, "learning_rate": 2.7360093260561904e-05, "loss": 1.2587, "step": 610 }, { "epoch": 3.2157894736842105, "grad_norm": 1.0866422737509416, "learning_rate": 2.722073481416208e-05, "loss": 1.253, "step": 611 }, { "epoch": 3.221052631578947, "grad_norm": 0.3704975946750915, "learning_rate": 2.7081548901054574e-05, "loss": 1.2449, "step": 612 }, { "epoch": 3.2263157894736842, "grad_norm": 0.39681349064147786, "learning_rate": 2.6942537400393117e-05, "loss": 1.2393, "step": 613 }, { "epoch": 3.231578947368421, "grad_norm": 0.38789399072411884, "learning_rate": 2.680370218897669e-05, "loss": 1.2476, "step": 614 }, { "epoch": 3.236842105263158, "grad_norm": 0.6056318300360599, "learning_rate": 2.6665045141224193e-05, "loss": 1.2498, "step": 615 }, { "epoch": 3.2421052631578946, "grad_norm": 0.5268591378002944, "learning_rate": 2.6526568129149103e-05, "loss": 1.2509, "step": 616 }, { "epoch": 3.2473684210526317, "grad_norm": 0.5275312902783164, "learning_rate": 2.638827302233428e-05, "loss": 1.2581, "step": 617 }, { "epoch": 3.2526315789473683, "grad_norm": 0.37709353602618134, "learning_rate": 2.625016168790664e-05, "loss": 1.2533, "step": 618 }, { "epoch": 3.2578947368421054, "grad_norm": 0.3270640411740736, "learning_rate": 2.611223599051198e-05, "loss": 1.2743, "step": 619 }, { "epoch": 3.263157894736842, "grad_norm": 0.32059620385654264, "learning_rate": 2.597449779228983e-05, "loss": 1.2568, "step": 620 }, { "epoch": 3.268421052631579, "grad_norm": 0.39808261047072035, "learning_rate": 2.5836948952848255e-05, "loss": 1.2525, "step": 621 }, { "epoch": 3.2736842105263158, "grad_norm": 0.34113291757836145, "learning_rate": 2.5699591329238812e-05, "loss": 1.268, "step": 622 }, { "epoch": 3.2789473684210524, "grad_norm": 0.3042065217936969, "learning_rate": 2.5562426775931418e-05, "loss": 1.2483, "step": 623 }, { "epoch": 3.2842105263157895, "grad_norm": 0.3974087061640213, "learning_rate": 2.5425457144789364e-05, "loss": 1.2609, "step": 624 }, { "epoch": 3.2894736842105265, "grad_norm": 0.321409927169932, "learning_rate": 2.5288684285044283e-05, "loss": 1.255, "step": 625 }, { "epoch": 3.294736842105263, "grad_norm": 0.385869925356024, "learning_rate": 2.5152110043271166e-05, "loss": 1.2576, "step": 626 }, { "epoch": 3.3, "grad_norm": 0.3637124957498029, "learning_rate": 2.501573626336352e-05, "loss": 1.2411, "step": 627 }, { "epoch": 3.305263157894737, "grad_norm": 0.3710385685313032, "learning_rate": 2.4879564786508343e-05, "loss": 1.2592, "step": 628 }, { "epoch": 3.3105263157894735, "grad_norm": 0.4487560727745529, "learning_rate": 2.474359745116136e-05, "loss": 1.2404, "step": 629 }, { "epoch": 3.3157894736842106, "grad_norm": 0.3231869771450256, "learning_rate": 2.460783609302218e-05, "loss": 1.2547, "step": 630 }, { "epoch": 3.3210526315789473, "grad_norm": 0.4088431022056057, "learning_rate": 2.4472282545009493e-05, "loss": 1.2548, "step": 631 }, { "epoch": 3.3263157894736843, "grad_norm": 0.29515450703495905, "learning_rate": 2.4336938637236352e-05, "loss": 1.2525, "step": 632 }, { "epoch": 3.331578947368421, "grad_norm": 0.33297468568328076, "learning_rate": 2.4201806196985426e-05, "loss": 1.2737, "step": 633 }, { "epoch": 3.336842105263158, "grad_norm": 0.3335294632136315, "learning_rate": 2.4066887048684394e-05, "loss": 1.2447, "step": 634 }, { "epoch": 3.3421052631578947, "grad_norm": 0.2879112803644998, "learning_rate": 2.393218301388123e-05, "loss": 1.2715, "step": 635 }, { "epoch": 3.3473684210526318, "grad_norm": 0.3133592323848536, "learning_rate": 2.3797695911219668e-05, "loss": 1.2561, "step": 636 }, { "epoch": 3.3526315789473684, "grad_norm": 0.2430811283889928, "learning_rate": 2.3663427556414664e-05, "loss": 1.2601, "step": 637 }, { "epoch": 3.3578947368421055, "grad_norm": 0.3579114126056535, "learning_rate": 2.352937976222781e-05, "loss": 1.253, "step": 638 }, { "epoch": 3.363157894736842, "grad_norm": 0.26852343656836425, "learning_rate": 2.3395554338442908e-05, "loss": 1.245, "step": 639 }, { "epoch": 3.3684210526315788, "grad_norm": 0.3011001164622397, "learning_rate": 2.3261953091841553e-05, "loss": 1.2546, "step": 640 }, { "epoch": 3.373684210526316, "grad_norm": 0.26481840311987703, "learning_rate": 2.3128577826178723e-05, "loss": 1.2606, "step": 641 }, { "epoch": 3.3789473684210525, "grad_norm": 0.3257272912007352, "learning_rate": 2.2995430342158365e-05, "loss": 1.2353, "step": 642 }, { "epoch": 3.3842105263157896, "grad_norm": 0.38000488426555273, "learning_rate": 2.2862512437409162e-05, "loss": 1.2423, "step": 643 }, { "epoch": 3.389473684210526, "grad_norm": 0.29866098174675637, "learning_rate": 2.272982590646029e-05, "loss": 1.2653, "step": 644 }, { "epoch": 3.3947368421052633, "grad_norm": 0.7482169914063777, "learning_rate": 2.2597372540717083e-05, "loss": 1.2591, "step": 645 }, { "epoch": 3.4, "grad_norm": 0.48442639740243737, "learning_rate": 2.24651541284369e-05, "loss": 1.2748, "step": 646 }, { "epoch": 3.405263157894737, "grad_norm": 0.3933443985045218, "learning_rate": 2.233317245470504e-05, "loss": 1.2491, "step": 647 }, { "epoch": 3.4105263157894736, "grad_norm": 0.4653265340743596, "learning_rate": 2.220142930141054e-05, "loss": 1.2592, "step": 648 }, { "epoch": 3.4157894736842107, "grad_norm": 0.42673076337011967, "learning_rate": 2.206992644722216e-05, "loss": 1.2396, "step": 649 }, { "epoch": 3.4210526315789473, "grad_norm": 0.33876188196334395, "learning_rate": 2.1938665667564435e-05, "loss": 1.2436, "step": 650 }, { "epoch": 3.4263157894736844, "grad_norm": 0.3630588058950603, "learning_rate": 2.1807648734593558e-05, "loss": 1.2557, "step": 651 }, { "epoch": 3.431578947368421, "grad_norm": 0.3529402619316208, "learning_rate": 2.167687741717358e-05, "loss": 1.2536, "step": 652 }, { "epoch": 3.4368421052631577, "grad_norm": 0.3145009910486473, "learning_rate": 2.1546353480852495e-05, "loss": 1.2465, "step": 653 }, { "epoch": 3.442105263157895, "grad_norm": 0.2825566028878834, "learning_rate": 2.1416078687838403e-05, "loss": 1.2543, "step": 654 }, { "epoch": 3.4473684210526314, "grad_norm": 0.2872680469582709, "learning_rate": 2.1286054796975696e-05, "loss": 1.2637, "step": 655 }, { "epoch": 3.4526315789473685, "grad_norm": 0.2802498708050248, "learning_rate": 2.115628356372131e-05, "loss": 1.245, "step": 656 }, { "epoch": 3.457894736842105, "grad_norm": 0.2779169417503312, "learning_rate": 2.1026766740121096e-05, "loss": 1.2548, "step": 657 }, { "epoch": 3.463157894736842, "grad_norm": 0.27790502165031583, "learning_rate": 2.089750607478606e-05, "loss": 1.2482, "step": 658 }, { "epoch": 3.468421052631579, "grad_norm": 0.3106234637273863, "learning_rate": 2.076850331286881e-05, "loss": 1.2474, "step": 659 }, { "epoch": 3.473684210526316, "grad_norm": 0.2460612966298966, "learning_rate": 2.063976019604006e-05, "loss": 1.2578, "step": 660 }, { "epoch": 3.4789473684210526, "grad_norm": 0.4002624603687612, "learning_rate": 2.0511278462464933e-05, "loss": 1.2323, "step": 661 }, { "epoch": 3.4842105263157896, "grad_norm": 0.3558072656216221, "learning_rate": 2.038305984677969e-05, "loss": 1.2513, "step": 662 }, { "epoch": 3.4894736842105263, "grad_norm": 0.32674276214626774, "learning_rate": 2.025510608006819e-05, "loss": 1.248, "step": 663 }, { "epoch": 3.4947368421052634, "grad_norm": 0.3685362965399088, "learning_rate": 2.012741888983861e-05, "loss": 1.2612, "step": 664 }, { "epoch": 3.5, "grad_norm": 0.3851874388241183, "learning_rate": 2.0000000000000012e-05, "loss": 1.26, "step": 665 }, { "epoch": 3.5052631578947366, "grad_norm": 0.2922093360847206, "learning_rate": 1.9872851130839126e-05, "loss": 1.2503, "step": 666 }, { "epoch": 3.5105263157894737, "grad_norm": 0.2982128935849179, "learning_rate": 1.9745973998997177e-05, "loss": 1.2461, "step": 667 }, { "epoch": 3.515789473684211, "grad_norm": 0.36881831273338744, "learning_rate": 1.9619370317446612e-05, "loss": 1.2627, "step": 668 }, { "epoch": 3.5210526315789474, "grad_norm": 0.25559075127742553, "learning_rate": 1.9493041795468018e-05, "loss": 1.2474, "step": 669 }, { "epoch": 3.526315789473684, "grad_norm": 0.6103223603779421, "learning_rate": 1.9366990138627054e-05, "loss": 1.2553, "step": 670 }, { "epoch": 3.531578947368421, "grad_norm": 0.32053875904249984, "learning_rate": 1.9241217048751406e-05, "loss": 1.2716, "step": 671 }, { "epoch": 3.536842105263158, "grad_norm": 0.32627511828160094, "learning_rate": 1.911572422390783e-05, "loss": 1.2509, "step": 672 }, { "epoch": 3.542105263157895, "grad_norm": 0.31231339980121575, "learning_rate": 1.899051335837919e-05, "loss": 1.2542, "step": 673 }, { "epoch": 3.5473684210526315, "grad_norm": 0.31642734990082777, "learning_rate": 1.886558614264165e-05, "loss": 1.2544, "step": 674 }, { "epoch": 3.5526315789473686, "grad_norm": 0.41419322615420073, "learning_rate": 1.8740944263341773e-05, "loss": 1.2722, "step": 675 }, { "epoch": 3.557894736842105, "grad_norm": 0.2575000448429207, "learning_rate": 1.8616589403273776e-05, "loss": 1.251, "step": 676 }, { "epoch": 3.5631578947368423, "grad_norm": 0.45829370611833337, "learning_rate": 1.8492523241356877e-05, "loss": 1.2552, "step": 677 }, { "epoch": 3.568421052631579, "grad_norm": 0.3876144668681015, "learning_rate": 1.8368747452612504e-05, "loss": 1.2756, "step": 678 }, { "epoch": 3.5736842105263156, "grad_norm": 0.3605137220418223, "learning_rate": 1.8245263708141782e-05, "loss": 1.242, "step": 679 }, { "epoch": 3.5789473684210527, "grad_norm": 0.3947355937612717, "learning_rate": 1.8122073675102935e-05, "loss": 1.2556, "step": 680 }, { "epoch": 3.5842105263157897, "grad_norm": 0.29347916836402094, "learning_rate": 1.7999179016688763e-05, "loss": 1.26, "step": 681 }, { "epoch": 3.5894736842105264, "grad_norm": 0.32495295214844105, "learning_rate": 1.7876581392104225e-05, "loss": 1.2496, "step": 682 }, { "epoch": 3.594736842105263, "grad_norm": 0.2493724682619427, "learning_rate": 1.7754282456543977e-05, "loss": 1.2514, "step": 683 }, { "epoch": 3.6, "grad_norm": 0.35605401548647925, "learning_rate": 1.7632283861170135e-05, "loss": 1.2539, "step": 684 }, { "epoch": 3.6052631578947367, "grad_norm": 0.2630345804072707, "learning_rate": 1.7510587253089842e-05, "loss": 1.2579, "step": 685 }, { "epoch": 3.610526315789474, "grad_norm": 0.2772719177300871, "learning_rate": 1.7389194275333124e-05, "loss": 1.2471, "step": 686 }, { "epoch": 3.6157894736842104, "grad_norm": 0.3256551364716347, "learning_rate": 1.7268106566830713e-05, "loss": 1.2562, "step": 687 }, { "epoch": 3.6210526315789475, "grad_norm": 0.2942105351769792, "learning_rate": 1.7147325762391848e-05, "loss": 1.2664, "step": 688 }, { "epoch": 3.626315789473684, "grad_norm": 0.29601761914650015, "learning_rate": 1.702685349268226e-05, "loss": 1.2559, "step": 689 }, { "epoch": 3.6315789473684212, "grad_norm": 0.2759560921461832, "learning_rate": 1.690669138420215e-05, "loss": 1.2591, "step": 690 }, { "epoch": 3.636842105263158, "grad_norm": 0.2440653651529168, "learning_rate": 1.6786841059264217e-05, "loss": 1.2574, "step": 691 }, { "epoch": 3.6421052631578945, "grad_norm": 0.303898127022955, "learning_rate": 1.6667304135971756e-05, "loss": 1.2547, "step": 692 }, { "epoch": 3.6473684210526316, "grad_norm": 0.2481861381786453, "learning_rate": 1.65480822281968e-05, "loss": 1.2488, "step": 693 }, { "epoch": 3.6526315789473687, "grad_norm": 0.2565499348104272, "learning_rate": 1.6429176945558413e-05, "loss": 1.2561, "step": 694 }, { "epoch": 3.6578947368421053, "grad_norm": 0.3224687182653659, "learning_rate": 1.6310589893400804e-05, "loss": 1.247, "step": 695 }, { "epoch": 3.663157894736842, "grad_norm": 0.25279520055905946, "learning_rate": 1.6192322672771793e-05, "loss": 1.2636, "step": 696 }, { "epoch": 3.668421052631579, "grad_norm": 0.3239078414093973, "learning_rate": 1.6074376880401147e-05, "loss": 1.2431, "step": 697 }, { "epoch": 3.6736842105263157, "grad_norm": 0.25211963429157525, "learning_rate": 1.5956754108678996e-05, "loss": 1.2489, "step": 698 }, { "epoch": 3.6789473684210527, "grad_norm": 0.3288796816421695, "learning_rate": 1.5839455945634372e-05, "loss": 1.2433, "step": 699 }, { "epoch": 3.6842105263157894, "grad_norm": 0.2570823868070139, "learning_rate": 1.5722483974913737e-05, "loss": 1.2437, "step": 700 }, { "epoch": 3.6894736842105265, "grad_norm": 0.23205884441696503, "learning_rate": 1.560583977575964e-05, "loss": 1.2558, "step": 701 }, { "epoch": 3.694736842105263, "grad_norm": 0.254586157360098, "learning_rate": 1.5489524922989367e-05, "loss": 1.2677, "step": 702 }, { "epoch": 3.7, "grad_norm": 0.2528078741758432, "learning_rate": 1.537354098697367e-05, "loss": 1.2521, "step": 703 }, { "epoch": 3.705263157894737, "grad_norm": 0.30438966904710707, "learning_rate": 1.525788953361563e-05, "loss": 1.2569, "step": 704 }, { "epoch": 3.7105263157894735, "grad_norm": 0.24585215855274448, "learning_rate": 1.5142572124329418e-05, "loss": 1.2582, "step": 705 }, { "epoch": 3.7157894736842105, "grad_norm": 0.23812179037555448, "learning_rate": 1.5027590316019276e-05, "loss": 1.2582, "step": 706 }, { "epoch": 3.7210526315789476, "grad_norm": 0.2258598951803704, "learning_rate": 1.491294566105852e-05, "loss": 1.2398, "step": 707 }, { "epoch": 3.7263157894736842, "grad_norm": 0.23820145432975506, "learning_rate": 1.4798639707268509e-05, "loss": 1.26, "step": 708 }, { "epoch": 3.731578947368421, "grad_norm": 0.27098791758934043, "learning_rate": 1.4684673997897795e-05, "loss": 1.2467, "step": 709 }, { "epoch": 3.736842105263158, "grad_norm": 0.1895081621315529, "learning_rate": 1.457105007160129e-05, "loss": 1.2469, "step": 710 }, { "epoch": 3.7421052631578946, "grad_norm": 0.24431380487075854, "learning_rate": 1.4457769462419461e-05, "loss": 1.2505, "step": 711 }, { "epoch": 3.7473684210526317, "grad_norm": 0.2598287894690381, "learning_rate": 1.4344833699757662e-05, "loss": 1.2733, "step": 712 }, { "epoch": 3.7526315789473683, "grad_norm": 0.24173801356915325, "learning_rate": 1.4232244308365437e-05, "loss": 1.2515, "step": 713 }, { "epoch": 3.7578947368421054, "grad_norm": 0.2744768545995936, "learning_rate": 1.4120002808315999e-05, "loss": 1.2446, "step": 714 }, { "epoch": 3.763157894736842, "grad_norm": 0.29075680429359135, "learning_rate": 1.4008110714985623e-05, "loss": 1.2576, "step": 715 }, { "epoch": 3.768421052631579, "grad_norm": 0.1679499052346039, "learning_rate": 1.3896569539033253e-05, "loss": 1.2434, "step": 716 }, { "epoch": 3.7736842105263158, "grad_norm": 0.21354680460803685, "learning_rate": 1.3785380786380103e-05, "loss": 1.2642, "step": 717 }, { "epoch": 3.7789473684210524, "grad_norm": 0.24355235079533985, "learning_rate": 1.367454595818928e-05, "loss": 1.2449, "step": 718 }, { "epoch": 3.7842105263157895, "grad_norm": 0.17842505149174132, "learning_rate": 1.3564066550845558e-05, "loss": 1.2399, "step": 719 }, { "epoch": 3.7894736842105265, "grad_norm": 0.2363958816949115, "learning_rate": 1.3453944055935151e-05, "loss": 1.2471, "step": 720 }, { "epoch": 3.794736842105263, "grad_norm": 0.20243183669778259, "learning_rate": 1.3344179960225603e-05, "loss": 1.2535, "step": 721 }, { "epoch": 3.8, "grad_norm": 0.2471952644451058, "learning_rate": 1.3234775745645684e-05, "loss": 1.2484, "step": 722 }, { "epoch": 3.805263157894737, "grad_norm": 0.21944384742054443, "learning_rate": 1.3125732889265393e-05, "loss": 1.2444, "step": 723 }, { "epoch": 3.8105263157894735, "grad_norm": 0.22384339943654685, "learning_rate": 1.3017052863276054e-05, "loss": 1.2544, "step": 724 }, { "epoch": 3.8157894736842106, "grad_norm": 0.20643300200194556, "learning_rate": 1.2908737134970367e-05, "loss": 1.2455, "step": 725 }, { "epoch": 3.8210526315789473, "grad_norm": 0.22387663232782792, "learning_rate": 1.2800787166722634e-05, "loss": 1.2415, "step": 726 }, { "epoch": 3.8263157894736843, "grad_norm": 0.23601246798864953, "learning_rate": 1.2693204415969068e-05, "loss": 1.2488, "step": 727 }, { "epoch": 3.831578947368421, "grad_norm": 0.21781387567237637, "learning_rate": 1.2585990335188014e-05, "loss": 1.2346, "step": 728 }, { "epoch": 3.836842105263158, "grad_norm": 0.20954968812529903, "learning_rate": 1.2479146371880408e-05, "loss": 1.25, "step": 729 }, { "epoch": 3.8421052631578947, "grad_norm": 0.2935272323831709, "learning_rate": 1.2372673968550229e-05, "loss": 1.2575, "step": 730 }, { "epoch": 3.8473684210526313, "grad_norm": 0.23428135792560334, "learning_rate": 1.2266574562684994e-05, "loss": 1.2477, "step": 731 }, { "epoch": 3.8526315789473684, "grad_norm": 0.18658016102303704, "learning_rate": 1.2160849586736375e-05, "loss": 1.256, "step": 732 }, { "epoch": 3.8578947368421055, "grad_norm": 0.23105098493810466, "learning_rate": 1.2055500468100849e-05, "loss": 1.2399, "step": 733 }, { "epoch": 3.863157894736842, "grad_norm": 0.1929616859489707, "learning_rate": 1.1950528629100457e-05, "loss": 1.2515, "step": 734 }, { "epoch": 3.8684210526315788, "grad_norm": 0.218750003790284, "learning_rate": 1.1845935486963546e-05, "loss": 1.2489, "step": 735 }, { "epoch": 3.873684210526316, "grad_norm": 0.19977098349774547, "learning_rate": 1.1741722453805657e-05, "loss": 1.2449, "step": 736 }, { "epoch": 3.8789473684210525, "grad_norm": 0.23507506446012338, "learning_rate": 1.163789093661051e-05, "loss": 1.2562, "step": 737 }, { "epoch": 3.8842105263157896, "grad_norm": 0.19034197687876206, "learning_rate": 1.1534442337210919e-05, "loss": 1.2528, "step": 738 }, { "epoch": 3.889473684210526, "grad_norm": 0.25267159420496116, "learning_rate": 1.1431378052269934e-05, "loss": 1.2571, "step": 739 }, { "epoch": 3.8947368421052633, "grad_norm": 0.21369948030483346, "learning_rate": 1.1328699473261957e-05, "loss": 1.241, "step": 740 }, { "epoch": 3.9, "grad_norm": 0.23307740618119258, "learning_rate": 1.1226407986453963e-05, "loss": 1.2557, "step": 741 }, { "epoch": 3.905263157894737, "grad_norm": 0.19115969783367653, "learning_rate": 1.1124504972886782e-05, "loss": 1.2525, "step": 742 }, { "epoch": 3.9105263157894736, "grad_norm": 0.2681117091243346, "learning_rate": 1.1022991808356442e-05, "loss": 1.248, "step": 743 }, { "epoch": 3.9157894736842103, "grad_norm": 0.1651284103554666, "learning_rate": 1.0921869863395642e-05, "loss": 1.242, "step": 744 }, { "epoch": 3.9210526315789473, "grad_norm": 0.24161510420189317, "learning_rate": 1.0821140503255174e-05, "loss": 1.2555, "step": 745 }, { "epoch": 3.9263157894736844, "grad_norm": 0.20280135248319278, "learning_rate": 1.0720805087885533e-05, "loss": 1.2578, "step": 746 }, { "epoch": 3.931578947368421, "grad_norm": 0.3284807144434915, "learning_rate": 1.0620864971918579e-05, "loss": 1.259, "step": 747 }, { "epoch": 3.9368421052631577, "grad_norm": 0.22538990852777954, "learning_rate": 1.05213215046492e-05, "loss": 1.2597, "step": 748 }, { "epoch": 3.942105263157895, "grad_norm": 0.19055951323654136, "learning_rate": 1.0422176030017117e-05, "loss": 1.2443, "step": 749 }, { "epoch": 3.9473684210526314, "grad_norm": 0.18646041833135787, "learning_rate": 1.0323429886588743e-05, "loss": 1.2388, "step": 750 }, { "epoch": 3.9526315789473685, "grad_norm": 0.19285379546461523, "learning_rate": 1.0225084407539109e-05, "loss": 1.2335, "step": 751 }, { "epoch": 3.957894736842105, "grad_norm": 0.1997818414436052, "learning_rate": 1.0127140920633857e-05, "loss": 1.2439, "step": 752 }, { "epoch": 3.963157894736842, "grad_norm": 0.20149581036707856, "learning_rate": 1.0029600748211314e-05, "loss": 1.2415, "step": 753 }, { "epoch": 3.968421052631579, "grad_norm": 0.19260248911961064, "learning_rate": 9.932465207164675e-06, "loss": 1.2633, "step": 754 }, { "epoch": 3.973684210526316, "grad_norm": 0.21099578591151794, "learning_rate": 9.835735608924155e-06, "loss": 1.231, "step": 755 }, { "epoch": 3.9789473684210526, "grad_norm": 0.17132739675169845, "learning_rate": 9.739413259439337e-06, "loss": 1.2451, "step": 756 }, { "epoch": 3.984210526315789, "grad_norm": 0.21904215059223633, "learning_rate": 9.643499459161538e-06, "loss": 1.2523, "step": 757 }, { "epoch": 3.9894736842105263, "grad_norm": 0.224551193557602, "learning_rate": 9.547995503026217e-06, "loss": 1.2478, "step": 758 }, { "epoch": 3.9947368421052634, "grad_norm": 0.19238609932248696, "learning_rate": 9.452902680435527e-06, "loss": 1.249, "step": 759 }, { "epoch": 4.0, "grad_norm": 0.22055223309269914, "learning_rate": 9.358222275240884e-06, "loss": 1.2167, "step": 760 }, { "epoch": 4.005263157894737, "grad_norm": 0.20140128214778716, "learning_rate": 9.263955565725648e-06, "loss": 1.2391, "step": 761 }, { "epoch": 4.010526315789473, "grad_norm": 0.2068373210972995, "learning_rate": 9.170103824587855e-06, "loss": 1.2331, "step": 762 }, { "epoch": 4.015789473684211, "grad_norm": 0.18232115316386402, "learning_rate": 9.07666831892304e-06, "loss": 1.2121, "step": 763 }, { "epoch": 4.021052631578947, "grad_norm": 0.2188152260857773, "learning_rate": 8.983650310207142e-06, "loss": 1.2232, "step": 764 }, { "epoch": 4.026315789473684, "grad_norm": 0.1880274936269495, "learning_rate": 8.89105105427945e-06, "loss": 1.2272, "step": 765 }, { "epoch": 4.031578947368421, "grad_norm": 0.17030491611623289, "learning_rate": 8.798871801325632e-06, "loss": 1.2284, "step": 766 }, { "epoch": 4.036842105263158, "grad_norm": 0.1887119280020856, "learning_rate": 8.707113795860938e-06, "loss": 1.2364, "step": 767 }, { "epoch": 4.042105263157895, "grad_norm": 0.18907111180220373, "learning_rate": 8.615778276713293e-06, "loss": 1.2277, "step": 768 }, { "epoch": 4.0473684210526315, "grad_norm": 0.17028701794910334, "learning_rate": 8.524866477006637e-06, "loss": 1.2268, "step": 769 }, { "epoch": 4.052631578947368, "grad_norm": 0.1927239082270522, "learning_rate": 8.434379624144261e-06, "loss": 1.2202, "step": 770 }, { "epoch": 4.057894736842106, "grad_norm": 0.18231681740661396, "learning_rate": 8.344318939792232e-06, "loss": 1.2103, "step": 771 }, { "epoch": 4.063157894736842, "grad_norm": 0.2108141165888399, "learning_rate": 8.254685639862896e-06, "loss": 1.2289, "step": 772 }, { "epoch": 4.068421052631579, "grad_norm": 0.21501105777435195, "learning_rate": 8.165480934498462e-06, "loss": 1.2304, "step": 773 }, { "epoch": 4.073684210526316, "grad_norm": 0.22014095135466175, "learning_rate": 8.076706028054709e-06, "loss": 1.2395, "step": 774 }, { "epoch": 4.078947368421052, "grad_norm": 0.18281510166398557, "learning_rate": 7.988362119084642e-06, "loss": 1.232, "step": 775 }, { "epoch": 4.08421052631579, "grad_norm": 0.21712131045816194, "learning_rate": 7.90045040032236e-06, "loss": 1.2423, "step": 776 }, { "epoch": 4.089473684210526, "grad_norm": 0.19226805462323326, "learning_rate": 7.812972058666974e-06, "loss": 1.2295, "step": 777 }, { "epoch": 4.094736842105263, "grad_norm": 0.175015352113717, "learning_rate": 7.725928275166534e-06, "loss": 1.2282, "step": 778 }, { "epoch": 4.1, "grad_norm": 0.2095750364202842, "learning_rate": 7.639320225002106e-06, "loss": 1.2244, "step": 779 }, { "epoch": 4.105263157894737, "grad_norm": 0.19644672306841843, "learning_rate": 7.553149077471915e-06, "loss": 1.2217, "step": 780 }, { "epoch": 4.110526315789474, "grad_norm": 0.2000635414888708, "learning_rate": 7.46741599597554e-06, "loss": 1.2319, "step": 781 }, { "epoch": 4.11578947368421, "grad_norm": 0.1746543551783459, "learning_rate": 7.382122137998209e-06, "loss": 1.2282, "step": 782 }, { "epoch": 4.121052631578947, "grad_norm": 0.17481980918717463, "learning_rate": 7.297268655095213e-06, "loss": 1.2395, "step": 783 }, { "epoch": 4.126315789473685, "grad_norm": 0.17610089627569894, "learning_rate": 7.212856692876289e-06, "loss": 1.2319, "step": 784 }, { "epoch": 4.131578947368421, "grad_norm": 0.17566117386443802, "learning_rate": 7.128887390990198e-06, "loss": 1.2245, "step": 785 }, { "epoch": 4.136842105263158, "grad_norm": 0.18888285977402394, "learning_rate": 7.045361883109318e-06, "loss": 1.2363, "step": 786 }, { "epoch": 4.1421052631578945, "grad_norm": 0.1679963465599155, "learning_rate": 6.962281296914386e-06, "loss": 1.2319, "step": 787 }, { "epoch": 4.147368421052631, "grad_norm": 0.17232128719198106, "learning_rate": 6.8796467540791986e-06, "loss": 1.2312, "step": 788 }, { "epoch": 4.152631578947369, "grad_norm": 0.19685528274227304, "learning_rate": 6.797459370255519e-06, "loss": 1.2324, "step": 789 }, { "epoch": 4.157894736842105, "grad_norm": 0.1583456150516079, "learning_rate": 6.715720255058e-06, "loss": 1.24, "step": 790 }, { "epoch": 4.163157894736842, "grad_norm": 0.172328795648275, "learning_rate": 6.634430512049213e-06, "loss": 1.2513, "step": 791 }, { "epoch": 4.168421052631579, "grad_norm": 0.16257107292586506, "learning_rate": 6.553591238724712e-06, "loss": 1.2275, "step": 792 }, { "epoch": 4.173684210526316, "grad_norm": 0.14389724088218966, "learning_rate": 6.4732035264982904e-06, "loss": 1.2348, "step": 793 }, { "epoch": 4.178947368421053, "grad_norm": 0.15689066100797078, "learning_rate": 6.39326846068717e-06, "loss": 1.2179, "step": 794 }, { "epoch": 4.184210526315789, "grad_norm": 0.18533318047509703, "learning_rate": 6.313787120497376e-06, "loss": 1.236, "step": 795 }, { "epoch": 4.189473684210526, "grad_norm": 0.1459277700590749, "learning_rate": 6.234760579009167e-06, "loss": 1.2435, "step": 796 }, { "epoch": 4.1947368421052635, "grad_norm": 0.155103015306397, "learning_rate": 6.1561899031625794e-06, "loss": 1.2282, "step": 797 }, { "epoch": 4.2, "grad_norm": 0.1477347804716696, "learning_rate": 6.078076153742962e-06, "loss": 1.2249, "step": 798 }, { "epoch": 4.205263157894737, "grad_norm": 0.15276423763995617, "learning_rate": 6.000420385366687e-06, "loss": 1.2297, "step": 799 }, { "epoch": 4.2105263157894735, "grad_norm": 0.15126290143221918, "learning_rate": 5.923223646466923e-06, "loss": 1.2387, "step": 800 }, { "epoch": 4.215789473684211, "grad_norm": 0.15492928616201465, "learning_rate": 5.846486979279449e-06, "loss": 1.2367, "step": 801 }, { "epoch": 4.221052631578948, "grad_norm": 0.17188412280549703, "learning_rate": 5.770211419828604e-06, "loss": 1.2322, "step": 802 }, { "epoch": 4.226315789473684, "grad_norm": 0.15097184444197026, "learning_rate": 5.694397997913319e-06, "loss": 1.2321, "step": 803 }, { "epoch": 4.231578947368421, "grad_norm": 0.1453328152503722, "learning_rate": 5.619047737093164e-06, "loss": 1.2384, "step": 804 }, { "epoch": 4.2368421052631575, "grad_norm": 0.18220366542871314, "learning_rate": 5.5441616546745646e-06, "loss": 1.2383, "step": 805 }, { "epoch": 4.242105263157895, "grad_norm": 0.167450785630923, "learning_rate": 5.469740761697044e-06, "loss": 1.2426, "step": 806 }, { "epoch": 4.247368421052632, "grad_norm": 0.14931148609570408, "learning_rate": 5.395786062919622e-06, "loss": 1.2333, "step": 807 }, { "epoch": 4.252631578947368, "grad_norm": 0.16803901696022852, "learning_rate": 5.322298556807179e-06, "loss": 1.2417, "step": 808 }, { "epoch": 4.257894736842105, "grad_norm": 0.16226281008273294, "learning_rate": 5.249279235517031e-06, "loss": 1.2329, "step": 809 }, { "epoch": 4.2631578947368425, "grad_norm": 0.13999210516589425, "learning_rate": 5.176729084885508e-06, "loss": 1.2412, "step": 810 }, { "epoch": 4.268421052631579, "grad_norm": 0.18759320208384703, "learning_rate": 5.10464908441465e-06, "loss": 1.2357, "step": 811 }, { "epoch": 4.273684210526316, "grad_norm": 0.17340200354001228, "learning_rate": 5.033040207258979e-06, "loss": 1.2271, "step": 812 }, { "epoch": 4.278947368421052, "grad_norm": 0.15950936103038027, "learning_rate": 4.9619034202123884e-06, "loss": 1.2151, "step": 813 }, { "epoch": 4.284210526315789, "grad_norm": 0.15140529416594803, "learning_rate": 4.891239683695044e-06, "loss": 1.232, "step": 814 }, { "epoch": 4.2894736842105265, "grad_norm": 0.13998854140642578, "learning_rate": 4.821049951740442e-06, "loss": 1.2255, "step": 815 }, { "epoch": 4.294736842105263, "grad_norm": 0.1499375273785916, "learning_rate": 4.751335171982527e-06, "loss": 1.2314, "step": 816 }, { "epoch": 4.3, "grad_norm": 0.158191412279702, "learning_rate": 4.6820962856429205e-06, "loss": 1.234, "step": 817 }, { "epoch": 4.3052631578947365, "grad_norm": 0.14057999268017907, "learning_rate": 4.613334227518165e-06, "loss": 1.2427, "step": 818 }, { "epoch": 4.310526315789474, "grad_norm": 0.1433236145981049, "learning_rate": 4.545049925967137e-06, "loss": 1.2313, "step": 819 }, { "epoch": 4.315789473684211, "grad_norm": 0.1303171009707853, "learning_rate": 4.4772443028985004e-06, "loss": 1.2297, "step": 820 }, { "epoch": 4.321052631578947, "grad_norm": 0.15617718436585057, "learning_rate": 4.409918273758278e-06, "loss": 1.2412, "step": 821 }, { "epoch": 4.326315789473684, "grad_norm": 0.1476554630303936, "learning_rate": 4.343072747517459e-06, "loss": 1.2387, "step": 822 }, { "epoch": 4.331578947368421, "grad_norm": 0.1362192280798835, "learning_rate": 4.276708626659778e-06, "loss": 1.2349, "step": 823 }, { "epoch": 4.336842105263158, "grad_norm": 0.15051183831923126, "learning_rate": 4.2108268071694616e-06, "loss": 1.2122, "step": 824 }, { "epoch": 4.342105263157895, "grad_norm": 0.1445207500529269, "learning_rate": 4.1454281785191995e-06, "loss": 1.2224, "step": 825 }, { "epoch": 4.347368421052631, "grad_norm": 0.14362316701732558, "learning_rate": 4.080513623658075e-06, "loss": 1.2186, "step": 826 }, { "epoch": 4.352631578947369, "grad_norm": 0.1479016471804495, "learning_rate": 4.0160840189997155e-06, "loss": 1.2324, "step": 827 }, { "epoch": 4.3578947368421055, "grad_norm": 0.1397296336400901, "learning_rate": 3.952140234410396e-06, "loss": 1.2309, "step": 828 }, { "epoch": 4.363157894736842, "grad_norm": 0.12213103880797943, "learning_rate": 3.888683133197293e-06, "loss": 1.2231, "step": 829 }, { "epoch": 4.368421052631579, "grad_norm": 0.13169031113809618, "learning_rate": 3.825713572096903e-06, "loss": 1.2264, "step": 830 }, { "epoch": 4.373684210526315, "grad_norm": 0.1415624842501799, "learning_rate": 3.7632324012633992e-06, "loss": 1.2444, "step": 831 }, { "epoch": 4.378947368421053, "grad_norm": 0.14671744800493622, "learning_rate": 3.701240464257181e-06, "loss": 1.2183, "step": 832 }, { "epoch": 4.38421052631579, "grad_norm": 0.13323253635868224, "learning_rate": 3.6397385980335e-06, "loss": 1.2156, "step": 833 }, { "epoch": 4.389473684210526, "grad_norm": 0.13127420581089705, "learning_rate": 3.5787276329311315e-06, "loss": 1.2231, "step": 834 }, { "epoch": 4.394736842105263, "grad_norm": 0.133896287855281, "learning_rate": 3.518208392661184e-06, "loss": 1.2293, "step": 835 }, { "epoch": 4.4, "grad_norm": 0.13026051571456285, "learning_rate": 3.458181694295961e-06, "loss": 1.2395, "step": 836 }, { "epoch": 4.405263157894737, "grad_norm": 0.15010438499441964, "learning_rate": 3.398648348257969e-06, "loss": 1.2323, "step": 837 }, { "epoch": 4.410526315789474, "grad_norm": 0.1501949658490909, "learning_rate": 3.3396091583089275e-06, "loss": 1.2186, "step": 838 }, { "epoch": 4.41578947368421, "grad_norm": 0.1433763217867749, "learning_rate": 3.281064921538919e-06, "loss": 1.2379, "step": 839 }, { "epoch": 4.421052631578947, "grad_norm": 0.1323412042853926, "learning_rate": 3.2230164283556918e-06, "loss": 1.2231, "step": 840 }, { "epoch": 4.426315789473684, "grad_norm": 0.13635885589343097, "learning_rate": 3.1654644624739082e-06, "loss": 1.2297, "step": 841 }, { "epoch": 4.431578947368421, "grad_norm": 0.1323736124785357, "learning_rate": 3.1084098009046106e-06, "loss": 1.235, "step": 842 }, { "epoch": 4.436842105263158, "grad_norm": 0.13102236402292022, "learning_rate": 3.0518532139447267e-06, "loss": 1.2307, "step": 843 }, { "epoch": 4.442105263157894, "grad_norm": 0.15428668766385725, "learning_rate": 2.995795465166644e-06, "loss": 1.226, "step": 844 }, { "epoch": 4.447368421052632, "grad_norm": 0.13778512976226498, "learning_rate": 2.9402373114079295e-06, "loss": 1.2276, "step": 845 }, { "epoch": 4.4526315789473685, "grad_norm": 0.13474950636883365, "learning_rate": 2.8851795027610997e-06, "loss": 1.2228, "step": 846 }, { "epoch": 4.457894736842105, "grad_norm": 0.1353883744809194, "learning_rate": 2.83062278256351e-06, "loss": 1.2339, "step": 847 }, { "epoch": 4.463157894736842, "grad_norm": 0.13137189130014673, "learning_rate": 2.776567887387267e-06, "loss": 1.2301, "step": 848 }, { "epoch": 4.468421052631579, "grad_norm": 0.13126591401950521, "learning_rate": 2.723015547029344e-06, "loss": 1.2468, "step": 849 }, { "epoch": 4.473684210526316, "grad_norm": 0.1415673262181535, "learning_rate": 2.669966484501716e-06, "loss": 1.2245, "step": 850 }, { "epoch": 4.478947368421053, "grad_norm": 0.1320404723499411, "learning_rate": 2.6174214160215704e-06, "loss": 1.2352, "step": 851 }, { "epoch": 4.484210526315789, "grad_norm": 0.12633771710003897, "learning_rate": 2.5653810510016454e-06, "loss": 1.2339, "step": 852 }, { "epoch": 4.489473684210527, "grad_norm": 0.12316620532344269, "learning_rate": 2.5138460920406884e-06, "loss": 1.2317, "step": 853 }, { "epoch": 4.494736842105263, "grad_norm": 0.13602160694846396, "learning_rate": 2.462817234913919e-06, "loss": 1.2273, "step": 854 }, { "epoch": 4.5, "grad_norm": 0.1415982613618782, "learning_rate": 2.4122951685636674e-06, "loss": 1.2243, "step": 855 }, { "epoch": 4.505263157894737, "grad_norm": 0.3127501030193754, "learning_rate": 2.3622805750900567e-06, "loss": 1.2222, "step": 856 }, { "epoch": 4.510526315789473, "grad_norm": 0.13107864455151064, "learning_rate": 2.3127741297418283e-06, "loss": 1.2366, "step": 857 }, { "epoch": 4.515789473684211, "grad_norm": 0.13657403397982118, "learning_rate": 2.2637765009071576e-06, "loss": 1.2337, "step": 858 }, { "epoch": 4.521052631578947, "grad_norm": 0.1302788270408855, "learning_rate": 2.215288350104694e-06, "loss": 1.2253, "step": 859 }, { "epoch": 4.526315789473684, "grad_norm": 0.12664051031696197, "learning_rate": 2.1673103319746146e-06, "loss": 1.225, "step": 860 }, { "epoch": 4.531578947368421, "grad_norm": 0.14352455601262662, "learning_rate": 2.1198430942697625e-06, "loss": 1.2251, "step": 861 }, { "epoch": 4.536842105263158, "grad_norm": 0.13649018750914618, "learning_rate": 2.0728872778469224e-06, "loss": 1.2407, "step": 862 }, { "epoch": 4.542105263157895, "grad_norm": 0.20719895993192947, "learning_rate": 2.026443516658163e-06, "loss": 1.2272, "step": 863 }, { "epoch": 4.5473684210526315, "grad_norm": 0.13901759037964037, "learning_rate": 1.9805124377422834e-06, "loss": 1.2368, "step": 864 }, { "epoch": 4.552631578947368, "grad_norm": 0.1307517829866866, "learning_rate": 1.93509466121633e-06, "loss": 1.2318, "step": 865 }, { "epoch": 4.557894736842105, "grad_norm": 0.12095060465165465, "learning_rate": 1.8901908002672442e-06, "loss": 1.2359, "step": 866 }, { "epoch": 4.563157894736842, "grad_norm": 0.11915812322895941, "learning_rate": 1.8458014611435705e-06, "loss": 1.2426, "step": 867 }, { "epoch": 4.568421052631579, "grad_norm": 0.1240067541225263, "learning_rate": 1.80192724314729e-06, "loss": 1.2163, "step": 868 }, { "epoch": 4.573684210526316, "grad_norm": 0.12397138277266245, "learning_rate": 1.7585687386256944e-06, "loss": 1.2428, "step": 869 }, { "epoch": 4.578947368421053, "grad_norm": 0.13123863782173215, "learning_rate": 1.7157265329634354e-06, "loss": 1.2413, "step": 870 }, { "epoch": 4.58421052631579, "grad_norm": 0.13114076151140514, "learning_rate": 1.6734012045745762e-06, "loss": 1.2255, "step": 871 }, { "epoch": 4.589473684210526, "grad_norm": 0.11880095160971275, "learning_rate": 1.6315933248948068e-06, "loss": 1.2325, "step": 872 }, { "epoch": 4.594736842105263, "grad_norm": 0.133304713863376, "learning_rate": 1.5903034583737343e-06, "loss": 1.2406, "step": 873 }, { "epoch": 4.6, "grad_norm": 0.12445121300617833, "learning_rate": 1.5495321624672443e-06, "loss": 1.2323, "step": 874 }, { "epoch": 4.605263157894737, "grad_norm": 0.11989093911414492, "learning_rate": 1.5092799876299835e-06, "loss": 1.2152, "step": 875 }, { "epoch": 4.610526315789474, "grad_norm": 0.12318779958969978, "learning_rate": 1.4695474773079287e-06, "loss": 1.2274, "step": 876 }, { "epoch": 4.61578947368421, "grad_norm": 0.11755163812164948, "learning_rate": 1.4303351679310473e-06, "loss": 1.2323, "step": 877 }, { "epoch": 4.621052631578947, "grad_norm": 0.1271239652597123, "learning_rate": 1.3916435889060575e-06, "loss": 1.2281, "step": 878 }, { "epoch": 4.626315789473685, "grad_norm": 0.12826240424195157, "learning_rate": 1.353473262609275e-06, "loss": 1.2273, "step": 879 }, { "epoch": 4.631578947368421, "grad_norm": 0.12460164768857226, "learning_rate": 1.3158247043795735e-06, "loss": 1.2264, "step": 880 }, { "epoch": 4.636842105263158, "grad_norm": 0.11779769435490604, "learning_rate": 1.278698422511413e-06, "loss": 1.2243, "step": 881 }, { "epoch": 4.6421052631578945, "grad_norm": 0.11403097697307746, "learning_rate": 1.242094918247978e-06, "loss": 1.2283, "step": 882 }, { "epoch": 4.647368421052631, "grad_norm": 0.12118016084867007, "learning_rate": 1.2060146857744282e-06, "loss": 1.2392, "step": 883 }, { "epoch": 4.652631578947369, "grad_norm": 0.12319740930061163, "learning_rate": 1.1704582122112008e-06, "loss": 1.2088, "step": 884 }, { "epoch": 4.657894736842105, "grad_norm": 0.11386564708274247, "learning_rate": 1.1354259776074472e-06, "loss": 1.233, "step": 885 }, { "epoch": 4.663157894736842, "grad_norm": 0.11374999316034942, "learning_rate": 1.1009184549345632e-06, "loss": 1.2386, "step": 886 }, { "epoch": 4.668421052631579, "grad_norm": 0.12522042587937965, "learning_rate": 1.0669361100797704e-06, "loss": 1.2418, "step": 887 }, { "epoch": 4.673684210526316, "grad_norm": 0.11429258921626788, "learning_rate": 1.0334794018398652e-06, "loss": 1.2178, "step": 888 }, { "epoch": 4.678947368421053, "grad_norm": 0.34812757148076545, "learning_rate": 1.0005487819149917e-06, "loss": 1.2272, "step": 889 }, { "epoch": 4.684210526315789, "grad_norm": 0.1182967297844485, "learning_rate": 9.681446949025752e-07, "loss": 1.2191, "step": 890 }, { "epoch": 4.689473684210526, "grad_norm": 0.1272033760667648, "learning_rate": 9.362675782912923e-07, "loss": 1.2356, "step": 891 }, { "epoch": 4.6947368421052635, "grad_norm": 0.12672455306432165, "learning_rate": 9.049178624551635e-07, "loss": 1.2285, "step": 892 }, { "epoch": 4.7, "grad_norm": 0.3617879606840202, "learning_rate": 8.740959706477725e-07, "loss": 1.2656, "step": 893 }, { "epoch": 4.705263157894737, "grad_norm": 0.10997692184574041, "learning_rate": 8.438023189965272e-07, "loss": 1.2358, "step": 894 }, { "epoch": 4.7105263157894735, "grad_norm": 0.12136967224479166, "learning_rate": 8.140373164970428e-07, "loss": 1.2146, "step": 895 }, { "epoch": 4.715789473684211, "grad_norm": 0.2009841140710602, "learning_rate": 7.848013650076258e-07, "loss": 1.2284, "step": 896 }, { "epoch": 4.721052631578948, "grad_norm": 0.11466884057407387, "learning_rate": 7.560948592438521e-07, "loss": 1.241, "step": 897 }, { "epoch": 4.726315789473684, "grad_norm": 0.11496880440793267, "learning_rate": 7.279181867732199e-07, "loss": 1.2151, "step": 898 }, { "epoch": 4.731578947368421, "grad_norm": 0.12172797181082162, "learning_rate": 7.002717280099403e-07, "loss": 1.2227, "step": 899 }, { "epoch": 4.7368421052631575, "grad_norm": 0.12443319588453902, "learning_rate": 6.731558562097995e-07, "loss": 1.2329, "step": 900 }, { "epoch": 4.742105263157895, "grad_norm": 0.12280105376114048, "learning_rate": 6.465709374650964e-07, "loss": 1.2343, "step": 901 }, { "epoch": 4.747368421052632, "grad_norm": 0.11762303018802715, "learning_rate": 6.205173306997125e-07, "loss": 1.2267, "step": 902 }, { "epoch": 4.752631578947368, "grad_norm": 0.11816128091190285, "learning_rate": 5.949953876642855e-07, "loss": 1.2293, "step": 903 }, { "epoch": 4.757894736842105, "grad_norm": 0.1156560558591028, "learning_rate": 5.700054529314347e-07, "loss": 1.2315, "step": 904 }, { "epoch": 4.7631578947368425, "grad_norm": 0.11137701754866611, "learning_rate": 5.455478638911071e-07, "loss": 1.2394, "step": 905 }, { "epoch": 4.768421052631579, "grad_norm": 0.11181750038905715, "learning_rate": 5.216229507460435e-07, "loss": 1.2208, "step": 906 }, { "epoch": 4.773684210526316, "grad_norm": 0.12036980268449626, "learning_rate": 4.982310365073107e-07, "loss": 1.2235, "step": 907 }, { "epoch": 4.778947368421052, "grad_norm": 0.12359942605818125, "learning_rate": 4.75372436989936e-07, "loss": 1.2308, "step": 908 }, { "epoch": 4.784210526315789, "grad_norm": 0.13220645490519645, "learning_rate": 4.530474608086355e-07, "loss": 1.214, "step": 909 }, { "epoch": 4.7894736842105265, "grad_norm": 0.12206816510347139, "learning_rate": 4.3125640937368373e-07, "loss": 1.2194, "step": 910 }, { "epoch": 4.794736842105263, "grad_norm": 0.11617994515280962, "learning_rate": 4.0999957688679706e-07, "loss": 1.2241, "step": 911 }, { "epoch": 4.8, "grad_norm": 0.1148058679734953, "learning_rate": 3.8927725033718553e-07, "loss": 1.2223, "step": 912 }, { "epoch": 4.8052631578947365, "grad_norm": 0.11823614340464102, "learning_rate": 3.690897094976942e-07, "loss": 1.2238, "step": 913 }, { "epoch": 4.810526315789474, "grad_norm": 0.11790591732140702, "learning_rate": 3.4943722692099224e-07, "loss": 1.2153, "step": 914 }, { "epoch": 4.815789473684211, "grad_norm": 0.11877977952867706, "learning_rate": 3.3032006793590977e-07, "loss": 1.2334, "step": 915 }, { "epoch": 4.821052631578947, "grad_norm": 0.12246828468344964, "learning_rate": 3.117384906438581e-07, "loss": 1.2386, "step": 916 }, { "epoch": 4.826315789473684, "grad_norm": 0.10958575864964563, "learning_rate": 2.936927459153438e-07, "loss": 1.2392, "step": 917 }, { "epoch": 4.831578947368421, "grad_norm": 0.11159223936915229, "learning_rate": 2.761830773865759e-07, "loss": 1.225, "step": 918 }, { "epoch": 4.836842105263158, "grad_norm": 0.11067027350647266, "learning_rate": 2.5920972145618394e-07, "loss": 1.2182, "step": 919 }, { "epoch": 4.842105263157895, "grad_norm": 0.11845597460367807, "learning_rate": 2.4277290728202063e-07, "loss": 1.2303, "step": 920 }, { "epoch": 4.847368421052631, "grad_norm": 0.11321338292881286, "learning_rate": 2.2687285677807536e-07, "loss": 1.2286, "step": 921 }, { "epoch": 4.852631578947369, "grad_norm": 0.10918511827532087, "learning_rate": 2.1150978461146332e-07, "loss": 1.2303, "step": 922 }, { "epoch": 4.8578947368421055, "grad_norm": 0.11331613848290951, "learning_rate": 1.9668389819954338e-07, "loss": 1.2238, "step": 923 }, { "epoch": 4.863157894736842, "grad_norm": 0.11227809077874316, "learning_rate": 1.8239539770711133e-07, "loss": 1.229, "step": 924 }, { "epoch": 4.868421052631579, "grad_norm": 0.11188390803054302, "learning_rate": 1.6864447604370004e-07, "loss": 1.2315, "step": 925 }, { "epoch": 4.873684210526315, "grad_norm": 0.1126954574845899, "learning_rate": 1.5543131886096352e-07, "loss": 1.2281, "step": 926 }, { "epoch": 4.878947368421053, "grad_norm": 0.1154664007961282, "learning_rate": 1.427561045501902e-07, "loss": 1.2372, "step": 927 }, { "epoch": 4.88421052631579, "grad_norm": 0.11176844779105831, "learning_rate": 1.3061900423986917e-07, "loss": 1.2268, "step": 928 }, { "epoch": 4.889473684210526, "grad_norm": 0.11214271981901136, "learning_rate": 1.1902018179340779e-07, "loss": 1.2211, "step": 929 }, { "epoch": 4.894736842105263, "grad_norm": 0.11806437367689042, "learning_rate": 1.0795979380690657e-07, "loss": 1.2232, "step": 930 }, { "epoch": 4.9, "grad_norm": 0.12131946872074126, "learning_rate": 9.74379896070321e-08, "loss": 1.2392, "step": 931 }, { "epoch": 4.905263157894737, "grad_norm": 0.11758661501722971, "learning_rate": 8.745491124901861e-08, "loss": 1.2215, "step": 932 }, { "epoch": 4.910526315789474, "grad_norm": 0.10980377112088192, "learning_rate": 7.80106935147451e-08, "loss": 1.2412, "step": 933 }, { "epoch": 4.91578947368421, "grad_norm": 0.11037951117364361, "learning_rate": 6.910546391092343e-08, "loss": 1.2198, "step": 934 }, { "epoch": 4.921052631578947, "grad_norm": 0.11711041285423687, "learning_rate": 6.073934266735303e-08, "loss": 1.2256, "step": 935 }, { "epoch": 4.926315789473684, "grad_norm": 0.11213122469542446, "learning_rate": 5.291244273531782e-08, "loss": 1.2389, "step": 936 }, { "epoch": 4.931578947368421, "grad_norm": 0.11158223482551854, "learning_rate": 4.562486978606728e-08, "loss": 1.2358, "step": 937 }, { "epoch": 4.936842105263158, "grad_norm": 0.10743288484662021, "learning_rate": 3.887672220936445e-08, "loss": 1.2142, "step": 938 }, { "epoch": 4.942105263157895, "grad_norm": 0.11480044753648233, "learning_rate": 3.266809111218017e-08, "loss": 1.2304, "step": 939 }, { "epoch": 4.947368421052632, "grad_norm": 0.12600905075056, "learning_rate": 2.699906031745414e-08, "loss": 1.2348, "step": 940 }, { "epoch": 4.9526315789473685, "grad_norm": 0.10693815172707843, "learning_rate": 2.1869706362958044e-08, "loss": 1.2329, "step": 941 }, { "epoch": 4.957894736842105, "grad_norm": 0.11368842959943799, "learning_rate": 1.7280098500283005e-08, "loss": 1.2461, "step": 942 }, { "epoch": 4.963157894736842, "grad_norm": 0.11074973929231093, "learning_rate": 1.3230298693871491e-08, "loss": 1.2364, "step": 943 }, { "epoch": 4.968421052631579, "grad_norm": 0.11094251507004392, "learning_rate": 9.720361620217943e-09, "loss": 1.2314, "step": 944 }, { "epoch": 4.973684210526316, "grad_norm": 0.11419040432886776, "learning_rate": 6.750334667091629e-09, "loss": 1.23, "step": 945 }, { "epoch": 4.978947368421053, "grad_norm": 0.2986955255592173, "learning_rate": 4.320257932928229e-09, "loss": 1.2347, "step": 946 }, { "epoch": 4.984210526315789, "grad_norm": 0.10907490264263059, "learning_rate": 2.4301642262791748e-09, "loss": 1.2327, "step": 947 }, { "epoch": 4.989473684210527, "grad_norm": 0.11799915395997997, "learning_rate": 1.0800790653675564e-09, "loss": 1.2269, "step": 948 }, { "epoch": 4.994736842105263, "grad_norm": 0.11484395305342408, "learning_rate": 2.700206777328518e-10, "loss": 1.2454, "step": 949 }, { "epoch": 5.0, "grad_norm": 0.11735650459931829, "learning_rate": 0.0, "loss": 1.2331, "step": 950 }, { "epoch": 5.0, "step": 950, "total_flos": 1.59373351452672e+16, "train_loss": 1.3301890049482648, "train_runtime": 16504.5508, "train_samples_per_second": 29.374, "train_steps_per_second": 0.058 } ], "logging_steps": 1, "max_steps": 950, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.59373351452672e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }