| { |
| "best_metric": 0.6652334928512573, |
| "best_model_checkpoint": "/l/users/visionlanguage/mostafa_ciai/hf_checkpoints_code_ciai_gemma2/checkpoint-1700", |
| "epoch": 5.994075260208167, |
| "eval_steps": 50, |
| "global_step": 1752, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.006832132372564718, |
| "grad_norm": 93.82548522949219, |
| "learning_rate": 2.777777777777778e-06, |
| "loss": 208.4052, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.013664264745129436, |
| "grad_norm": 65.51689147949219, |
| "learning_rate": 5.555555555555556e-06, |
| "loss": 194.4831, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.020496397117694156, |
| "grad_norm": 30.816993713378906, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 159.6516, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.027328529490258872, |
| "grad_norm": 30.113662719726562, |
| "learning_rate": 1.1111111111111112e-05, |
| "loss": 145.5557, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.03416066186282359, |
| "grad_norm": 22.37295150756836, |
| "learning_rate": 1.388888888888889e-05, |
| "loss": 128.5444, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.04099279423538831, |
| "grad_norm": 22.287870407104492, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 116.2723, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.04782492660795303, |
| "grad_norm": 16.027904510498047, |
| "learning_rate": 1.9444444444444445e-05, |
| "loss": 107.5451, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.054657058980517745, |
| "grad_norm": 17.97212791442871, |
| "learning_rate": 2.2222222222222223e-05, |
| "loss": 100.7136, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.061489191353082465, |
| "grad_norm": 15.427449226379395, |
| "learning_rate": 2.5e-05, |
| "loss": 96.4422, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.06832132372564718, |
| "grad_norm": 11.836018562316895, |
| "learning_rate": 2.777777777777778e-05, |
| "loss": 89.9874, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0751534560982119, |
| "grad_norm": 13.170073509216309, |
| "learning_rate": 3.055555555555556e-05, |
| "loss": 90.5263, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.08198558847077662, |
| "grad_norm": 12.781464576721191, |
| "learning_rate": 3.3333333333333335e-05, |
| "loss": 87.3144, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.08881772084334134, |
| "grad_norm": 11.460458755493164, |
| "learning_rate": 3.611111111111111e-05, |
| "loss": 85.6209, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.09564985321590606, |
| "grad_norm": 10.382000923156738, |
| "learning_rate": 3.888888888888889e-05, |
| "loss": 88.2803, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.10248198558847077, |
| "grad_norm": 10.578895568847656, |
| "learning_rate": 4.166666666666667e-05, |
| "loss": 80.589, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.10931411796103549, |
| "grad_norm": 10.231274604797363, |
| "learning_rate": 4.4444444444444447e-05, |
| "loss": 83.0791, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.11614625033360021, |
| "grad_norm": 13.121459007263184, |
| "learning_rate": 4.722222222222222e-05, |
| "loss": 81.0775, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.12297838270616493, |
| "grad_norm": 11.594988822937012, |
| "learning_rate": 5e-05, |
| "loss": 79.3985, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.12981051507872965, |
| "grad_norm": 10.554534912109375, |
| "learning_rate": 4.9999832415172185e-05, |
| "loss": 78.9732, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.13664264745129437, |
| "grad_norm": 9.661481857299805, |
| "learning_rate": 4.9999329662935534e-05, |
| "loss": 77.5229, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.1434747798238591, |
| "grad_norm": 11.10251235961914, |
| "learning_rate": 4.9998491750030315e-05, |
| "loss": 77.7747, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.1503069121964238, |
| "grad_norm": 9.058899879455566, |
| "learning_rate": 4.999731868769027e-05, |
| "loss": 79.2141, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.15713904456898853, |
| "grad_norm": 9.254643440246582, |
| "learning_rate": 4.999581049164237e-05, |
| "loss": 77.5962, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.16397117694155325, |
| "grad_norm": 10.37578010559082, |
| "learning_rate": 4.99939671821067e-05, |
| "loss": 76.6356, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.17080330931411797, |
| "grad_norm": 9.983922004699707, |
| "learning_rate": 4.999178878379611e-05, |
| "loss": 76.0763, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.17080330931411797, |
| "eval_loss": 1.20554518699646, |
| "eval_runtime": 119.3115, |
| "eval_samples_per_second": 33.065, |
| "eval_steps_per_second": 8.272, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1776354416866827, |
| "grad_norm": 9.109485626220703, |
| "learning_rate": 4.998927532591592e-05, |
| "loss": 75.2524, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.1844675740592474, |
| "grad_norm": 8.939992904663086, |
| "learning_rate": 4.9986426842163515e-05, |
| "loss": 75.8614, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.19129970643181213, |
| "grad_norm": 8.342733383178711, |
| "learning_rate": 4.9983243370727914e-05, |
| "loss": 72.864, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.19813183880437685, |
| "grad_norm": 7.625518321990967, |
| "learning_rate": 4.9979724954289244e-05, |
| "loss": 75.7165, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.20496397117694154, |
| "grad_norm": 6.545467853546143, |
| "learning_rate": 4.9975871640018154e-05, |
| "loss": 72.337, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.21179610354950626, |
| "grad_norm": 8.73936939239502, |
| "learning_rate": 4.99716834795752e-05, |
| "loss": 73.0804, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.21862823592207098, |
| "grad_norm": 7.599481105804443, |
| "learning_rate": 4.996716052911017e-05, |
| "loss": 71.3494, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.2254603682946357, |
| "grad_norm": 8.88508415222168, |
| "learning_rate": 4.996230284926128e-05, |
| "loss": 73.4886, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.23229250066720042, |
| "grad_norm": 7.141696453094482, |
| "learning_rate": 4.99571105051544e-05, |
| "loss": 73.0934, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.23912463303976514, |
| "grad_norm": 8.946745872497559, |
| "learning_rate": 4.99515835664022e-05, |
| "loss": 70.5761, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.24595676541232986, |
| "grad_norm": 7.428682804107666, |
| "learning_rate": 4.994572210710315e-05, |
| "loss": 69.8488, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.2527888977848946, |
| "grad_norm": 10.490913391113281, |
| "learning_rate": 4.993952620584058e-05, |
| "loss": 72.1602, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.2596210301574593, |
| "grad_norm": 6.010617733001709, |
| "learning_rate": 4.993299594568163e-05, |
| "loss": 70.0962, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.26645316253002405, |
| "grad_norm": 5.207183361053467, |
| "learning_rate": 4.992613141417608e-05, |
| "loss": 70.6436, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.27328529490258874, |
| "grad_norm": 7.816757678985596, |
| "learning_rate": 4.9918932703355256e-05, |
| "loss": 68.9464, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.28011742727515343, |
| "grad_norm": 6.2263383865356445, |
| "learning_rate": 4.9911399909730714e-05, |
| "loss": 68.8249, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.2869495596477182, |
| "grad_norm": 6.726258754730225, |
| "learning_rate": 4.990353313429303e-05, |
| "loss": 68.7637, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.29378169202028287, |
| "grad_norm": 5.4038543701171875, |
| "learning_rate": 4.989533248251037e-05, |
| "loss": 68.7726, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.3006138243928476, |
| "grad_norm": 9.256815910339355, |
| "learning_rate": 4.988679806432712e-05, |
| "loss": 68.2967, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.3074459567654123, |
| "grad_norm": 7.765486717224121, |
| "learning_rate": 4.98779299941624e-05, |
| "loss": 70.6181, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.31427808913797706, |
| "grad_norm": 7.625786304473877, |
| "learning_rate": 4.9868728390908526e-05, |
| "loss": 68.5738, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.32111022151054175, |
| "grad_norm": 7.776100158691406, |
| "learning_rate": 4.985919337792944e-05, |
| "loss": 65.0074, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.3279423538831065, |
| "grad_norm": 6.496335029602051, |
| "learning_rate": 4.9849325083059e-05, |
| "loss": 66.7343, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.3347744862556712, |
| "grad_norm": 6.616697311401367, |
| "learning_rate": 4.983912363859935e-05, |
| "loss": 69.292, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.34160661862823594, |
| "grad_norm": 7.259242057800293, |
| "learning_rate": 4.982858918131906e-05, |
| "loss": 66.8941, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.34160661862823594, |
| "eval_loss": 1.0700218677520752, |
| "eval_runtime": 119.6843, |
| "eval_samples_per_second": 32.962, |
| "eval_steps_per_second": 8.247, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.34843875100080063, |
| "grad_norm": 7.206521987915039, |
| "learning_rate": 4.981772185245135e-05, |
| "loss": 68.3145, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.3552708833733654, |
| "grad_norm": 6.332549095153809, |
| "learning_rate": 4.980652179769218e-05, |
| "loss": 67.5062, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.36210301574593007, |
| "grad_norm": 8.422966957092285, |
| "learning_rate": 4.979498916719828e-05, |
| "loss": 69.0426, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.3689351481184948, |
| "grad_norm": 4.5074357986450195, |
| "learning_rate": 4.978312411558518e-05, |
| "loss": 66.0764, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.3757672804910595, |
| "grad_norm": 6.847994327545166, |
| "learning_rate": 4.977092680192507e-05, |
| "loss": 68.0597, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.38259941286362426, |
| "grad_norm": 9.010295867919922, |
| "learning_rate": 4.9758397389744734e-05, |
| "loss": 66.7856, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.38943154523618895, |
| "grad_norm": 8.793087005615234, |
| "learning_rate": 4.9745536047023324e-05, |
| "loss": 66.6415, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.3962636776087537, |
| "grad_norm": 6.820159912109375, |
| "learning_rate": 4.973234294619011e-05, |
| "loss": 66.8668, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.4030958099813184, |
| "grad_norm": 10.739355087280273, |
| "learning_rate": 4.971881826412218e-05, |
| "loss": 64.5842, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.4099279423538831, |
| "grad_norm": 6.451905727386475, |
| "learning_rate": 4.9704962182142044e-05, |
| "loss": 64.2948, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.4167600747264478, |
| "grad_norm": 6.998046398162842, |
| "learning_rate": 4.9690774886015244e-05, |
| "loss": 66.095, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.4235922070990125, |
| "grad_norm": 6.946700096130371, |
| "learning_rate": 4.967625656594782e-05, |
| "loss": 66.6205, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.43042433947157727, |
| "grad_norm": 7.656089782714844, |
| "learning_rate": 4.966140741658379e-05, |
| "loss": 65.2253, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.43725647184414196, |
| "grad_norm": 8.242254257202148, |
| "learning_rate": 4.9646227637002515e-05, |
| "loss": 65.4466, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.4440886042167067, |
| "grad_norm": 6.5599894523620605, |
| "learning_rate": 4.963071743071607e-05, |
| "loss": 64.5302, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.4509207365892714, |
| "grad_norm": 5.671536922454834, |
| "learning_rate": 4.961487700566646e-05, |
| "loss": 64.9711, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.45775286896183615, |
| "grad_norm": 6.317226886749268, |
| "learning_rate": 4.9598706574222886e-05, |
| "loss": 66.1428, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.46458500133440084, |
| "grad_norm": 7.731470584869385, |
| "learning_rate": 4.958220635317886e-05, |
| "loss": 65.6398, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.4714171337069656, |
| "grad_norm": 7.070956230163574, |
| "learning_rate": 4.956537656374933e-05, |
| "loss": 64.027, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.4782492660795303, |
| "grad_norm": 5.216205596923828, |
| "learning_rate": 4.9548217431567665e-05, |
| "loss": 64.9929, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.485081398452095, |
| "grad_norm": 6.5882344245910645, |
| "learning_rate": 4.95307291866827e-05, |
| "loss": 66.2789, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.4919135308246597, |
| "grad_norm": 5.5962934494018555, |
| "learning_rate": 4.95129120635556e-05, |
| "loss": 65.4516, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.49874566319722446, |
| "grad_norm": 7.341054916381836, |
| "learning_rate": 4.949476630105669e-05, |
| "loss": 64.339, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.5055777955697892, |
| "grad_norm": 7.5083441734313965, |
| "learning_rate": 4.9476292142462374e-05, |
| "loss": 62.7076, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.5124099279423538, |
| "grad_norm": 5.081834316253662, |
| "learning_rate": 4.945748983545172e-05, |
| "loss": 64.2066, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5124099279423538, |
| "eval_loss": 0.9920685291290283, |
| "eval_runtime": 120.1858, |
| "eval_samples_per_second": 32.824, |
| "eval_steps_per_second": 8.212, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5192420603149186, |
| "grad_norm": 6.279696464538574, |
| "learning_rate": 4.943835963210324e-05, |
| "loss": 63.3412, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.5260741926874833, |
| "grad_norm": 6.806802749633789, |
| "learning_rate": 4.941890178889149e-05, |
| "loss": 63.2038, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.5329063250600481, |
| "grad_norm": 8.012312889099121, |
| "learning_rate": 4.939911656668361e-05, |
| "loss": 63.4725, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.5397384574326127, |
| "grad_norm": 6.68613338470459, |
| "learning_rate": 4.937900423073585e-05, |
| "loss": 62.8267, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.5465705898051775, |
| "grad_norm": 6.391062259674072, |
| "learning_rate": 4.9358565050689985e-05, |
| "loss": 63.4099, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5534027221777422, |
| "grad_norm": 6.4117817878723145, |
| "learning_rate": 4.933779930056975e-05, |
| "loss": 62.475, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.5602348545503069, |
| "grad_norm": 10.238900184631348, |
| "learning_rate": 4.93167072587771e-05, |
| "loss": 62.3929, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.5670669869228716, |
| "grad_norm": 6.800478935241699, |
| "learning_rate": 4.929528920808854e-05, |
| "loss": 63.4465, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.5738991192954364, |
| "grad_norm": 6.688059329986572, |
| "learning_rate": 4.92735454356513e-05, |
| "loss": 62.3017, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.5807312516680011, |
| "grad_norm": 5.010741710662842, |
| "learning_rate": 4.925147623297949e-05, |
| "loss": 61.5306, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5875633840405657, |
| "grad_norm": 6.061219215393066, |
| "learning_rate": 4.922908189595018e-05, |
| "loss": 63.5529, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.5943955164131305, |
| "grad_norm": 7.6835126876831055, |
| "learning_rate": 4.920636272479946e-05, |
| "loss": 64.4077, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.6012276487856952, |
| "grad_norm": 5.945671558380127, |
| "learning_rate": 4.9183319024118415e-05, |
| "loss": 64.3411, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.60805978115826, |
| "grad_norm": 4.983694076538086, |
| "learning_rate": 4.915995110284901e-05, |
| "loss": 63.5529, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.6148919135308246, |
| "grad_norm": 5.736062049865723, |
| "learning_rate": 4.9136259274279955e-05, |
| "loss": 63.7282, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.6217240459033894, |
| "grad_norm": 6.8453545570373535, |
| "learning_rate": 4.911224385604255e-05, |
| "loss": 63.5027, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.6285561782759541, |
| "grad_norm": 5.9253668785095215, |
| "learning_rate": 4.908790517010636e-05, |
| "loss": 60.5142, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.6353883106485189, |
| "grad_norm": 5.743585586547852, |
| "learning_rate": 4.906324354277495e-05, |
| "loss": 62.4935, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.6422204430210835, |
| "grad_norm": 4.686921119689941, |
| "learning_rate": 4.903825930468149e-05, |
| "loss": 60.8045, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.6490525753936482, |
| "grad_norm": 5.350888729095459, |
| "learning_rate": 4.901295279078431e-05, |
| "loss": 62.3775, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.655884707766213, |
| "grad_norm": 5.417562961578369, |
| "learning_rate": 4.898732434036244e-05, |
| "loss": 60.1095, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.6627168401387777, |
| "grad_norm": 5.238453388214111, |
| "learning_rate": 4.896137429701102e-05, |
| "loss": 62.8943, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.6695489725113424, |
| "grad_norm": 6.252527713775635, |
| "learning_rate": 4.893510300863676e-05, |
| "loss": 61.1666, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.6763811048839071, |
| "grad_norm": 5.860842704772949, |
| "learning_rate": 4.890851082745319e-05, |
| "loss": 62.6643, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.6832132372564719, |
| "grad_norm": 6.3946099281311035, |
| "learning_rate": 4.8881598109976004e-05, |
| "loss": 61.939, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6832132372564719, |
| "eval_loss": 0.9664058685302734, |
| "eval_runtime": 119.3157, |
| "eval_samples_per_second": 33.064, |
| "eval_steps_per_second": 8.272, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6900453696290365, |
| "grad_norm": 5.909948825836182, |
| "learning_rate": 4.885436521701824e-05, |
| "loss": 63.9172, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.6968775020016013, |
| "grad_norm": 6.600235462188721, |
| "learning_rate": 4.8826812513685487e-05, |
| "loss": 60.6396, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.703709634374166, |
| "grad_norm": 5.97224235534668, |
| "learning_rate": 4.8798940369370944e-05, |
| "loss": 61.1365, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.7105417667467308, |
| "grad_norm": 5.521954536437988, |
| "learning_rate": 4.877074915775049e-05, |
| "loss": 61.9178, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.7173738991192954, |
| "grad_norm": 4.756962299346924, |
| "learning_rate": 4.8742239256777674e-05, |
| "loss": 60.0003, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.7242060314918601, |
| "grad_norm": 7.966216564178467, |
| "learning_rate": 4.8713411048678635e-05, |
| "loss": 60.3937, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.7310381638644249, |
| "grad_norm": 5.864863872528076, |
| "learning_rate": 4.868426491994702e-05, |
| "loss": 60.5208, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.7378702962369896, |
| "grad_norm": 4.952422142028809, |
| "learning_rate": 4.865480126133872e-05, |
| "loss": 61.4458, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.7447024286095543, |
| "grad_norm": 4.522135257720947, |
| "learning_rate": 4.862502046786671e-05, |
| "loss": 62.5035, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.751534560982119, |
| "grad_norm": 4.29464054107666, |
| "learning_rate": 4.859492293879574e-05, |
| "loss": 61.5825, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.7583666933546838, |
| "grad_norm": 5.789974212646484, |
| "learning_rate": 4.856450907763693e-05, |
| "loss": 59.9352, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.7651988257272485, |
| "grad_norm": 6.44216251373291, |
| "learning_rate": 4.853377929214243e-05, |
| "loss": 59.1637, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.7720309580998131, |
| "grad_norm": 4.520390033721924, |
| "learning_rate": 4.85027339942999e-05, |
| "loss": 60.4813, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.7788630904723779, |
| "grad_norm": 6.058870315551758, |
| "learning_rate": 4.8471373600326996e-05, |
| "loss": 60.2968, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.7856952228449426, |
| "grad_norm": 5.945502281188965, |
| "learning_rate": 4.843969853066584e-05, |
| "loss": 58.2098, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7925273552175074, |
| "grad_norm": 4.318876266479492, |
| "learning_rate": 4.8407709209977305e-05, |
| "loss": 58.4711, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.799359487590072, |
| "grad_norm": 5.385821342468262, |
| "learning_rate": 4.837540606713538e-05, |
| "loss": 59.5379, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.8061916199626368, |
| "grad_norm": 6.59214973449707, |
| "learning_rate": 4.834278953522138e-05, |
| "loss": 58.4163, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.8130237523352015, |
| "grad_norm": 5.087238311767578, |
| "learning_rate": 4.8309860051518204e-05, |
| "loss": 60.5546, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.8198558847077662, |
| "grad_norm": 6.804642200469971, |
| "learning_rate": 4.8276618057504376e-05, |
| "loss": 59.0874, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.8266880170803309, |
| "grad_norm": 5.035391330718994, |
| "learning_rate": 4.824306399884822e-05, |
| "loss": 59.9545, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.8335201494528957, |
| "grad_norm": 5.837290287017822, |
| "learning_rate": 4.8209198325401815e-05, |
| "loss": 59.5963, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.8403522818254604, |
| "grad_norm": 4.17293643951416, |
| "learning_rate": 4.817502149119502e-05, |
| "loss": 59.7065, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.847184414198025, |
| "grad_norm": 4.964944362640381, |
| "learning_rate": 4.8140533954429327e-05, |
| "loss": 59.5358, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.8540165465705898, |
| "grad_norm": 6.021297931671143, |
| "learning_rate": 4.810573617747178e-05, |
| "loss": 60.6391, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8540165465705898, |
| "eval_loss": 0.9407148361206055, |
| "eval_runtime": 119.9595, |
| "eval_samples_per_second": 32.886, |
| "eval_steps_per_second": 8.228, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8608486789431545, |
| "grad_norm": 5.707021713256836, |
| "learning_rate": 4.8070628626848735e-05, |
| "loss": 61.5872, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.8676808113157193, |
| "grad_norm": 4.725375652313232, |
| "learning_rate": 4.803521177323962e-05, |
| "loss": 59.2192, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.8745129436882839, |
| "grad_norm": 23.445714950561523, |
| "learning_rate": 4.799948609147061e-05, |
| "loss": 60.1762, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.8813450760608487, |
| "grad_norm": 5.503020286560059, |
| "learning_rate": 4.796345206050829e-05, |
| "loss": 62.2226, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.8881772084334134, |
| "grad_norm": 6.558228015899658, |
| "learning_rate": 4.792711016345321e-05, |
| "loss": 62.089, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8950093408059782, |
| "grad_norm": 8.109895706176758, |
| "learning_rate": 4.7890460887533417e-05, |
| "loss": 60.7872, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.9018414731785428, |
| "grad_norm": 5.230234622955322, |
| "learning_rate": 4.785350472409792e-05, |
| "loss": 57.9312, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.9086736055511075, |
| "grad_norm": 6.669562339782715, |
| "learning_rate": 4.7816242168610093e-05, |
| "loss": 61.7966, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.9155057379236723, |
| "grad_norm": 5.428192615509033, |
| "learning_rate": 4.777867372064105e-05, |
| "loss": 58.4551, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.922337870296237, |
| "grad_norm": 5.6168131828308105, |
| "learning_rate": 4.774079988386296e-05, |
| "loss": 59.9015, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.9291700026688017, |
| "grad_norm": 5.785460948944092, |
| "learning_rate": 4.770262116604224e-05, |
| "loss": 59.723, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.9360021350413664, |
| "grad_norm": 8.77035140991211, |
| "learning_rate": 4.76641380790328e-05, |
| "loss": 60.8996, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.9428342674139312, |
| "grad_norm": 4.000178813934326, |
| "learning_rate": 4.762535113876917e-05, |
| "loss": 59.2908, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.9496663997864959, |
| "grad_norm": 5.8565826416015625, |
| "learning_rate": 4.758626086525956e-05, |
| "loss": 59.296, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.9564985321590606, |
| "grad_norm": 6.792466163635254, |
| "learning_rate": 4.754686778257891e-05, |
| "loss": 58.351, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.9633306645316253, |
| "grad_norm": 6.484628677368164, |
| "learning_rate": 4.750717241886185e-05, |
| "loss": 58.46, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.97016279690419, |
| "grad_norm": 5.421430587768555, |
| "learning_rate": 4.7467175306295655e-05, |
| "loss": 59.0205, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.9769949292767547, |
| "grad_norm": 4.550335884094238, |
| "learning_rate": 4.7426876981113044e-05, |
| "loss": 60.8234, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.9838270616493194, |
| "grad_norm": 5.412383079528809, |
| "learning_rate": 4.738627798358506e-05, |
| "loss": 57.3651, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.9906591940218842, |
| "grad_norm": 5.225856781005859, |
| "learning_rate": 4.7345378858013776e-05, |
| "loss": 58.8522, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.9974913263944489, |
| "grad_norm": 3.856189250946045, |
| "learning_rate": 4.730418015272503e-05, |
| "loss": 59.7945, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.0034160661862823, |
| "grad_norm": 6.19010066986084, |
| "learning_rate": 4.726268242006106e-05, |
| "loss": 50.2722, |
| "step": 294 |
| }, |
| { |
| "epoch": 1.0102481985588472, |
| "grad_norm": 5.333181858062744, |
| "learning_rate": 4.722088621637309e-05, |
| "loss": 58.7285, |
| "step": 296 |
| }, |
| { |
| "epoch": 1.0170803309314118, |
| "grad_norm": 5.93973970413208, |
| "learning_rate": 4.717879210201389e-05, |
| "loss": 57.2823, |
| "step": 298 |
| }, |
| { |
| "epoch": 1.0239124633039765, |
| "grad_norm": 4.59360408782959, |
| "learning_rate": 4.713640064133025e-05, |
| "loss": 58.4687, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.0239124633039765, |
| "eval_loss": 0.9195547699928284, |
| "eval_runtime": 119.3076, |
| "eval_samples_per_second": 33.066, |
| "eval_steps_per_second": 8.273, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.0307445956765413, |
| "grad_norm": 5.437332630157471, |
| "learning_rate": 4.7093712402655427e-05, |
| "loss": 57.7491, |
| "step": 302 |
| }, |
| { |
| "epoch": 1.037576728049106, |
| "grad_norm": 4.938009738922119, |
| "learning_rate": 4.7050727958301506e-05, |
| "loss": 58.2642, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.0444088604216706, |
| "grad_norm": 5.104777812957764, |
| "learning_rate": 4.7007447884551745e-05, |
| "loss": 56.1312, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.0512409927942354, |
| "grad_norm": 5.78248405456543, |
| "learning_rate": 4.6963872761652835e-05, |
| "loss": 56.9488, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.0580731251668, |
| "grad_norm": 4.8224287033081055, |
| "learning_rate": 4.692000317380715e-05, |
| "loss": 56.6993, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.064905257539365, |
| "grad_norm": 4.517540454864502, |
| "learning_rate": 4.687583970916487e-05, |
| "loss": 58.8636, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.0717373899119296, |
| "grad_norm": 5.353949069976807, |
| "learning_rate": 4.683138295981611e-05, |
| "loss": 58.6762, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.0785695222844942, |
| "grad_norm": 6.164919376373291, |
| "learning_rate": 4.678663352178301e-05, |
| "loss": 57.9218, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.085401654657059, |
| "grad_norm": 4.577470302581787, |
| "learning_rate": 4.674159199501173e-05, |
| "loss": 58.1644, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.0922337870296237, |
| "grad_norm": 6.5861592292785645, |
| "learning_rate": 4.6696258983364385e-05, |
| "loss": 57.3447, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.0990659194021883, |
| "grad_norm": 4.327467918395996, |
| "learning_rate": 4.665063509461097e-05, |
| "loss": 57.2627, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.1058980517747532, |
| "grad_norm": 7.534716606140137, |
| "learning_rate": 4.660472094042121e-05, |
| "loss": 57.2099, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.1127301841473178, |
| "grad_norm": 5.549008369445801, |
| "learning_rate": 4.655851713635635e-05, |
| "loss": 58.4564, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.1195623165198825, |
| "grad_norm": 4.385070323944092, |
| "learning_rate": 4.651202430186092e-05, |
| "loss": 57.0019, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.1263944488924473, |
| "grad_norm": 4.763044357299805, |
| "learning_rate": 4.6465243060254415e-05, |
| "loss": 55.7849, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.133226581265012, |
| "grad_norm": 3.9461379051208496, |
| "learning_rate": 4.641817403872293e-05, |
| "loss": 56.2399, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.1400587136375768, |
| "grad_norm": 4.946137428283691, |
| "learning_rate": 4.637081786831079e-05, |
| "loss": 56.7089, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.1468908460101415, |
| "grad_norm": 5.664731025695801, |
| "learning_rate": 4.6323175183912024e-05, |
| "loss": 57.1022, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.153722978382706, |
| "grad_norm": 5.261230945587158, |
| "learning_rate": 4.627524662426194e-05, |
| "loss": 56.3552, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.160555110755271, |
| "grad_norm": 4.166741847991943, |
| "learning_rate": 4.6227032831928484e-05, |
| "loss": 56.888, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.1673872431278356, |
| "grad_norm": 6.015218734741211, |
| "learning_rate": 4.6178534453303666e-05, |
| "loss": 57.3006, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.1742193755004002, |
| "grad_norm": 6.349710941314697, |
| "learning_rate": 4.6129752138594874e-05, |
| "loss": 57.0208, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.181051507872965, |
| "grad_norm": 5.403022766113281, |
| "learning_rate": 4.608068654181617e-05, |
| "loss": 57.0645, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.1878836402455297, |
| "grad_norm": 6.523670673370361, |
| "learning_rate": 4.6031338320779534e-05, |
| "loss": 58.2164, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.1947157726180944, |
| "grad_norm": 6.369359970092773, |
| "learning_rate": 4.5981708137086e-05, |
| "loss": 56.7965, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.1947157726180944, |
| "eval_loss": 0.8986765146255493, |
| "eval_runtime": 119.0222, |
| "eval_samples_per_second": 33.145, |
| "eval_steps_per_second": 8.293, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.2015479049906592, |
| "grad_norm": 5.050749778747559, |
| "learning_rate": 4.5931796656116846e-05, |
| "loss": 56.7828, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.2083800373632239, |
| "grad_norm": 5.341484069824219, |
| "learning_rate": 4.588160454702462e-05, |
| "loss": 57.4058, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.2152121697357887, |
| "grad_norm": 4.554074287414551, |
| "learning_rate": 4.5831132482724195e-05, |
| "loss": 57.6257, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.2220443021083534, |
| "grad_norm": 4.951889514923096, |
| "learning_rate": 4.578038113988376e-05, |
| "loss": 56.0608, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.228876434480918, |
| "grad_norm": 4.2526421546936035, |
| "learning_rate": 4.572935119891571e-05, |
| "loss": 55.8586, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.2357085668534828, |
| "grad_norm": 4.805353164672852, |
| "learning_rate": 4.5678043343967554e-05, |
| "loss": 59.2427, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.2425406992260475, |
| "grad_norm": 4.9927978515625, |
| "learning_rate": 4.5626458262912745e-05, |
| "loss": 55.1494, |
| "step": 364 |
| }, |
| { |
| "epoch": 1.2493728315986123, |
| "grad_norm": 5.778275012969971, |
| "learning_rate": 4.557459664734141e-05, |
| "loss": 55.9791, |
| "step": 366 |
| }, |
| { |
| "epoch": 1.256204963971177, |
| "grad_norm": 4.41555643081665, |
| "learning_rate": 4.552245919255117e-05, |
| "loss": 57.3123, |
| "step": 368 |
| }, |
| { |
| "epoch": 1.2630370963437416, |
| "grad_norm": 5.230330944061279, |
| "learning_rate": 4.5470046597537735e-05, |
| "loss": 55.9031, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.2698692287163063, |
| "grad_norm": 3.9548189640045166, |
| "learning_rate": 4.541735956498554e-05, |
| "loss": 56.6997, |
| "step": 372 |
| }, |
| { |
| "epoch": 1.2767013610888711, |
| "grad_norm": 5.017361640930176, |
| "learning_rate": 4.5364398801258396e-05, |
| "loss": 57.3268, |
| "step": 374 |
| }, |
| { |
| "epoch": 1.2835334934614357, |
| "grad_norm": 5.562941074371338, |
| "learning_rate": 4.5311165016389916e-05, |
| "loss": 55.6271, |
| "step": 376 |
| }, |
| { |
| "epoch": 1.2903656258340006, |
| "grad_norm": 6.675297737121582, |
| "learning_rate": 4.525765892407409e-05, |
| "loss": 55.9593, |
| "step": 378 |
| }, |
| { |
| "epoch": 1.2971977582065652, |
| "grad_norm": 6.47582483291626, |
| "learning_rate": 4.5203881241655644e-05, |
| "loss": 57.0788, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.3040298905791299, |
| "grad_norm": 5.157675743103027, |
| "learning_rate": 4.514983269012049e-05, |
| "loss": 56.3623, |
| "step": 382 |
| }, |
| { |
| "epoch": 1.3108620229516947, |
| "grad_norm": 8.075702667236328, |
| "learning_rate": 4.509551399408598e-05, |
| "loss": 55.6531, |
| "step": 384 |
| }, |
| { |
| "epoch": 1.3176941553242594, |
| "grad_norm": 3.849310874938965, |
| "learning_rate": 4.504092588179128e-05, |
| "loss": 58.7546, |
| "step": 386 |
| }, |
| { |
| "epoch": 1.3245262876968242, |
| "grad_norm": 3.6027579307556152, |
| "learning_rate": 4.498606908508754e-05, |
| "loss": 57.7153, |
| "step": 388 |
| }, |
| { |
| "epoch": 1.3313584200693889, |
| "grad_norm": 5.139729976654053, |
| "learning_rate": 4.4930944339428085e-05, |
| "loss": 56.4532, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.3381905524419535, |
| "grad_norm": 5.337704181671143, |
| "learning_rate": 4.487555238385862e-05, |
| "loss": 54.2958, |
| "step": 392 |
| }, |
| { |
| "epoch": 1.3450226848145181, |
| "grad_norm": 3.3229618072509766, |
| "learning_rate": 4.481989396100724e-05, |
| "loss": 54.2046, |
| "step": 394 |
| }, |
| { |
| "epoch": 1.351854817187083, |
| "grad_norm": 5.2183074951171875, |
| "learning_rate": 4.476396981707453e-05, |
| "loss": 56.0147, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.3586869495596476, |
| "grad_norm": 5.028941631317139, |
| "learning_rate": 4.470778070182353e-05, |
| "loss": 54.3446, |
| "step": 398 |
| }, |
| { |
| "epoch": 1.3655190819322125, |
| "grad_norm": 6.347212791442871, |
| "learning_rate": 4.465132736856969e-05, |
| "loss": 56.7659, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.3655190819322125, |
| "eval_loss": 0.8771227598190308, |
| "eval_runtime": 118.9477, |
| "eval_samples_per_second": 33.166, |
| "eval_steps_per_second": 8.298, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.3723512143047771, |
| "grad_norm": 9.381309509277344, |
| "learning_rate": 4.459461057417078e-05, |
| "loss": 56.8099, |
| "step": 402 |
| }, |
| { |
| "epoch": 1.3791833466773418, |
| "grad_norm": 5.657813549041748, |
| "learning_rate": 4.453763107901675e-05, |
| "loss": 56.3326, |
| "step": 404 |
| }, |
| { |
| "epoch": 1.3860154790499066, |
| "grad_norm": 4.476396083831787, |
| "learning_rate": 4.4480389647019505e-05, |
| "loss": 57.3978, |
| "step": 406 |
| }, |
| { |
| "epoch": 1.3928476114224713, |
| "grad_norm": 5.402798652648926, |
| "learning_rate": 4.442288704560268e-05, |
| "loss": 55.7143, |
| "step": 408 |
| }, |
| { |
| "epoch": 1.3996797437950361, |
| "grad_norm": 4.367002010345459, |
| "learning_rate": 4.436512404569136e-05, |
| "loss": 55.7044, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.4065118761676008, |
| "grad_norm": 5.653073310852051, |
| "learning_rate": 4.430710142170176e-05, |
| "loss": 55.7266, |
| "step": 412 |
| }, |
| { |
| "epoch": 1.4133440085401654, |
| "grad_norm": 7.221829414367676, |
| "learning_rate": 4.424881995153076e-05, |
| "loss": 56.4174, |
| "step": 414 |
| }, |
| { |
| "epoch": 1.4201761409127303, |
| "grad_norm": 5.465057373046875, |
| "learning_rate": 4.419028041654559e-05, |
| "loss": 56.9093, |
| "step": 416 |
| }, |
| { |
| "epoch": 1.427008273285295, |
| "grad_norm": 8.383552551269531, |
| "learning_rate": 4.4131483601573285e-05, |
| "loss": 56.0841, |
| "step": 418 |
| }, |
| { |
| "epoch": 1.4338404056578598, |
| "grad_norm": 4.208652973175049, |
| "learning_rate": 4.4072430294890174e-05, |
| "loss": 57.5786, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.4406725380304244, |
| "grad_norm": 5.773376941680908, |
| "learning_rate": 4.4013121288211307e-05, |
| "loss": 55.8851, |
| "step": 422 |
| }, |
| { |
| "epoch": 1.447504670402989, |
| "grad_norm": 5.354812145233154, |
| "learning_rate": 4.3953557376679856e-05, |
| "loss": 55.1571, |
| "step": 424 |
| }, |
| { |
| "epoch": 1.4543368027755537, |
| "grad_norm": 4.6360039710998535, |
| "learning_rate": 4.389373935885646e-05, |
| "loss": 54.0095, |
| "step": 426 |
| }, |
| { |
| "epoch": 1.4611689351481185, |
| "grad_norm": 7.125521183013916, |
| "learning_rate": 4.383366803670849e-05, |
| "loss": 56.645, |
| "step": 428 |
| }, |
| { |
| "epoch": 1.4680010675206832, |
| "grad_norm": 6.071737766265869, |
| "learning_rate": 4.377334421559932e-05, |
| "loss": 55.3209, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.474833199893248, |
| "grad_norm": 4.569766998291016, |
| "learning_rate": 4.371276870427753e-05, |
| "loss": 54.6604, |
| "step": 432 |
| }, |
| { |
| "epoch": 1.4816653322658127, |
| "grad_norm": 5.426764965057373, |
| "learning_rate": 4.365194231486604e-05, |
| "loss": 56.4116, |
| "step": 434 |
| }, |
| { |
| "epoch": 1.4884974646383773, |
| "grad_norm": 5.6092023849487305, |
| "learning_rate": 4.359086586285127e-05, |
| "loss": 56.0268, |
| "step": 436 |
| }, |
| { |
| "epoch": 1.4953295970109421, |
| "grad_norm": 6.140939712524414, |
| "learning_rate": 4.3529540167072126e-05, |
| "loss": 54.886, |
| "step": 438 |
| }, |
| { |
| "epoch": 1.5021617293835068, |
| "grad_norm": 4.043739318847656, |
| "learning_rate": 4.346796604970912e-05, |
| "loss": 56.6431, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.5089938617560716, |
| "grad_norm": 3.8898212909698486, |
| "learning_rate": 4.340614433627328e-05, |
| "loss": 55.6492, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.5158259941286363, |
| "grad_norm": 6.158950328826904, |
| "learning_rate": 4.3344075855595104e-05, |
| "loss": 55.6869, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.522658126501201, |
| "grad_norm": 3.874180316925049, |
| "learning_rate": 4.328176143981343e-05, |
| "loss": 53.7981, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.5294902588737656, |
| "grad_norm": 4.068581581115723, |
| "learning_rate": 4.321920192436433e-05, |
| "loss": 54.6618, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.5363223912463304, |
| "grad_norm": 4.552149295806885, |
| "learning_rate": 4.315639814796983e-05, |
| "loss": 55.1642, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.5363223912463304, |
| "eval_loss": 0.8704175353050232, |
| "eval_runtime": 119.5049, |
| "eval_samples_per_second": 33.011, |
| "eval_steps_per_second": 8.259, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.5431545236188953, |
| "grad_norm": 4.1831374168396, |
| "learning_rate": 4.309335095262676e-05, |
| "loss": 53.2926, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.54998665599146, |
| "grad_norm": 4.456052780151367, |
| "learning_rate": 4.303006118359537e-05, |
| "loss": 53.6038, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.5568187883640245, |
| "grad_norm": 17.7099609375, |
| "learning_rate": 4.296652968938807e-05, |
| "loss": 54.9325, |
| "step": 456 |
| }, |
| { |
| "epoch": 1.5636509207365892, |
| "grad_norm": 8.005233764648438, |
| "learning_rate": 4.2902757321758016e-05, |
| "loss": 53.7884, |
| "step": 458 |
| }, |
| { |
| "epoch": 1.570483053109154, |
| "grad_norm": 5.034004211425781, |
| "learning_rate": 4.283874493568772e-05, |
| "loss": 53.2575, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.5773151854817187, |
| "grad_norm": 4.005930423736572, |
| "learning_rate": 4.2774493389377545e-05, |
| "loss": 55.4554, |
| "step": 462 |
| }, |
| { |
| "epoch": 1.5841473178542835, |
| "grad_norm": 5.812296390533447, |
| "learning_rate": 4.271000354423426e-05, |
| "loss": 56.7008, |
| "step": 464 |
| }, |
| { |
| "epoch": 1.5909794502268482, |
| "grad_norm": 6.425695896148682, |
| "learning_rate": 4.2645276264859394e-05, |
| "loss": 56.8804, |
| "step": 466 |
| }, |
| { |
| "epoch": 1.5978115825994128, |
| "grad_norm": 4.44102144241333, |
| "learning_rate": 4.258031241903778e-05, |
| "loss": 54.2011, |
| "step": 468 |
| }, |
| { |
| "epoch": 1.6046437149719774, |
| "grad_norm": 4.444553852081299, |
| "learning_rate": 4.251511287772579e-05, |
| "loss": 54.9826, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.6114758473445423, |
| "grad_norm": 3.8157808780670166, |
| "learning_rate": 4.2449678515039747e-05, |
| "loss": 55.2601, |
| "step": 472 |
| }, |
| { |
| "epoch": 1.6183079797171072, |
| "grad_norm": 6.47904634475708, |
| "learning_rate": 4.238401020824416e-05, |
| "loss": 54.5978, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.6251401120896718, |
| "grad_norm": 5.010526180267334, |
| "learning_rate": 4.231810883773999e-05, |
| "loss": 56.0995, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.6319722444622364, |
| "grad_norm": 5.843505382537842, |
| "learning_rate": 4.2251975287052804e-05, |
| "loss": 54.0241, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.638804376834801, |
| "grad_norm": 4.549996852874756, |
| "learning_rate": 4.218561044282099e-05, |
| "loss": 56.3071, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.645636509207366, |
| "grad_norm": 4.20985221862793, |
| "learning_rate": 4.211901519478382e-05, |
| "loss": 54.3977, |
| "step": 482 |
| }, |
| { |
| "epoch": 1.6524686415799306, |
| "grad_norm": 5.491010665893555, |
| "learning_rate": 4.2052190435769554e-05, |
| "loss": 53.1375, |
| "step": 484 |
| }, |
| { |
| "epoch": 1.6593007739524954, |
| "grad_norm": 4.417302131652832, |
| "learning_rate": 4.198513706168345e-05, |
| "loss": 53.959, |
| "step": 486 |
| }, |
| { |
| "epoch": 1.66613290632506, |
| "grad_norm": 5.39029598236084, |
| "learning_rate": 4.191785597149577e-05, |
| "loss": 54.5638, |
| "step": 488 |
| }, |
| { |
| "epoch": 1.6729650386976247, |
| "grad_norm": 4.233526229858398, |
| "learning_rate": 4.1850348067229696e-05, |
| "loss": 54.6384, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.6797971710701893, |
| "grad_norm": 6.301634311676025, |
| "learning_rate": 4.178261425394926e-05, |
| "loss": 55.1738, |
| "step": 492 |
| }, |
| { |
| "epoch": 1.6866293034427542, |
| "grad_norm": 5.9507246017456055, |
| "learning_rate": 4.171465543974723e-05, |
| "loss": 54.7009, |
| "step": 494 |
| }, |
| { |
| "epoch": 1.693461435815319, |
| "grad_norm": 5.033243656158447, |
| "learning_rate": 4.1646472535732895e-05, |
| "loss": 54.3154, |
| "step": 496 |
| }, |
| { |
| "epoch": 1.7002935681878837, |
| "grad_norm": 4.675721168518066, |
| "learning_rate": 4.157806645601988e-05, |
| "loss": 54.1507, |
| "step": 498 |
| }, |
| { |
| "epoch": 1.7071257005604483, |
| "grad_norm": 3.5945537090301514, |
| "learning_rate": 4.1509438117713866e-05, |
| "loss": 52.2103, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.7071257005604483, |
| "eval_loss": 0.8516557216644287, |
| "eval_runtime": 119.4754, |
| "eval_samples_per_second": 33.019, |
| "eval_steps_per_second": 8.261, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.713957832933013, |
| "grad_norm": 4.187085151672363, |
| "learning_rate": 4.144058844090032e-05, |
| "loss": 54.1474, |
| "step": 502 |
| }, |
| { |
| "epoch": 1.7207899653055778, |
| "grad_norm": 3.818648099899292, |
| "learning_rate": 4.137151834863213e-05, |
| "loss": 55.5711, |
| "step": 504 |
| }, |
| { |
| "epoch": 1.7276220976781427, |
| "grad_norm": 5.919620513916016, |
| "learning_rate": 4.130222876691726e-05, |
| "loss": 54.3803, |
| "step": 506 |
| }, |
| { |
| "epoch": 1.7344542300507073, |
| "grad_norm": 5.772305011749268, |
| "learning_rate": 4.123272062470633e-05, |
| "loss": 53.9454, |
| "step": 508 |
| }, |
| { |
| "epoch": 1.741286362423272, |
| "grad_norm": 4.569563865661621, |
| "learning_rate": 4.116299485388014e-05, |
| "loss": 53.5009, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.7481184947958366, |
| "grad_norm": 4.183293342590332, |
| "learning_rate": 4.109305238923718e-05, |
| "loss": 52.9927, |
| "step": 512 |
| }, |
| { |
| "epoch": 1.7549506271684012, |
| "grad_norm": 4.4316301345825195, |
| "learning_rate": 4.102289416848114e-05, |
| "loss": 54.5023, |
| "step": 514 |
| }, |
| { |
| "epoch": 1.761782759540966, |
| "grad_norm": 14.234251976013184, |
| "learning_rate": 4.095252113220827e-05, |
| "loss": 53.1473, |
| "step": 516 |
| }, |
| { |
| "epoch": 1.768614891913531, |
| "grad_norm": 4.889795780181885, |
| "learning_rate": 4.088193422389484e-05, |
| "loss": 53.7265, |
| "step": 518 |
| }, |
| { |
| "epoch": 1.7754470242860956, |
| "grad_norm": 3.02785325050354, |
| "learning_rate": 4.0811134389884433e-05, |
| "loss": 52.5917, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.7822791566586602, |
| "grad_norm": 5.794788360595703, |
| "learning_rate": 4.0740122579375286e-05, |
| "loss": 55.4619, |
| "step": 522 |
| }, |
| { |
| "epoch": 1.7891112890312248, |
| "grad_norm": 4.442338466644287, |
| "learning_rate": 4.066889974440757e-05, |
| "loss": 53.7709, |
| "step": 524 |
| }, |
| { |
| "epoch": 1.7959434214037897, |
| "grad_norm": 4.7714715003967285, |
| "learning_rate": 4.0597466839850595e-05, |
| "loss": 54.16, |
| "step": 526 |
| }, |
| { |
| "epoch": 1.8027755537763546, |
| "grad_norm": 4.7263569831848145, |
| "learning_rate": 4.0525824823390045e-05, |
| "loss": 55.9749, |
| "step": 528 |
| }, |
| { |
| "epoch": 1.8096076861489192, |
| "grad_norm": 4.258271217346191, |
| "learning_rate": 4.045397465551513e-05, |
| "loss": 52.5445, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.8164398185214838, |
| "grad_norm": 4.56829309463501, |
| "learning_rate": 4.038191729950569e-05, |
| "loss": 53.8703, |
| "step": 532 |
| }, |
| { |
| "epoch": 1.8232719508940485, |
| "grad_norm": 8.888167381286621, |
| "learning_rate": 4.030965372141927e-05, |
| "loss": 52.7209, |
| "step": 534 |
| }, |
| { |
| "epoch": 1.8301040832666133, |
| "grad_norm": 4.5087175369262695, |
| "learning_rate": 4.0237184890078245e-05, |
| "loss": 54.591, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.836936215639178, |
| "grad_norm": 4.460638523101807, |
| "learning_rate": 4.0164511777056725e-05, |
| "loss": 54.8662, |
| "step": 538 |
| }, |
| { |
| "epoch": 1.8437683480117428, |
| "grad_norm": 3.5958664417266846, |
| "learning_rate": 4.009163535666761e-05, |
| "loss": 53.423, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.8506004803843075, |
| "grad_norm": 4.3935418128967285, |
| "learning_rate": 4.001855660594948e-05, |
| "loss": 53.9048, |
| "step": 542 |
| }, |
| { |
| "epoch": 1.857432612756872, |
| "grad_norm": 5.473939895629883, |
| "learning_rate": 3.994527650465352e-05, |
| "loss": 52.9295, |
| "step": 544 |
| }, |
| { |
| "epoch": 1.8642647451294367, |
| "grad_norm": 4.8625922203063965, |
| "learning_rate": 3.98717960352304e-05, |
| "loss": 51.8002, |
| "step": 546 |
| }, |
| { |
| "epoch": 1.8710968775020016, |
| "grad_norm": 4.244052886962891, |
| "learning_rate": 3.979811618281706e-05, |
| "loss": 53.6904, |
| "step": 548 |
| }, |
| { |
| "epoch": 1.8779290098745665, |
| "grad_norm": 4.050732612609863, |
| "learning_rate": 3.972423793522352e-05, |
| "loss": 54.7441, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.8779290098745665, |
| "eval_loss": 0.8419561982154846, |
| "eval_runtime": 119.6757, |
| "eval_samples_per_second": 32.964, |
| "eval_steps_per_second": 8.247, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.884761142247131, |
| "grad_norm": 5.255309104919434, |
| "learning_rate": 3.9650162282919655e-05, |
| "loss": 53.6842, |
| "step": 552 |
| }, |
| { |
| "epoch": 1.8915932746196957, |
| "grad_norm": 5.483623504638672, |
| "learning_rate": 3.957589021902191e-05, |
| "loss": 54.0004, |
| "step": 554 |
| }, |
| { |
| "epoch": 1.8984254069922604, |
| "grad_norm": 4.224212169647217, |
| "learning_rate": 3.9501422739279956e-05, |
| "loss": 51.7289, |
| "step": 556 |
| }, |
| { |
| "epoch": 1.9052575393648252, |
| "grad_norm": 5.061962127685547, |
| "learning_rate": 3.942676084206338e-05, |
| "loss": 53.4457, |
| "step": 558 |
| }, |
| { |
| "epoch": 1.9120896717373899, |
| "grad_norm": 3.8694398403167725, |
| "learning_rate": 3.9351905528348285e-05, |
| "loss": 51.8595, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.9189218041099547, |
| "grad_norm": 4.149620056152344, |
| "learning_rate": 3.927685780170385e-05, |
| "loss": 51.8196, |
| "step": 562 |
| }, |
| { |
| "epoch": 1.9257539364825194, |
| "grad_norm": 6.877647399902344, |
| "learning_rate": 3.920161866827889e-05, |
| "loss": 52.7279, |
| "step": 564 |
| }, |
| { |
| "epoch": 1.932586068855084, |
| "grad_norm": 4.069815635681152, |
| "learning_rate": 3.9126189136788416e-05, |
| "loss": 51.1502, |
| "step": 566 |
| }, |
| { |
| "epoch": 1.9394182012276486, |
| "grad_norm": 6.629972457885742, |
| "learning_rate": 3.90505702185e-05, |
| "loss": 52.6793, |
| "step": 568 |
| }, |
| { |
| "epoch": 1.9462503336002135, |
| "grad_norm": 4.475677013397217, |
| "learning_rate": 3.897476292722034e-05, |
| "loss": 51.4329, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.9530824659727783, |
| "grad_norm": 5.370522499084473, |
| "learning_rate": 3.889876827928156e-05, |
| "loss": 53.1101, |
| "step": 572 |
| }, |
| { |
| "epoch": 1.959914598345343, |
| "grad_norm": 5.481414794921875, |
| "learning_rate": 3.882258729352768e-05, |
| "loss": 53.3684, |
| "step": 574 |
| }, |
| { |
| "epoch": 1.9667467307179076, |
| "grad_norm": 6.393594741821289, |
| "learning_rate": 3.874622099130087e-05, |
| "loss": 52.7341, |
| "step": 576 |
| }, |
| { |
| "epoch": 1.9735788630904723, |
| "grad_norm": 3.9178807735443115, |
| "learning_rate": 3.866967039642784e-05, |
| "loss": 51.5249, |
| "step": 578 |
| }, |
| { |
| "epoch": 1.9804109954630371, |
| "grad_norm": 9.721770286560059, |
| "learning_rate": 3.859293653520604e-05, |
| "loss": 51.2705, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.987243127835602, |
| "grad_norm": 4.619483470916748, |
| "learning_rate": 3.851602043638994e-05, |
| "loss": 51.7596, |
| "step": 582 |
| }, |
| { |
| "epoch": 1.9940752602081666, |
| "grad_norm": 4.899592399597168, |
| "learning_rate": 3.843892313117724e-05, |
| "loss": 54.7586, |
| "step": 584 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 3.8423385620117188, |
| "learning_rate": 3.8361645653195026e-05, |
| "loss": 44.9497, |
| "step": 586 |
| }, |
| { |
| "epoch": 2.0068321323725646, |
| "grad_norm": 4.93556022644043, |
| "learning_rate": 3.8284189038485936e-05, |
| "loss": 53.1383, |
| "step": 588 |
| }, |
| { |
| "epoch": 2.0136642647451293, |
| "grad_norm": 6.575899124145508, |
| "learning_rate": 3.8206554325494225e-05, |
| "loss": 52.1373, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.0204963971176944, |
| "grad_norm": 3.5134201049804688, |
| "learning_rate": 3.812874255505191e-05, |
| "loss": 50.8711, |
| "step": 592 |
| }, |
| { |
| "epoch": 2.027328529490259, |
| "grad_norm": 4.761475086212158, |
| "learning_rate": 3.805075477036476e-05, |
| "loss": 52.0756, |
| "step": 594 |
| }, |
| { |
| "epoch": 2.0341606618628236, |
| "grad_norm": 3.7381017208099365, |
| "learning_rate": 3.797259201699833e-05, |
| "loss": 51.0594, |
| "step": 596 |
| }, |
| { |
| "epoch": 2.0409927942353883, |
| "grad_norm": 5.102145671844482, |
| "learning_rate": 3.789425534286394e-05, |
| "loss": 52.1454, |
| "step": 598 |
| }, |
| { |
| "epoch": 2.047824926607953, |
| "grad_norm": 4.762547969818115, |
| "learning_rate": 3.781574579820464e-05, |
| "loss": 50.3373, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.047824926607953, |
| "eval_loss": 0.8283991813659668, |
| "eval_runtime": 119.5704, |
| "eval_samples_per_second": 32.993, |
| "eval_steps_per_second": 8.255, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.0546570589805175, |
| "grad_norm": 4.646745681762695, |
| "learning_rate": 3.773706443558111e-05, |
| "loss": 51.0792, |
| "step": 602 |
| }, |
| { |
| "epoch": 2.0614891913530826, |
| "grad_norm": 5.648324012756348, |
| "learning_rate": 3.765821230985758e-05, |
| "loss": 50.6017, |
| "step": 604 |
| }, |
| { |
| "epoch": 2.0683213237256473, |
| "grad_norm": 4.703359603881836, |
| "learning_rate": 3.75791904781876e-05, |
| "loss": 52.4212, |
| "step": 606 |
| }, |
| { |
| "epoch": 2.075153456098212, |
| "grad_norm": 4.082385540008545, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 51.9666, |
| "step": 608 |
| }, |
| { |
| "epoch": 2.0819855884707765, |
| "grad_norm": 4.6461687088012695, |
| "learning_rate": 3.74206419369846e-05, |
| "loss": 51.6205, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.088817720843341, |
| "grad_norm": 3.9972918033599854, |
| "learning_rate": 3.7341117353077966e-05, |
| "loss": 52.6521, |
| "step": 612 |
| }, |
| { |
| "epoch": 2.0956498532159062, |
| "grad_norm": 5.636791229248047, |
| "learning_rate": 3.726142731444921e-05, |
| "loss": 52.6811, |
| "step": 614 |
| }, |
| { |
| "epoch": 2.102481985588471, |
| "grad_norm": 6.055325508117676, |
| "learning_rate": 3.718157288948563e-05, |
| "loss": 51.2952, |
| "step": 616 |
| }, |
| { |
| "epoch": 2.1093141179610355, |
| "grad_norm": 5.317610740661621, |
| "learning_rate": 3.710155514877844e-05, |
| "loss": 52.4443, |
| "step": 618 |
| }, |
| { |
| "epoch": 2.1161462503336, |
| "grad_norm": 4.979522705078125, |
| "learning_rate": 3.702137516510838e-05, |
| "loss": 51.3593, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.122978382706165, |
| "grad_norm": 7.410902500152588, |
| "learning_rate": 3.694103401343136e-05, |
| "loss": 51.5919, |
| "step": 622 |
| }, |
| { |
| "epoch": 2.12981051507873, |
| "grad_norm": 4.962103366851807, |
| "learning_rate": 3.686053277086401e-05, |
| "loss": 51.272, |
| "step": 624 |
| }, |
| { |
| "epoch": 2.1366426474512945, |
| "grad_norm": 4.0044426918029785, |
| "learning_rate": 3.6779872516669295e-05, |
| "loss": 51.6362, |
| "step": 626 |
| }, |
| { |
| "epoch": 2.143474779823859, |
| "grad_norm": 5.016703128814697, |
| "learning_rate": 3.669905433224199e-05, |
| "loss": 51.7369, |
| "step": 628 |
| }, |
| { |
| "epoch": 2.150306912196424, |
| "grad_norm": 4.700343132019043, |
| "learning_rate": 3.6618079301094216e-05, |
| "loss": 50.9454, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.1571390445689884, |
| "grad_norm": 8.11246395111084, |
| "learning_rate": 3.653694850884091e-05, |
| "loss": 50.4605, |
| "step": 632 |
| }, |
| { |
| "epoch": 2.163971176941553, |
| "grad_norm": 3.8724536895751953, |
| "learning_rate": 3.645566304318526e-05, |
| "loss": 52.4849, |
| "step": 634 |
| }, |
| { |
| "epoch": 2.170803309314118, |
| "grad_norm": 3.699873208999634, |
| "learning_rate": 3.637422399390413e-05, |
| "loss": 49.8017, |
| "step": 636 |
| }, |
| { |
| "epoch": 2.1776354416866828, |
| "grad_norm": 4.757104873657227, |
| "learning_rate": 3.6292632452833436e-05, |
| "loss": 52.0966, |
| "step": 638 |
| }, |
| { |
| "epoch": 2.1844675740592474, |
| "grad_norm": 5.273576736450195, |
| "learning_rate": 3.621088951385353e-05, |
| "loss": 49.5201, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.191299706431812, |
| "grad_norm": 4.152122497558594, |
| "learning_rate": 3.612899627287452e-05, |
| "loss": 51.121, |
| "step": 642 |
| }, |
| { |
| "epoch": 2.1981318388043767, |
| "grad_norm": 4.448339939117432, |
| "learning_rate": 3.604695382782159e-05, |
| "loss": 51.5833, |
| "step": 644 |
| }, |
| { |
| "epoch": 2.2049639711769418, |
| "grad_norm": 3.272676706314087, |
| "learning_rate": 3.596476327862024e-05, |
| "loss": 50.4036, |
| "step": 646 |
| }, |
| { |
| "epoch": 2.2117961035495064, |
| "grad_norm": 4.293691158294678, |
| "learning_rate": 3.588242572718162e-05, |
| "loss": 50.4138, |
| "step": 648 |
| }, |
| { |
| "epoch": 2.218628235922071, |
| "grad_norm": 6.384798049926758, |
| "learning_rate": 3.579994227738767e-05, |
| "loss": 49.0042, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.218628235922071, |
| "eval_loss": 0.8110712170600891, |
| "eval_runtime": 119.0744, |
| "eval_samples_per_second": 33.131, |
| "eval_steps_per_second": 8.289, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.2254603682946357, |
| "grad_norm": 4.501573085784912, |
| "learning_rate": 3.5717314035076355e-05, |
| "loss": 49.7713, |
| "step": 652 |
| }, |
| { |
| "epoch": 2.2322925006672003, |
| "grad_norm": 4.808114051818848, |
| "learning_rate": 3.5634542108026876e-05, |
| "loss": 50.6265, |
| "step": 654 |
| }, |
| { |
| "epoch": 2.239124633039765, |
| "grad_norm": 5.616351127624512, |
| "learning_rate": 3.5551627605944745e-05, |
| "loss": 52.1332, |
| "step": 656 |
| }, |
| { |
| "epoch": 2.24595676541233, |
| "grad_norm": 7.0716071128845215, |
| "learning_rate": 3.5468571640446994e-05, |
| "loss": 50.7825, |
| "step": 658 |
| }, |
| { |
| "epoch": 2.2527888977848947, |
| "grad_norm": 4.64641809463501, |
| "learning_rate": 3.5385375325047166e-05, |
| "loss": 50.3092, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.2596210301574593, |
| "grad_norm": 4.058784008026123, |
| "learning_rate": 3.5302039775140486e-05, |
| "loss": 51.7827, |
| "step": 662 |
| }, |
| { |
| "epoch": 2.266453162530024, |
| "grad_norm": 4.011864185333252, |
| "learning_rate": 3.521856610798887e-05, |
| "loss": 51.4194, |
| "step": 664 |
| }, |
| { |
| "epoch": 2.2732852949025886, |
| "grad_norm": 3.89857816696167, |
| "learning_rate": 3.513495544270592e-05, |
| "loss": 50.7032, |
| "step": 666 |
| }, |
| { |
| "epoch": 2.2801174272751537, |
| "grad_norm": 4.966712951660156, |
| "learning_rate": 3.505120890024195e-05, |
| "loss": 49.925, |
| "step": 668 |
| }, |
| { |
| "epoch": 2.2869495596477183, |
| "grad_norm": 4.181141376495361, |
| "learning_rate": 3.496732760336895e-05, |
| "loss": 49.5112, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.293781692020283, |
| "grad_norm": 4.761594772338867, |
| "learning_rate": 3.4883312676665536e-05, |
| "loss": 49.6545, |
| "step": 672 |
| }, |
| { |
| "epoch": 2.3006138243928476, |
| "grad_norm": 3.97501802444458, |
| "learning_rate": 3.479916524650188e-05, |
| "loss": 51.1862, |
| "step": 674 |
| }, |
| { |
| "epoch": 2.307445956765412, |
| "grad_norm": 5.200672149658203, |
| "learning_rate": 3.4714886441024574e-05, |
| "loss": 49.9163, |
| "step": 676 |
| }, |
| { |
| "epoch": 2.314278089137977, |
| "grad_norm": 4.147047519683838, |
| "learning_rate": 3.4630477390141556e-05, |
| "loss": 48.6138, |
| "step": 678 |
| }, |
| { |
| "epoch": 2.321110221510542, |
| "grad_norm": 4.9791693687438965, |
| "learning_rate": 3.4545939225506934e-05, |
| "loss": 51.4538, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.3279423538831066, |
| "grad_norm": 4.929348945617676, |
| "learning_rate": 3.4461273080505793e-05, |
| "loss": 51.2735, |
| "step": 682 |
| }, |
| { |
| "epoch": 2.334774486255671, |
| "grad_norm": 4.98499059677124, |
| "learning_rate": 3.437648009023905e-05, |
| "loss": 48.5889, |
| "step": 684 |
| }, |
| { |
| "epoch": 2.341606618628236, |
| "grad_norm": 4.354183673858643, |
| "learning_rate": 3.4291561391508185e-05, |
| "loss": 51.7768, |
| "step": 686 |
| }, |
| { |
| "epoch": 2.3484387510008005, |
| "grad_norm": 3.482697010040283, |
| "learning_rate": 3.420651812280006e-05, |
| "loss": 48.9966, |
| "step": 688 |
| }, |
| { |
| "epoch": 2.3552708833733655, |
| "grad_norm": 4.613458156585693, |
| "learning_rate": 3.4121351424271594e-05, |
| "loss": 50.8534, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.36210301574593, |
| "grad_norm": 3.93235182762146, |
| "learning_rate": 3.4036062437734484e-05, |
| "loss": 50.9164, |
| "step": 692 |
| }, |
| { |
| "epoch": 2.368935148118495, |
| "grad_norm": 5.348623275756836, |
| "learning_rate": 3.395065230663996e-05, |
| "loss": 49.6679, |
| "step": 694 |
| }, |
| { |
| "epoch": 2.3757672804910595, |
| "grad_norm": 5.050134181976318, |
| "learning_rate": 3.386512217606339e-05, |
| "loss": 48.0534, |
| "step": 696 |
| }, |
| { |
| "epoch": 2.382599412863624, |
| "grad_norm": 3.7587573528289795, |
| "learning_rate": 3.3779473192688954e-05, |
| "loss": 50.3013, |
| "step": 698 |
| }, |
| { |
| "epoch": 2.3894315452361887, |
| "grad_norm": 5.177303314208984, |
| "learning_rate": 3.369370650479425e-05, |
| "loss": 48.8704, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.3894315452361887, |
| "eval_loss": 0.7940448522567749, |
| "eval_runtime": 119.8708, |
| "eval_samples_per_second": 32.91, |
| "eval_steps_per_second": 8.234, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.396263677608754, |
| "grad_norm": 4.268886089324951, |
| "learning_rate": 3.360782326223493e-05, |
| "loss": 50.0788, |
| "step": 702 |
| }, |
| { |
| "epoch": 2.4030958099813184, |
| "grad_norm": 4.847851276397705, |
| "learning_rate": 3.3521824616429285e-05, |
| "loss": 50.5298, |
| "step": 704 |
| }, |
| { |
| "epoch": 2.409927942353883, |
| "grad_norm": 4.221863746643066, |
| "learning_rate": 3.3435711720342764e-05, |
| "loss": 51.0571, |
| "step": 706 |
| }, |
| { |
| "epoch": 2.4167600747264477, |
| "grad_norm": 5.5122528076171875, |
| "learning_rate": 3.3349485728472535e-05, |
| "loss": 48.3266, |
| "step": 708 |
| }, |
| { |
| "epoch": 2.4235922070990124, |
| "grad_norm": 3.7766902446746826, |
| "learning_rate": 3.326314779683207e-05, |
| "loss": 49.9334, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.4304243394715774, |
| "grad_norm": 4.093820571899414, |
| "learning_rate": 3.3176699082935545e-05, |
| "loss": 48.4746, |
| "step": 712 |
| }, |
| { |
| "epoch": 2.437256471844142, |
| "grad_norm": 4.116121292114258, |
| "learning_rate": 3.3090140745782396e-05, |
| "loss": 48.5131, |
| "step": 714 |
| }, |
| { |
| "epoch": 2.4440886042167067, |
| "grad_norm": 5.181516647338867, |
| "learning_rate": 3.300347394584172e-05, |
| "loss": 50.4981, |
| "step": 716 |
| }, |
| { |
| "epoch": 2.4509207365892713, |
| "grad_norm": 4.464053630828857, |
| "learning_rate": 3.2916699845036816e-05, |
| "loss": 50.2301, |
| "step": 718 |
| }, |
| { |
| "epoch": 2.457752868961836, |
| "grad_norm": 4.229206562042236, |
| "learning_rate": 3.282981960672948e-05, |
| "loss": 50.1858, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.4645850013344006, |
| "grad_norm": 3.8356049060821533, |
| "learning_rate": 3.2742834395704486e-05, |
| "loss": 48.9147, |
| "step": 722 |
| }, |
| { |
| "epoch": 2.4714171337069657, |
| "grad_norm": 3.9584670066833496, |
| "learning_rate": 3.265574537815398e-05, |
| "loss": 48.6574, |
| "step": 724 |
| }, |
| { |
| "epoch": 2.4782492660795303, |
| "grad_norm": 4.802350997924805, |
| "learning_rate": 3.25685537216618e-05, |
| "loss": 48.9724, |
| "step": 726 |
| }, |
| { |
| "epoch": 2.485081398452095, |
| "grad_norm": 4.078526020050049, |
| "learning_rate": 3.248126059518785e-05, |
| "loss": 47.7639, |
| "step": 728 |
| }, |
| { |
| "epoch": 2.4919135308246596, |
| "grad_norm": 3.8187856674194336, |
| "learning_rate": 3.2393867169052385e-05, |
| "loss": 48.2195, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.4987456631972247, |
| "grad_norm": 5.273796081542969, |
| "learning_rate": 3.230637461492043e-05, |
| "loss": 49.7512, |
| "step": 732 |
| }, |
| { |
| "epoch": 2.5055777955697893, |
| "grad_norm": 4.126491069793701, |
| "learning_rate": 3.221878410578593e-05, |
| "loss": 49.0844, |
| "step": 734 |
| }, |
| { |
| "epoch": 2.512409927942354, |
| "grad_norm": 4.665433406829834, |
| "learning_rate": 3.213109681595612e-05, |
| "loss": 48.7829, |
| "step": 736 |
| }, |
| { |
| "epoch": 2.5192420603149186, |
| "grad_norm": 4.897470951080322, |
| "learning_rate": 3.2043313921035743e-05, |
| "loss": 49.5252, |
| "step": 738 |
| }, |
| { |
| "epoch": 2.5260741926874832, |
| "grad_norm": 5.257498264312744, |
| "learning_rate": 3.195543659791132e-05, |
| "loss": 50.4767, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.532906325060048, |
| "grad_norm": 3.754957914352417, |
| "learning_rate": 3.186746602473533e-05, |
| "loss": 49.4055, |
| "step": 742 |
| }, |
| { |
| "epoch": 2.5397384574326125, |
| "grad_norm": 3.994774341583252, |
| "learning_rate": 3.177940338091043e-05, |
| "loss": 49.3039, |
| "step": 744 |
| }, |
| { |
| "epoch": 2.5465705898051776, |
| "grad_norm": 4.923650741577148, |
| "learning_rate": 3.169124984707367e-05, |
| "loss": 48.6568, |
| "step": 746 |
| }, |
| { |
| "epoch": 2.5534027221777422, |
| "grad_norm": 6.377063274383545, |
| "learning_rate": 3.160300660508064e-05, |
| "loss": 48.7655, |
| "step": 748 |
| }, |
| { |
| "epoch": 2.560234854550307, |
| "grad_norm": 3.7124524116516113, |
| "learning_rate": 3.151467483798961e-05, |
| "loss": 48.0997, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.560234854550307, |
| "eval_loss": 0.7798339128494263, |
| "eval_runtime": 119.2173, |
| "eval_samples_per_second": 33.091, |
| "eval_steps_per_second": 8.279, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.5670669869228715, |
| "grad_norm": 4.752464294433594, |
| "learning_rate": 3.14262557300457e-05, |
| "loss": 48.422, |
| "step": 752 |
| }, |
| { |
| "epoch": 2.5738991192954366, |
| "grad_norm": 4.635769844055176, |
| "learning_rate": 3.1337750466665e-05, |
| "loss": 48.9177, |
| "step": 754 |
| }, |
| { |
| "epoch": 2.580731251668001, |
| "grad_norm": 4.357526779174805, |
| "learning_rate": 3.124916023441865e-05, |
| "loss": 49.4801, |
| "step": 756 |
| }, |
| { |
| "epoch": 2.587563384040566, |
| "grad_norm": 16.189651489257812, |
| "learning_rate": 3.116048622101694e-05, |
| "loss": 49.275, |
| "step": 758 |
| }, |
| { |
| "epoch": 2.5943955164131305, |
| "grad_norm": 3.983285903930664, |
| "learning_rate": 3.107172961529343e-05, |
| "loss": 47.968, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.601227648785695, |
| "grad_norm": 4.357701301574707, |
| "learning_rate": 3.098289160718895e-05, |
| "loss": 47.8592, |
| "step": 762 |
| }, |
| { |
| "epoch": 2.6080597811582598, |
| "grad_norm": 3.9686052799224854, |
| "learning_rate": 3.0893973387735687e-05, |
| "loss": 49.5191, |
| "step": 764 |
| }, |
| { |
| "epoch": 2.6148919135308244, |
| "grad_norm": 3.9062581062316895, |
| "learning_rate": 3.0804976149041195e-05, |
| "loss": 48.5485, |
| "step": 766 |
| }, |
| { |
| "epoch": 2.6217240459033895, |
| "grad_norm": 4.7290143966674805, |
| "learning_rate": 3.071590108427244e-05, |
| "loss": 49.2073, |
| "step": 768 |
| }, |
| { |
| "epoch": 2.628556178275954, |
| "grad_norm": 4.57703161239624, |
| "learning_rate": 3.062674938763976e-05, |
| "loss": 49.7624, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.6353883106485188, |
| "grad_norm": 4.4061737060546875, |
| "learning_rate": 3.0537522254380905e-05, |
| "loss": 49.0566, |
| "step": 772 |
| }, |
| { |
| "epoch": 2.6422204430210834, |
| "grad_norm": 4.166697978973389, |
| "learning_rate": 3.044822088074496e-05, |
| "loss": 49.3193, |
| "step": 774 |
| }, |
| { |
| "epoch": 2.6490525753936485, |
| "grad_norm": 3.5513172149658203, |
| "learning_rate": 3.0358846463976372e-05, |
| "loss": 48.9675, |
| "step": 776 |
| }, |
| { |
| "epoch": 2.655884707766213, |
| "grad_norm": 4.9701995849609375, |
| "learning_rate": 3.026940020229882e-05, |
| "loss": 49.6229, |
| "step": 778 |
| }, |
| { |
| "epoch": 2.6627168401387777, |
| "grad_norm": 4.223094463348389, |
| "learning_rate": 3.017988329489923e-05, |
| "loss": 47.1613, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.6695489725113424, |
| "grad_norm": 4.849906921386719, |
| "learning_rate": 3.0090296941911633e-05, |
| "loss": 47.5764, |
| "step": 782 |
| }, |
| { |
| "epoch": 2.676381104883907, |
| "grad_norm": 3.507953643798828, |
| "learning_rate": 3.0000642344401113e-05, |
| "loss": 47.1944, |
| "step": 784 |
| }, |
| { |
| "epoch": 2.6832132372564717, |
| "grad_norm": 4.040694713592529, |
| "learning_rate": 2.9910920704347696e-05, |
| "loss": 48.6472, |
| "step": 786 |
| }, |
| { |
| "epoch": 2.6900453696290363, |
| "grad_norm": 5.141117095947266, |
| "learning_rate": 2.9821133224630226e-05, |
| "loss": 47.177, |
| "step": 788 |
| }, |
| { |
| "epoch": 2.6968775020016014, |
| "grad_norm": 4.463181018829346, |
| "learning_rate": 2.9731281109010256e-05, |
| "loss": 47.4283, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.703709634374166, |
| "grad_norm": 3.586456060409546, |
| "learning_rate": 2.9641365562115887e-05, |
| "loss": 48.9784, |
| "step": 792 |
| }, |
| { |
| "epoch": 2.7105417667467306, |
| "grad_norm": 3.9780969619750977, |
| "learning_rate": 2.9551387789425638e-05, |
| "loss": 48.601, |
| "step": 794 |
| }, |
| { |
| "epoch": 2.7173738991192953, |
| "grad_norm": 4.445759296417236, |
| "learning_rate": 2.9461348997252265e-05, |
| "loss": 49.9106, |
| "step": 796 |
| }, |
| { |
| "epoch": 2.7242060314918604, |
| "grad_norm": 4.416858673095703, |
| "learning_rate": 2.9371250392726614e-05, |
| "loss": 48.3298, |
| "step": 798 |
| }, |
| { |
| "epoch": 2.731038163864425, |
| "grad_norm": 4.36728572845459, |
| "learning_rate": 2.9281093183781403e-05, |
| "loss": 48.6063, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.731038163864425, |
| "eval_loss": 0.7699871063232422, |
| "eval_runtime": 119.5951, |
| "eval_samples_per_second": 32.986, |
| "eval_steps_per_second": 8.253, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.7378702962369896, |
| "grad_norm": 5.540378570556641, |
| "learning_rate": 2.919087857913508e-05, |
| "loss": 49.4323, |
| "step": 802 |
| }, |
| { |
| "epoch": 2.7447024286095543, |
| "grad_norm": 3.73681640625, |
| "learning_rate": 2.9100607788275545e-05, |
| "loss": 49.0439, |
| "step": 804 |
| }, |
| { |
| "epoch": 2.751534560982119, |
| "grad_norm": 4.437684535980225, |
| "learning_rate": 2.9010282021444008e-05, |
| "loss": 48.8682, |
| "step": 806 |
| }, |
| { |
| "epoch": 2.7583666933546835, |
| "grad_norm": 4.933871746063232, |
| "learning_rate": 2.891990248961871e-05, |
| "loss": 48.0791, |
| "step": 808 |
| }, |
| { |
| "epoch": 2.7651988257272486, |
| "grad_norm": 4.351380825042725, |
| "learning_rate": 2.8829470404498697e-05, |
| "loss": 47.0584, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.7720309580998133, |
| "grad_norm": 4.953640937805176, |
| "learning_rate": 2.8738986978487625e-05, |
| "loss": 50.0531, |
| "step": 812 |
| }, |
| { |
| "epoch": 2.778863090472378, |
| "grad_norm": 3.676950216293335, |
| "learning_rate": 2.8648453424677434e-05, |
| "loss": 46.9994, |
| "step": 814 |
| }, |
| { |
| "epoch": 2.7856952228449425, |
| "grad_norm": 4.177380084991455, |
| "learning_rate": 2.8557870956832132e-05, |
| "loss": 48.3932, |
| "step": 816 |
| }, |
| { |
| "epoch": 2.7925273552175076, |
| "grad_norm": 4.177119731903076, |
| "learning_rate": 2.846724078937149e-05, |
| "loss": 48.2385, |
| "step": 818 |
| }, |
| { |
| "epoch": 2.7993594875900722, |
| "grad_norm": 4.261831283569336, |
| "learning_rate": 2.8376564137354795e-05, |
| "loss": 48.813, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.806191619962637, |
| "grad_norm": 3.7779037952423096, |
| "learning_rate": 2.8285842216464543e-05, |
| "loss": 48.801, |
| "step": 822 |
| }, |
| { |
| "epoch": 2.8130237523352015, |
| "grad_norm": 5.378250598907471, |
| "learning_rate": 2.8195076242990122e-05, |
| "loss": 45.9584, |
| "step": 824 |
| }, |
| { |
| "epoch": 2.819855884707766, |
| "grad_norm": 3.5369153022766113, |
| "learning_rate": 2.8104267433811533e-05, |
| "loss": 46.97, |
| "step": 826 |
| }, |
| { |
| "epoch": 2.826688017080331, |
| "grad_norm": 3.493602991104126, |
| "learning_rate": 2.8013417006383076e-05, |
| "loss": 46.7352, |
| "step": 828 |
| }, |
| { |
| "epoch": 2.8335201494528954, |
| "grad_norm": 5.41981840133667, |
| "learning_rate": 2.7922526178717017e-05, |
| "loss": 48.4586, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.8403522818254605, |
| "grad_norm": 4.6053948402404785, |
| "learning_rate": 2.783159616936723e-05, |
| "loss": 46.5008, |
| "step": 832 |
| }, |
| { |
| "epoch": 2.847184414198025, |
| "grad_norm": 4.136333465576172, |
| "learning_rate": 2.774062819741293e-05, |
| "loss": 47.3448, |
| "step": 834 |
| }, |
| { |
| "epoch": 2.85401654657059, |
| "grad_norm": 3.927877187728882, |
| "learning_rate": 2.764962348244228e-05, |
| "loss": 46.7369, |
| "step": 836 |
| }, |
| { |
| "epoch": 2.8608486789431544, |
| "grad_norm": 4.283491611480713, |
| "learning_rate": 2.7558583244536007e-05, |
| "loss": 48.098, |
| "step": 838 |
| }, |
| { |
| "epoch": 2.8676808113157195, |
| "grad_norm": 3.802030563354492, |
| "learning_rate": 2.7467508704251137e-05, |
| "loss": 48.2908, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.874512943688284, |
| "grad_norm": 5.212815761566162, |
| "learning_rate": 2.7376401082604564e-05, |
| "loss": 47.8921, |
| "step": 842 |
| }, |
| { |
| "epoch": 2.8813450760608488, |
| "grad_norm": 4.39296293258667, |
| "learning_rate": 2.7285261601056698e-05, |
| "loss": 48.2491, |
| "step": 844 |
| }, |
| { |
| "epoch": 2.8881772084334134, |
| "grad_norm": 5.428844928741455, |
| "learning_rate": 2.7194091481495076e-05, |
| "loss": 49.1209, |
| "step": 846 |
| }, |
| { |
| "epoch": 2.895009340805978, |
| "grad_norm": 3.9836559295654297, |
| "learning_rate": 2.7102891946217994e-05, |
| "loss": 47.0515, |
| "step": 848 |
| }, |
| { |
| "epoch": 2.9018414731785427, |
| "grad_norm": 3.1067824363708496, |
| "learning_rate": 2.7011664217918154e-05, |
| "loss": 46.0087, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.9018414731785427, |
| "eval_loss": 0.760260820388794, |
| "eval_runtime": 119.6698, |
| "eval_samples_per_second": 32.966, |
| "eval_steps_per_second": 8.248, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.9086736055511073, |
| "grad_norm": 4.688024997711182, |
| "learning_rate": 2.6920409519666174e-05, |
| "loss": 47.0489, |
| "step": 852 |
| }, |
| { |
| "epoch": 2.9155057379236724, |
| "grad_norm": 4.777935981750488, |
| "learning_rate": 2.6829129074894304e-05, |
| "loss": 48.1153, |
| "step": 854 |
| }, |
| { |
| "epoch": 2.922337870296237, |
| "grad_norm": 4.912516117095947, |
| "learning_rate": 2.6737824107379948e-05, |
| "loss": 48.0798, |
| "step": 856 |
| }, |
| { |
| "epoch": 2.9291700026688017, |
| "grad_norm": 4.066973686218262, |
| "learning_rate": 2.6646495841229287e-05, |
| "loss": 46.9194, |
| "step": 858 |
| }, |
| { |
| "epoch": 2.9360021350413663, |
| "grad_norm": 4.499208927154541, |
| "learning_rate": 2.655514550086086e-05, |
| "loss": 48.3087, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.9428342674139314, |
| "grad_norm": 4.891952991485596, |
| "learning_rate": 2.6463774310989154e-05, |
| "loss": 46.8565, |
| "step": 862 |
| }, |
| { |
| "epoch": 2.949666399786496, |
| "grad_norm": 3.8262720108032227, |
| "learning_rate": 2.637238349660819e-05, |
| "loss": 46.7596, |
| "step": 864 |
| }, |
| { |
| "epoch": 2.9564985321590607, |
| "grad_norm": 5.6072492599487305, |
| "learning_rate": 2.6280974282975063e-05, |
| "loss": 45.254, |
| "step": 866 |
| }, |
| { |
| "epoch": 2.9633306645316253, |
| "grad_norm": 3.9889800548553467, |
| "learning_rate": 2.6189547895593562e-05, |
| "loss": 46.754, |
| "step": 868 |
| }, |
| { |
| "epoch": 2.97016279690419, |
| "grad_norm": 3.7260525226593018, |
| "learning_rate": 2.6098105560197722e-05, |
| "loss": 46.6516, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.9769949292767546, |
| "grad_norm": 4.090394973754883, |
| "learning_rate": 2.600664850273538e-05, |
| "loss": 47.2404, |
| "step": 872 |
| }, |
| { |
| "epoch": 2.983827061649319, |
| "grad_norm": 3.6287267208099365, |
| "learning_rate": 2.5915177949351765e-05, |
| "loss": 46.3821, |
| "step": 874 |
| }, |
| { |
| "epoch": 2.9906591940218843, |
| "grad_norm": 3.5229976177215576, |
| "learning_rate": 2.582369512637302e-05, |
| "loss": 46.8471, |
| "step": 876 |
| }, |
| { |
| "epoch": 2.997491326394449, |
| "grad_norm": 3.532615900039673, |
| "learning_rate": 2.5732201260289806e-05, |
| "loss": 47.0364, |
| "step": 878 |
| }, |
| { |
| "epoch": 3.0034160661862823, |
| "grad_norm": 3.482403039932251, |
| "learning_rate": 2.564069757774082e-05, |
| "loss": 40.3241, |
| "step": 880 |
| }, |
| { |
| "epoch": 3.010248198558847, |
| "grad_norm": 3.94649600982666, |
| "learning_rate": 2.554918530549637e-05, |
| "loss": 46.7226, |
| "step": 882 |
| }, |
| { |
| "epoch": 3.0170803309314116, |
| "grad_norm": 4.395301818847656, |
| "learning_rate": 2.545766567044194e-05, |
| "loss": 45.266, |
| "step": 884 |
| }, |
| { |
| "epoch": 3.0239124633039767, |
| "grad_norm": 4.813998699188232, |
| "learning_rate": 2.5366139899561696e-05, |
| "loss": 46.8651, |
| "step": 886 |
| }, |
| { |
| "epoch": 3.0307445956765413, |
| "grad_norm": 5.5799174308776855, |
| "learning_rate": 2.527460921992209e-05, |
| "loss": 46.5727, |
| "step": 888 |
| }, |
| { |
| "epoch": 3.037576728049106, |
| "grad_norm": 6.693199634552002, |
| "learning_rate": 2.518307485865538e-05, |
| "loss": 47.987, |
| "step": 890 |
| }, |
| { |
| "epoch": 3.0444088604216706, |
| "grad_norm": 6.33953332901001, |
| "learning_rate": 2.509153804294318e-05, |
| "loss": 45.7221, |
| "step": 892 |
| }, |
| { |
| "epoch": 3.051240992794235, |
| "grad_norm": 4.887784957885742, |
| "learning_rate": 2.5e-05, |
| "loss": 44.5186, |
| "step": 894 |
| }, |
| { |
| "epoch": 3.0580731251668003, |
| "grad_norm": 4.337290287017822, |
| "learning_rate": 2.490846195705683e-05, |
| "loss": 46.394, |
| "step": 896 |
| }, |
| { |
| "epoch": 3.064905257539365, |
| "grad_norm": 3.7094030380249023, |
| "learning_rate": 2.4816925141344623e-05, |
| "loss": 45.122, |
| "step": 898 |
| }, |
| { |
| "epoch": 3.0717373899119296, |
| "grad_norm": 3.71903920173645, |
| "learning_rate": 2.4725390780077908e-05, |
| "loss": 44.7121, |
| "step": 900 |
| }, |
| { |
| "epoch": 3.0717373899119296, |
| "eval_loss": 0.7495905160903931, |
| "eval_runtime": 119.7503, |
| "eval_samples_per_second": 32.944, |
| "eval_steps_per_second": 8.242, |
| "step": 900 |
| }, |
| { |
| "epoch": 3.078569522284494, |
| "grad_norm": 4.690406799316406, |
| "learning_rate": 2.4633860100438316e-05, |
| "loss": 45.6299, |
| "step": 902 |
| }, |
| { |
| "epoch": 3.085401654657059, |
| "grad_norm": 4.29756498336792, |
| "learning_rate": 2.4542334329558077e-05, |
| "loss": 48.2504, |
| "step": 904 |
| }, |
| { |
| "epoch": 3.092233787029624, |
| "grad_norm": 5.62404727935791, |
| "learning_rate": 2.4450814694503636e-05, |
| "loss": 47.6091, |
| "step": 906 |
| }, |
| { |
| "epoch": 3.0990659194021886, |
| "grad_norm": 3.726529836654663, |
| "learning_rate": 2.435930242225919e-05, |
| "loss": 46.4755, |
| "step": 908 |
| }, |
| { |
| "epoch": 3.105898051774753, |
| "grad_norm": 6.04416036605835, |
| "learning_rate": 2.4267798739710203e-05, |
| "loss": 46.9715, |
| "step": 910 |
| }, |
| { |
| "epoch": 3.112730184147318, |
| "grad_norm": 3.8375885486602783, |
| "learning_rate": 2.4176304873626985e-05, |
| "loss": 47.9794, |
| "step": 912 |
| }, |
| { |
| "epoch": 3.1195623165198825, |
| "grad_norm": 3.296687602996826, |
| "learning_rate": 2.4084822050648237e-05, |
| "loss": 45.0776, |
| "step": 914 |
| }, |
| { |
| "epoch": 3.126394448892447, |
| "grad_norm": 3.546963930130005, |
| "learning_rate": 2.399335149726463e-05, |
| "loss": 44.6584, |
| "step": 916 |
| }, |
| { |
| "epoch": 3.133226581265012, |
| "grad_norm": 3.896601676940918, |
| "learning_rate": 2.390189443980229e-05, |
| "loss": 47.0284, |
| "step": 918 |
| }, |
| { |
| "epoch": 3.140058713637577, |
| "grad_norm": 3.570570468902588, |
| "learning_rate": 2.3810452104406444e-05, |
| "loss": 46.4413, |
| "step": 920 |
| }, |
| { |
| "epoch": 3.1468908460101415, |
| "grad_norm": 4.160488605499268, |
| "learning_rate": 2.3719025717024946e-05, |
| "loss": 47.1564, |
| "step": 922 |
| }, |
| { |
| "epoch": 3.153722978382706, |
| "grad_norm": 5.714613914489746, |
| "learning_rate": 2.3627616503391814e-05, |
| "loss": 48.2275, |
| "step": 924 |
| }, |
| { |
| "epoch": 3.1605551107552707, |
| "grad_norm": 4.362124919891357, |
| "learning_rate": 2.3536225689010845e-05, |
| "loss": 47.0592, |
| "step": 926 |
| }, |
| { |
| "epoch": 3.167387243127836, |
| "grad_norm": 6.478647708892822, |
| "learning_rate": 2.3444854499139142e-05, |
| "loss": 47.4139, |
| "step": 928 |
| }, |
| { |
| "epoch": 3.1742193755004005, |
| "grad_norm": 3.713979721069336, |
| "learning_rate": 2.3353504158770722e-05, |
| "loss": 47.7301, |
| "step": 930 |
| }, |
| { |
| "epoch": 3.181051507872965, |
| "grad_norm": 3.875537872314453, |
| "learning_rate": 2.3262175892620065e-05, |
| "loss": 45.6112, |
| "step": 932 |
| }, |
| { |
| "epoch": 3.1878836402455297, |
| "grad_norm": 5.328731536865234, |
| "learning_rate": 2.3170870925105702e-05, |
| "loss": 46.6125, |
| "step": 934 |
| }, |
| { |
| "epoch": 3.1947157726180944, |
| "grad_norm": 5.152383327484131, |
| "learning_rate": 2.307959048033383e-05, |
| "loss": 45.6076, |
| "step": 936 |
| }, |
| { |
| "epoch": 3.201547904990659, |
| "grad_norm": 4.689112186431885, |
| "learning_rate": 2.2988335782081855e-05, |
| "loss": 45.648, |
| "step": 938 |
| }, |
| { |
| "epoch": 3.208380037363224, |
| "grad_norm": 3.3412325382232666, |
| "learning_rate": 2.2897108053782e-05, |
| "loss": 44.4993, |
| "step": 940 |
| }, |
| { |
| "epoch": 3.2152121697357887, |
| "grad_norm": 11.583976745605469, |
| "learning_rate": 2.280590851850493e-05, |
| "loss": 46.3174, |
| "step": 942 |
| }, |
| { |
| "epoch": 3.2220443021083534, |
| "grad_norm": 4.012174606323242, |
| "learning_rate": 2.271473839894331e-05, |
| "loss": 46.3054, |
| "step": 944 |
| }, |
| { |
| "epoch": 3.228876434480918, |
| "grad_norm": 6.315187931060791, |
| "learning_rate": 2.2623598917395438e-05, |
| "loss": 44.3273, |
| "step": 946 |
| }, |
| { |
| "epoch": 3.2357085668534826, |
| "grad_norm": 5.612927436828613, |
| "learning_rate": 2.253249129574887e-05, |
| "loss": 46.8669, |
| "step": 948 |
| }, |
| { |
| "epoch": 3.2425406992260477, |
| "grad_norm": 3.7026705741882324, |
| "learning_rate": 2.2441416755463995e-05, |
| "loss": 46.4012, |
| "step": 950 |
| }, |
| { |
| "epoch": 3.2425406992260477, |
| "eval_loss": 0.7383518218994141, |
| "eval_runtime": 118.6959, |
| "eval_samples_per_second": 33.236, |
| "eval_steps_per_second": 8.315, |
| "step": 950 |
| }, |
| { |
| "epoch": 3.2493728315986123, |
| "grad_norm": 4.251457214355469, |
| "learning_rate": 2.2350376517557727e-05, |
| "loss": 47.1319, |
| "step": 952 |
| }, |
| { |
| "epoch": 3.256204963971177, |
| "grad_norm": 4.500071048736572, |
| "learning_rate": 2.2259371802587068e-05, |
| "loss": 47.0883, |
| "step": 954 |
| }, |
| { |
| "epoch": 3.2630370963437416, |
| "grad_norm": 4.684493064880371, |
| "learning_rate": 2.216840383063277e-05, |
| "loss": 45.0587, |
| "step": 956 |
| }, |
| { |
| "epoch": 3.2698692287163063, |
| "grad_norm": 3.853529453277588, |
| "learning_rate": 2.2077473821282996e-05, |
| "loss": 46.3262, |
| "step": 958 |
| }, |
| { |
| "epoch": 3.276701361088871, |
| "grad_norm": 5.501523971557617, |
| "learning_rate": 2.1986582993616926e-05, |
| "loss": 44.8375, |
| "step": 960 |
| }, |
| { |
| "epoch": 3.283533493461436, |
| "grad_norm": 15.540706634521484, |
| "learning_rate": 2.1895732566188476e-05, |
| "loss": 45.117, |
| "step": 962 |
| }, |
| { |
| "epoch": 3.2903656258340006, |
| "grad_norm": 2.6855862140655518, |
| "learning_rate": 2.1804923757009884e-05, |
| "loss": 45.9567, |
| "step": 964 |
| }, |
| { |
| "epoch": 3.2971977582065652, |
| "grad_norm": 4.529240131378174, |
| "learning_rate": 2.1714157783535463e-05, |
| "loss": 44.7532, |
| "step": 966 |
| }, |
| { |
| "epoch": 3.30402989057913, |
| "grad_norm": 4.690282344818115, |
| "learning_rate": 2.1623435862645204e-05, |
| "loss": 45.8376, |
| "step": 968 |
| }, |
| { |
| "epoch": 3.3108620229516945, |
| "grad_norm": 5.309507846832275, |
| "learning_rate": 2.153275921062851e-05, |
| "loss": 46.1757, |
| "step": 970 |
| }, |
| { |
| "epoch": 3.3176941553242596, |
| "grad_norm": 4.278385639190674, |
| "learning_rate": 2.1442129043167874e-05, |
| "loss": 46.6388, |
| "step": 972 |
| }, |
| { |
| "epoch": 3.3245262876968242, |
| "grad_norm": 4.2424516677856445, |
| "learning_rate": 2.1351546575322572e-05, |
| "loss": 45.1695, |
| "step": 974 |
| }, |
| { |
| "epoch": 3.331358420069389, |
| "grad_norm": 3.695155143737793, |
| "learning_rate": 2.126101302151238e-05, |
| "loss": 45.9417, |
| "step": 976 |
| }, |
| { |
| "epoch": 3.3381905524419535, |
| "grad_norm": 4.2003374099731445, |
| "learning_rate": 2.1170529595501305e-05, |
| "loss": 44.4002, |
| "step": 978 |
| }, |
| { |
| "epoch": 3.345022684814518, |
| "grad_norm": 4.378734588623047, |
| "learning_rate": 2.1080097510381298e-05, |
| "loss": 45.4517, |
| "step": 980 |
| }, |
| { |
| "epoch": 3.351854817187083, |
| "grad_norm": 3.96730637550354, |
| "learning_rate": 2.098971797855599e-05, |
| "loss": 43.9996, |
| "step": 982 |
| }, |
| { |
| "epoch": 3.358686949559648, |
| "grad_norm": 3.6162188053131104, |
| "learning_rate": 2.089939221172446e-05, |
| "loss": 43.9178, |
| "step": 984 |
| }, |
| { |
| "epoch": 3.3655190819322125, |
| "grad_norm": 4.3834099769592285, |
| "learning_rate": 2.0809121420864923e-05, |
| "loss": 46.2701, |
| "step": 986 |
| }, |
| { |
| "epoch": 3.372351214304777, |
| "grad_norm": 4.271561145782471, |
| "learning_rate": 2.07189068162186e-05, |
| "loss": 45.7546, |
| "step": 988 |
| }, |
| { |
| "epoch": 3.3791833466773418, |
| "grad_norm": 3.5791757106781006, |
| "learning_rate": 2.0628749607273396e-05, |
| "loss": 45.3079, |
| "step": 990 |
| }, |
| { |
| "epoch": 3.3860154790499064, |
| "grad_norm": 4.5101318359375, |
| "learning_rate": 2.0538651002747744e-05, |
| "loss": 46.5476, |
| "step": 992 |
| }, |
| { |
| "epoch": 3.3928476114224715, |
| "grad_norm": 5.944687366485596, |
| "learning_rate": 2.0448612210574365e-05, |
| "loss": 44.0355, |
| "step": 994 |
| }, |
| { |
| "epoch": 3.399679743795036, |
| "grad_norm": 4.936254501342773, |
| "learning_rate": 2.0358634437884112e-05, |
| "loss": 46.0717, |
| "step": 996 |
| }, |
| { |
| "epoch": 3.4065118761676008, |
| "grad_norm": 4.114757537841797, |
| "learning_rate": 2.0268718890989753e-05, |
| "loss": 44.5295, |
| "step": 998 |
| }, |
| { |
| "epoch": 3.4133440085401654, |
| "grad_norm": 8.12585735321045, |
| "learning_rate": 2.0178866775369777e-05, |
| "loss": 45.0747, |
| "step": 1000 |
| }, |
| { |
| "epoch": 3.4133440085401654, |
| "eval_loss": 0.7275528907775879, |
| "eval_runtime": 119.5885, |
| "eval_samples_per_second": 32.988, |
| "eval_steps_per_second": 8.253, |
| "step": 1000 |
| }, |
| { |
| "epoch": 3.4304243394715774, |
| "grad_norm": 4.9336113929748535, |
| "learning_rate": 2.0089079295652306e-05, |
| "loss": 45.5736, |
| "step": 1002 |
| }, |
| { |
| "epoch": 3.437256471844142, |
| "grad_norm": 5.042412757873535, |
| "learning_rate": 1.9999357655598893e-05, |
| "loss": 45.6651, |
| "step": 1004 |
| }, |
| { |
| "epoch": 3.4440886042167067, |
| "grad_norm": 3.9377660751342773, |
| "learning_rate": 1.9909703058088376e-05, |
| "loss": 44.5559, |
| "step": 1006 |
| }, |
| { |
| "epoch": 3.4509207365892713, |
| "grad_norm": 4.054321765899658, |
| "learning_rate": 1.9820116705100777e-05, |
| "loss": 45.1868, |
| "step": 1008 |
| }, |
| { |
| "epoch": 3.457752868961836, |
| "grad_norm": 4.860738277435303, |
| "learning_rate": 1.9730599797701177e-05, |
| "loss": 44.6737, |
| "step": 1010 |
| }, |
| { |
| "epoch": 3.4645850013344006, |
| "grad_norm": 3.950925827026367, |
| "learning_rate": 1.9641153536023644e-05, |
| "loss": 43.7733, |
| "step": 1012 |
| }, |
| { |
| "epoch": 3.4714171337069657, |
| "grad_norm": 3.831669569015503, |
| "learning_rate": 1.9551779119255043e-05, |
| "loss": 43.7403, |
| "step": 1014 |
| }, |
| { |
| "epoch": 3.4782492660795303, |
| "grad_norm": 4.114947319030762, |
| "learning_rate": 1.9462477745619108e-05, |
| "loss": 45.5074, |
| "step": 1016 |
| }, |
| { |
| "epoch": 3.485081398452095, |
| "grad_norm": 3.405243158340454, |
| "learning_rate": 1.9373250612360246e-05, |
| "loss": 46.4417, |
| "step": 1018 |
| }, |
| { |
| "epoch": 3.4919135308246596, |
| "grad_norm": 4.80495023727417, |
| "learning_rate": 1.928409891572757e-05, |
| "loss": 44.9758, |
| "step": 1020 |
| }, |
| { |
| "epoch": 3.4987456631972247, |
| "grad_norm": 4.239831447601318, |
| "learning_rate": 1.919502385095881e-05, |
| "loss": 44.6174, |
| "step": 1022 |
| }, |
| { |
| "epoch": 3.5055777955697893, |
| "grad_norm": 4.724026203155518, |
| "learning_rate": 1.9106026612264316e-05, |
| "loss": 44.7325, |
| "step": 1024 |
| }, |
| { |
| "epoch": 3.512409927942354, |
| "grad_norm": 3.4634554386138916, |
| "learning_rate": 1.9017108392811065e-05, |
| "loss": 43.7796, |
| "step": 1026 |
| }, |
| { |
| "epoch": 3.5192420603149186, |
| "grad_norm": 4.715716361999512, |
| "learning_rate": 1.8928270384706584e-05, |
| "loss": 45.2777, |
| "step": 1028 |
| }, |
| { |
| "epoch": 3.5260741926874832, |
| "grad_norm": 5.100541114807129, |
| "learning_rate": 1.8839513778983066e-05, |
| "loss": 46.4359, |
| "step": 1030 |
| }, |
| { |
| "epoch": 3.532906325060048, |
| "grad_norm": 4.475189685821533, |
| "learning_rate": 1.875083976558136e-05, |
| "loss": 44.0298, |
| "step": 1032 |
| }, |
| { |
| "epoch": 3.5397384574326125, |
| "grad_norm": 4.431650161743164, |
| "learning_rate": 1.8662249533335003e-05, |
| "loss": 44.2631, |
| "step": 1034 |
| }, |
| { |
| "epoch": 3.5465705898051776, |
| "grad_norm": 4.561038970947266, |
| "learning_rate": 1.8573744269954298e-05, |
| "loss": 43.9968, |
| "step": 1036 |
| }, |
| { |
| "epoch": 3.5534027221777422, |
| "grad_norm": 3.4181675910949707, |
| "learning_rate": 1.848532516201039e-05, |
| "loss": 43.372, |
| "step": 1038 |
| }, |
| { |
| "epoch": 3.560234854550307, |
| "grad_norm": 4.05961799621582, |
| "learning_rate": 1.8396993394919372e-05, |
| "loss": 43.5887, |
| "step": 1040 |
| }, |
| { |
| "epoch": 3.5670669869228715, |
| "grad_norm": 4.183586597442627, |
| "learning_rate": 1.8308750152926337e-05, |
| "loss": 43.1976, |
| "step": 1042 |
| }, |
| { |
| "epoch": 3.5738991192954366, |
| "grad_norm": 4.6883745193481445, |
| "learning_rate": 1.8220596619089576e-05, |
| "loss": 44.4463, |
| "step": 1044 |
| }, |
| { |
| "epoch": 3.580731251668001, |
| "grad_norm": 4.490588665008545, |
| "learning_rate": 1.8132533975264682e-05, |
| "loss": 44.3332, |
| "step": 1046 |
| }, |
| { |
| "epoch": 3.587563384040566, |
| "grad_norm": 4.937854766845703, |
| "learning_rate": 1.8044563402088684e-05, |
| "loss": 45.1199, |
| "step": 1048 |
| }, |
| { |
| "epoch": 3.5943955164131305, |
| "grad_norm": 3.8182907104492188, |
| "learning_rate": 1.795668607896426e-05, |
| "loss": 45.2035, |
| "step": 1050 |
| }, |
| { |
| "epoch": 3.5943955164131305, |
| "eval_loss": 0.7135393619537354, |
| "eval_runtime": 130.7813, |
| "eval_samples_per_second": 30.165, |
| "eval_steps_per_second": 7.547, |
| "step": 1050 |
| }, |
| { |
| "epoch": 3.601227648785695, |
| "grad_norm": 3.3739826679229736, |
| "learning_rate": 1.7868903184043887e-05, |
| "loss": 43.5257, |
| "step": 1052 |
| }, |
| { |
| "epoch": 3.6080597811582598, |
| "grad_norm": 3.8119192123413086, |
| "learning_rate": 1.7781215894214078e-05, |
| "loss": 44.9718, |
| "step": 1054 |
| }, |
| { |
| "epoch": 3.6148919135308244, |
| "grad_norm": 3.6780483722686768, |
| "learning_rate": 1.7693625385079577e-05, |
| "loss": 44.496, |
| "step": 1056 |
| }, |
| { |
| "epoch": 3.6217240459033895, |
| "grad_norm": 4.625596523284912, |
| "learning_rate": 1.7606132830947614e-05, |
| "loss": 43.6496, |
| "step": 1058 |
| }, |
| { |
| "epoch": 3.628556178275954, |
| "grad_norm": 5.467988967895508, |
| "learning_rate": 1.7518739404812155e-05, |
| "loss": 45.3773, |
| "step": 1060 |
| }, |
| { |
| "epoch": 3.6353883106485188, |
| "grad_norm": 3.7848103046417236, |
| "learning_rate": 1.7431446278338197e-05, |
| "loss": 43.6622, |
| "step": 1062 |
| }, |
| { |
| "epoch": 3.6422204430210834, |
| "grad_norm": 6.2495222091674805, |
| "learning_rate": 1.7344254621846016e-05, |
| "loss": 44.7325, |
| "step": 1064 |
| }, |
| { |
| "epoch": 3.6490525753936485, |
| "grad_norm": 4.541433811187744, |
| "learning_rate": 1.7257165604295513e-05, |
| "loss": 45.7111, |
| "step": 1066 |
| }, |
| { |
| "epoch": 3.655884707766213, |
| "grad_norm": 3.6900789737701416, |
| "learning_rate": 1.7170180393270532e-05, |
| "loss": 46.2799, |
| "step": 1068 |
| }, |
| { |
| "epoch": 3.6627168401387777, |
| "grad_norm": 3.999112129211426, |
| "learning_rate": 1.7083300154963193e-05, |
| "loss": 44.9348, |
| "step": 1070 |
| }, |
| { |
| "epoch": 3.6695489725113424, |
| "grad_norm": 4.940526008605957, |
| "learning_rate": 1.699652605415828e-05, |
| "loss": 45.9208, |
| "step": 1072 |
| }, |
| { |
| "epoch": 3.676381104883907, |
| "grad_norm": 3.8536486625671387, |
| "learning_rate": 1.6909859254217613e-05, |
| "loss": 45.3559, |
| "step": 1074 |
| }, |
| { |
| "epoch": 3.6832132372564717, |
| "grad_norm": 5.941255569458008, |
| "learning_rate": 1.682330091706446e-05, |
| "loss": 44.2183, |
| "step": 1076 |
| }, |
| { |
| "epoch": 3.6900453696290363, |
| "grad_norm": 4.6851091384887695, |
| "learning_rate": 1.6736852203167935e-05, |
| "loss": 45.0132, |
| "step": 1078 |
| }, |
| { |
| "epoch": 3.6968775020016014, |
| "grad_norm": 6.338913917541504, |
| "learning_rate": 1.6650514271527468e-05, |
| "loss": 44.5087, |
| "step": 1080 |
| }, |
| { |
| "epoch": 3.703709634374166, |
| "grad_norm": 6.134509086608887, |
| "learning_rate": 1.6564288279657252e-05, |
| "loss": 44.5929, |
| "step": 1082 |
| }, |
| { |
| "epoch": 3.7105417667467306, |
| "grad_norm": 3.0185976028442383, |
| "learning_rate": 1.647817538357072e-05, |
| "loss": 44.4708, |
| "step": 1084 |
| }, |
| { |
| "epoch": 3.7173738991192953, |
| "grad_norm": 4.479791641235352, |
| "learning_rate": 1.639217673776507e-05, |
| "loss": 44.4799, |
| "step": 1086 |
| }, |
| { |
| "epoch": 3.7242060314918604, |
| "grad_norm": 3.9354395866394043, |
| "learning_rate": 1.630629349520576e-05, |
| "loss": 43.3393, |
| "step": 1088 |
| }, |
| { |
| "epoch": 3.731038163864425, |
| "grad_norm": 4.530430316925049, |
| "learning_rate": 1.622052680731105e-05, |
| "loss": 43.1996, |
| "step": 1090 |
| }, |
| { |
| "epoch": 3.7378702962369896, |
| "grad_norm": 4.594604015350342, |
| "learning_rate": 1.613487782393661e-05, |
| "loss": 43.6473, |
| "step": 1092 |
| }, |
| { |
| "epoch": 3.7447024286095543, |
| "grad_norm": 4.38798713684082, |
| "learning_rate": 1.604934769336004e-05, |
| "loss": 43.1229, |
| "step": 1094 |
| }, |
| { |
| "epoch": 3.751534560982119, |
| "grad_norm": 4.350236415863037, |
| "learning_rate": 1.5963937562265525e-05, |
| "loss": 44.7883, |
| "step": 1096 |
| }, |
| { |
| "epoch": 3.7583666933546835, |
| "grad_norm": 4.064984321594238, |
| "learning_rate": 1.587864857572842e-05, |
| "loss": 44.1865, |
| "step": 1098 |
| }, |
| { |
| "epoch": 3.7651988257272486, |
| "grad_norm": 4.607226848602295, |
| "learning_rate": 1.5793481877199946e-05, |
| "loss": 44.6176, |
| "step": 1100 |
| }, |
| { |
| "epoch": 3.7651988257272486, |
| "eval_loss": 0.7090520858764648, |
| "eval_runtime": 136.3013, |
| "eval_samples_per_second": 28.943, |
| "eval_steps_per_second": 7.241, |
| "step": 1100 |
| }, |
| { |
| "epoch": 3.7720309580998133, |
| "grad_norm": 4.4557719230651855, |
| "learning_rate": 1.5708438608491814e-05, |
| "loss": 42.0453, |
| "step": 1102 |
| }, |
| { |
| "epoch": 3.778863090472378, |
| "grad_norm": 5.199422359466553, |
| "learning_rate": 1.5623519909760954e-05, |
| "loss": 42.589, |
| "step": 1104 |
| }, |
| { |
| "epoch": 3.7856952228449425, |
| "grad_norm": 3.632471799850464, |
| "learning_rate": 1.5538726919494206e-05, |
| "loss": 43.7924, |
| "step": 1106 |
| }, |
| { |
| "epoch": 3.7925273552175076, |
| "grad_norm": 4.203450679779053, |
| "learning_rate": 1.5454060774493068e-05, |
| "loss": 45.02, |
| "step": 1108 |
| }, |
| { |
| "epoch": 3.7993594875900722, |
| "grad_norm": 5.149316310882568, |
| "learning_rate": 1.5369522609858446e-05, |
| "loss": 44.2724, |
| "step": 1110 |
| }, |
| { |
| "epoch": 3.806191619962637, |
| "grad_norm": 3.5306341648101807, |
| "learning_rate": 1.528511355897543e-05, |
| "loss": 44.2268, |
| "step": 1112 |
| }, |
| { |
| "epoch": 3.8130237523352015, |
| "grad_norm": 4.296536445617676, |
| "learning_rate": 1.5200834753498128e-05, |
| "loss": 44.0479, |
| "step": 1114 |
| }, |
| { |
| "epoch": 3.819855884707766, |
| "grad_norm": 2.969525098800659, |
| "learning_rate": 1.5116687323334467e-05, |
| "loss": 43.5543, |
| "step": 1116 |
| }, |
| { |
| "epoch": 3.826688017080331, |
| "grad_norm": 4.044551849365234, |
| "learning_rate": 1.5032672396631056e-05, |
| "loss": 45.7925, |
| "step": 1118 |
| }, |
| { |
| "epoch": 3.8335201494528954, |
| "grad_norm": 5.003629207611084, |
| "learning_rate": 1.4948791099758052e-05, |
| "loss": 44.2037, |
| "step": 1120 |
| }, |
| { |
| "epoch": 3.8403522818254605, |
| "grad_norm": 3.4248318672180176, |
| "learning_rate": 1.486504455729408e-05, |
| "loss": 43.9243, |
| "step": 1122 |
| }, |
| { |
| "epoch": 3.847184414198025, |
| "grad_norm": 4.228148937225342, |
| "learning_rate": 1.4781433892011131e-05, |
| "loss": 44.7779, |
| "step": 1124 |
| }, |
| { |
| "epoch": 3.85401654657059, |
| "grad_norm": 4.345002174377441, |
| "learning_rate": 1.4697960224859513e-05, |
| "loss": 43.0617, |
| "step": 1126 |
| }, |
| { |
| "epoch": 3.8608486789431544, |
| "grad_norm": 4.824610233306885, |
| "learning_rate": 1.4614624674952842e-05, |
| "loss": 43.2687, |
| "step": 1128 |
| }, |
| { |
| "epoch": 3.8676808113157195, |
| "grad_norm": 5.528540134429932, |
| "learning_rate": 1.4531428359553017e-05, |
| "loss": 43.5145, |
| "step": 1130 |
| }, |
| { |
| "epoch": 3.874512943688284, |
| "grad_norm": 3.7578537464141846, |
| "learning_rate": 1.4448372394055249e-05, |
| "loss": 43.2377, |
| "step": 1132 |
| }, |
| { |
| "epoch": 3.8813450760608488, |
| "grad_norm": 3.191563367843628, |
| "learning_rate": 1.436545789197313e-05, |
| "loss": 43.493, |
| "step": 1134 |
| }, |
| { |
| "epoch": 3.8881772084334134, |
| "grad_norm": 3.1072089672088623, |
| "learning_rate": 1.4282685964923642e-05, |
| "loss": 44.5567, |
| "step": 1136 |
| }, |
| { |
| "epoch": 3.895009340805978, |
| "grad_norm": 4.651160717010498, |
| "learning_rate": 1.4200057722612336e-05, |
| "loss": 42.7739, |
| "step": 1138 |
| }, |
| { |
| "epoch": 3.9018414731785427, |
| "grad_norm": 3.203441858291626, |
| "learning_rate": 1.4117574272818388e-05, |
| "loss": 43.1438, |
| "step": 1140 |
| }, |
| { |
| "epoch": 3.9086736055511073, |
| "grad_norm": 4.5728349685668945, |
| "learning_rate": 1.4035236721379757e-05, |
| "loss": 44.305, |
| "step": 1142 |
| }, |
| { |
| "epoch": 3.9155057379236724, |
| "grad_norm": 6.874294757843018, |
| "learning_rate": 1.3953046172178414e-05, |
| "loss": 42.8162, |
| "step": 1144 |
| }, |
| { |
| "epoch": 3.922337870296237, |
| "grad_norm": 5.198761463165283, |
| "learning_rate": 1.387100372712548e-05, |
| "loss": 44.2441, |
| "step": 1146 |
| }, |
| { |
| "epoch": 3.9291700026688017, |
| "grad_norm": 3.9007508754730225, |
| "learning_rate": 1.378911048614647e-05, |
| "loss": 43.0147, |
| "step": 1148 |
| }, |
| { |
| "epoch": 3.9360021350413663, |
| "grad_norm": 3.7035725116729736, |
| "learning_rate": 1.3707367547166569e-05, |
| "loss": 45.0733, |
| "step": 1150 |
| }, |
| { |
| "epoch": 3.9360021350413663, |
| "eval_loss": 0.7048025131225586, |
| "eval_runtime": 132.7997, |
| "eval_samples_per_second": 29.706, |
| "eval_steps_per_second": 7.432, |
| "step": 1150 |
| }, |
| { |
| "epoch": 3.9428342674139314, |
| "grad_norm": 5.101466655731201, |
| "learning_rate": 1.3625776006095881e-05, |
| "loss": 42.4982, |
| "step": 1152 |
| }, |
| { |
| "epoch": 3.949666399786496, |
| "grad_norm": 4.983183860778809, |
| "learning_rate": 1.354433695681474e-05, |
| "loss": 43.3568, |
| "step": 1154 |
| }, |
| { |
| "epoch": 3.9564985321590607, |
| "grad_norm": 3.6875593662261963, |
| "learning_rate": 1.3463051491159096e-05, |
| "loss": 45.16, |
| "step": 1156 |
| }, |
| { |
| "epoch": 3.9633306645316253, |
| "grad_norm": 4.482807636260986, |
| "learning_rate": 1.3381920698905787e-05, |
| "loss": 42.8545, |
| "step": 1158 |
| }, |
| { |
| "epoch": 3.97016279690419, |
| "grad_norm": 3.858903646469116, |
| "learning_rate": 1.3300945667758014e-05, |
| "loss": 42.5779, |
| "step": 1160 |
| }, |
| { |
| "epoch": 3.9769949292767546, |
| "grad_norm": 5.07602596282959, |
| "learning_rate": 1.3220127483330713e-05, |
| "loss": 43.8678, |
| "step": 1162 |
| }, |
| { |
| "epoch": 3.983827061649319, |
| "grad_norm": 5.183884620666504, |
| "learning_rate": 1.3139467229135999e-05, |
| "loss": 44.2575, |
| "step": 1164 |
| }, |
| { |
| "epoch": 3.9906591940218843, |
| "grad_norm": 5.44564962387085, |
| "learning_rate": 1.3058965986568648e-05, |
| "loss": 42.0898, |
| "step": 1166 |
| }, |
| { |
| "epoch": 3.997491326394449, |
| "grad_norm": 3.4175875186920166, |
| "learning_rate": 1.2978624834891628e-05, |
| "loss": 43.526, |
| "step": 1168 |
| }, |
| { |
| "epoch": 4.006832132372565, |
| "grad_norm": 5.1483588218688965, |
| "learning_rate": 1.2898444851221565e-05, |
| "loss": 60.1634, |
| "step": 1170 |
| }, |
| { |
| "epoch": 4.013664264745129, |
| "grad_norm": 4.452287673950195, |
| "learning_rate": 1.281842711051438e-05, |
| "loss": 41.7569, |
| "step": 1172 |
| }, |
| { |
| "epoch": 4.020496397117694, |
| "grad_norm": 4.024214267730713, |
| "learning_rate": 1.2738572685550799e-05, |
| "loss": 44.7667, |
| "step": 1174 |
| }, |
| { |
| "epoch": 4.0273285294902585, |
| "grad_norm": 5.533107757568359, |
| "learning_rate": 1.2658882646922034e-05, |
| "loss": 43.7144, |
| "step": 1176 |
| }, |
| { |
| "epoch": 4.034160661862823, |
| "grad_norm": 4.520675182342529, |
| "learning_rate": 1.2579358063015418e-05, |
| "loss": 43.3862, |
| "step": 1178 |
| }, |
| { |
| "epoch": 4.040992794235389, |
| "grad_norm": 4.086079120635986, |
| "learning_rate": 1.2500000000000006e-05, |
| "loss": 44.268, |
| "step": 1180 |
| }, |
| { |
| "epoch": 4.047824926607953, |
| "grad_norm": 3.335569381713867, |
| "learning_rate": 1.2420809521812404e-05, |
| "loss": 43.1871, |
| "step": 1182 |
| }, |
| { |
| "epoch": 4.054657058980518, |
| "grad_norm": 4.651849746704102, |
| "learning_rate": 1.2341787690142437e-05, |
| "loss": 43.4785, |
| "step": 1184 |
| }, |
| { |
| "epoch": 4.061489191353083, |
| "grad_norm": 3.9412457942962646, |
| "learning_rate": 1.2262935564418886e-05, |
| "loss": 42.1075, |
| "step": 1186 |
| }, |
| { |
| "epoch": 4.068321323725647, |
| "grad_norm": 5.621413230895996, |
| "learning_rate": 1.2184254201795365e-05, |
| "loss": 44.5849, |
| "step": 1188 |
| }, |
| { |
| "epoch": 4.075153456098212, |
| "grad_norm": 4.291881084442139, |
| "learning_rate": 1.2105744657136064e-05, |
| "loss": 42.9562, |
| "step": 1190 |
| }, |
| { |
| "epoch": 4.0819855884707765, |
| "grad_norm": 3.730132818222046, |
| "learning_rate": 1.2027407983001681e-05, |
| "loss": 44.0838, |
| "step": 1192 |
| }, |
| { |
| "epoch": 4.088817720843341, |
| "grad_norm": 3.540987968444824, |
| "learning_rate": 1.1949245229635245e-05, |
| "loss": 43.4705, |
| "step": 1194 |
| }, |
| { |
| "epoch": 4.095649853215906, |
| "grad_norm": 3.0649805068969727, |
| "learning_rate": 1.1871257444948098e-05, |
| "loss": 43.0996, |
| "step": 1196 |
| }, |
| { |
| "epoch": 4.10248198558847, |
| "grad_norm": 3.2024762630462646, |
| "learning_rate": 1.1793445674505776e-05, |
| "loss": 42.772, |
| "step": 1198 |
| }, |
| { |
| "epoch": 4.109314117961035, |
| "grad_norm": 3.462251663208008, |
| "learning_rate": 1.1715810961514073e-05, |
| "loss": 43.2502, |
| "step": 1200 |
| }, |
| { |
| "epoch": 4.109314117961035, |
| "eval_loss": 0.7009151577949524, |
| "eval_runtime": 133.1765, |
| "eval_samples_per_second": 29.622, |
| "eval_steps_per_second": 7.411, |
| "step": 1200 |
| }, |
| { |
| "epoch": 4.116146250333601, |
| "grad_norm": 4.633735656738281, |
| "learning_rate": 1.1638354346804971e-05, |
| "loss": 42.8239, |
| "step": 1202 |
| }, |
| { |
| "epoch": 4.122978382706165, |
| "grad_norm": 3.758700132369995, |
| "learning_rate": 1.1561076868822756e-05, |
| "loss": 43.3475, |
| "step": 1204 |
| }, |
| { |
| "epoch": 4.12981051507873, |
| "grad_norm": 4.143715858459473, |
| "learning_rate": 1.148397956361007e-05, |
| "loss": 44.0, |
| "step": 1206 |
| }, |
| { |
| "epoch": 4.1366426474512945, |
| "grad_norm": 5.201571941375732, |
| "learning_rate": 1.1407063464793966e-05, |
| "loss": 42.5036, |
| "step": 1208 |
| }, |
| { |
| "epoch": 4.143474779823859, |
| "grad_norm": 3.4282047748565674, |
| "learning_rate": 1.133032960357216e-05, |
| "loss": 43.0577, |
| "step": 1210 |
| }, |
| { |
| "epoch": 4.150306912196424, |
| "grad_norm": 4.114802837371826, |
| "learning_rate": 1.1253779008699131e-05, |
| "loss": 43.3517, |
| "step": 1212 |
| }, |
| { |
| "epoch": 4.157139044568988, |
| "grad_norm": 3.979163408279419, |
| "learning_rate": 1.1177412706472321e-05, |
| "loss": 42.5044, |
| "step": 1214 |
| }, |
| { |
| "epoch": 4.163971176941553, |
| "grad_norm": 4.363109588623047, |
| "learning_rate": 1.1101231720718442e-05, |
| "loss": 43.8954, |
| "step": 1216 |
| }, |
| { |
| "epoch": 4.170803309314118, |
| "grad_norm": 4.6219401359558105, |
| "learning_rate": 1.1025237072779663e-05, |
| "loss": 43.413, |
| "step": 1218 |
| }, |
| { |
| "epoch": 4.177635441686682, |
| "grad_norm": 4.945540904998779, |
| "learning_rate": 1.09494297815e-05, |
| "loss": 43.9628, |
| "step": 1220 |
| }, |
| { |
| "epoch": 4.184467574059248, |
| "grad_norm": 4.4585747718811035, |
| "learning_rate": 1.0873810863211595e-05, |
| "loss": 42.6454, |
| "step": 1222 |
| }, |
| { |
| "epoch": 4.1912997064318125, |
| "grad_norm": 4.659883499145508, |
| "learning_rate": 1.0798381331721109e-05, |
| "loss": 42.5656, |
| "step": 1224 |
| }, |
| { |
| "epoch": 4.198131838804377, |
| "grad_norm": 4.411434650421143, |
| "learning_rate": 1.0723142198296155e-05, |
| "loss": 41.2252, |
| "step": 1226 |
| }, |
| { |
| "epoch": 4.204963971176942, |
| "grad_norm": 4.985414028167725, |
| "learning_rate": 1.0648094471651724e-05, |
| "loss": 42.05, |
| "step": 1228 |
| }, |
| { |
| "epoch": 4.211796103549506, |
| "grad_norm": 5.09487771987915, |
| "learning_rate": 1.0573239157936619e-05, |
| "loss": 42.9917, |
| "step": 1230 |
| }, |
| { |
| "epoch": 4.218628235922071, |
| "grad_norm": 4.299539089202881, |
| "learning_rate": 1.049857726072005e-05, |
| "loss": 42.7934, |
| "step": 1232 |
| }, |
| { |
| "epoch": 4.225460368294636, |
| "grad_norm": 4.075766086578369, |
| "learning_rate": 1.0424109780978103e-05, |
| "loss": 41.0067, |
| "step": 1234 |
| }, |
| { |
| "epoch": 4.2322925006672, |
| "grad_norm": 4.9132232666015625, |
| "learning_rate": 1.034983771708035e-05, |
| "loss": 43.6556, |
| "step": 1236 |
| }, |
| { |
| "epoch": 4.239124633039765, |
| "grad_norm": 4.45914888381958, |
| "learning_rate": 1.0275762064776492e-05, |
| "loss": 42.588, |
| "step": 1238 |
| }, |
| { |
| "epoch": 4.24595676541233, |
| "grad_norm": 3.7621419429779053, |
| "learning_rate": 1.020188381718295e-05, |
| "loss": 41.7435, |
| "step": 1240 |
| }, |
| { |
| "epoch": 4.252788897784894, |
| "grad_norm": 2.9593658447265625, |
| "learning_rate": 1.0128203964769601e-05, |
| "loss": 43.7138, |
| "step": 1242 |
| }, |
| { |
| "epoch": 4.25962103015746, |
| "grad_norm": 4.333788871765137, |
| "learning_rate": 1.0054723495346482e-05, |
| "loss": 42.7332, |
| "step": 1244 |
| }, |
| { |
| "epoch": 4.266453162530024, |
| "grad_norm": 4.040637493133545, |
| "learning_rate": 9.981443394050525e-06, |
| "loss": 43.0547, |
| "step": 1246 |
| }, |
| { |
| "epoch": 4.273285294902589, |
| "grad_norm": 5.255796432495117, |
| "learning_rate": 9.908364643332399e-06, |
| "loss": 42.1078, |
| "step": 1248 |
| }, |
| { |
| "epoch": 4.280117427275154, |
| "grad_norm": 3.434884786605835, |
| "learning_rate": 9.835488222943285e-06, |
| "loss": 42.6684, |
| "step": 1250 |
| }, |
| { |
| "epoch": 4.280117427275154, |
| "eval_loss": 0.6948874592781067, |
| "eval_runtime": 138.5111, |
| "eval_samples_per_second": 28.481, |
| "eval_steps_per_second": 7.126, |
| "step": 1250 |
| }, |
| { |
| "epoch": 4.286949559647718, |
| "grad_norm": 4.761016368865967, |
| "learning_rate": 9.762815109921761e-06, |
| "loss": 43.8, |
| "step": 1252 |
| }, |
| { |
| "epoch": 4.293781692020283, |
| "grad_norm": 5.999067783355713, |
| "learning_rate": 9.690346278580726e-06, |
| "loss": 42.8654, |
| "step": 1254 |
| }, |
| { |
| "epoch": 4.300613824392848, |
| "grad_norm": 4.777903079986572, |
| "learning_rate": 9.618082700494319e-06, |
| "loss": 42.3409, |
| "step": 1256 |
| }, |
| { |
| "epoch": 4.307445956765412, |
| "grad_norm": 4.543084144592285, |
| "learning_rate": 9.546025344484869e-06, |
| "loss": 43.6205, |
| "step": 1258 |
| }, |
| { |
| "epoch": 4.314278089137977, |
| "grad_norm": 3.6853065490722656, |
| "learning_rate": 9.474175176609956e-06, |
| "loss": 43.9045, |
| "step": 1260 |
| }, |
| { |
| "epoch": 4.3211102215105415, |
| "grad_norm": 4.3578338623046875, |
| "learning_rate": 9.402533160149416e-06, |
| "loss": 41.781, |
| "step": 1262 |
| }, |
| { |
| "epoch": 4.327942353883106, |
| "grad_norm": 4.191073894500732, |
| "learning_rate": 9.331100255592437e-06, |
| "loss": 42.5713, |
| "step": 1264 |
| }, |
| { |
| "epoch": 4.334774486255672, |
| "grad_norm": 5.591835021972656, |
| "learning_rate": 9.259877420624721e-06, |
| "loss": 42.9316, |
| "step": 1266 |
| }, |
| { |
| "epoch": 4.341606618628236, |
| "grad_norm": 4.916292667388916, |
| "learning_rate": 9.18886561011557e-06, |
| "loss": 42.9316, |
| "step": 1268 |
| }, |
| { |
| "epoch": 4.348438751000801, |
| "grad_norm": 3.4310858249664307, |
| "learning_rate": 9.118065776105159e-06, |
| "loss": 42.0445, |
| "step": 1270 |
| }, |
| { |
| "epoch": 4.3552708833733655, |
| "grad_norm": 3.6645348072052, |
| "learning_rate": 9.047478867791732e-06, |
| "loss": 41.5698, |
| "step": 1272 |
| }, |
| { |
| "epoch": 4.36210301574593, |
| "grad_norm": 4.118466854095459, |
| "learning_rate": 8.977105831518864e-06, |
| "loss": 41.7493, |
| "step": 1274 |
| }, |
| { |
| "epoch": 4.368935148118495, |
| "grad_norm": 4.731881141662598, |
| "learning_rate": 8.906947610762825e-06, |
| "loss": 41.2277, |
| "step": 1276 |
| }, |
| { |
| "epoch": 4.3757672804910595, |
| "grad_norm": 4.580758571624756, |
| "learning_rate": 8.837005146119872e-06, |
| "loss": 42.3467, |
| "step": 1278 |
| }, |
| { |
| "epoch": 4.382599412863624, |
| "grad_norm": 5.310960292816162, |
| "learning_rate": 8.767279375293672e-06, |
| "loss": 43.1447, |
| "step": 1280 |
| }, |
| { |
| "epoch": 4.389431545236189, |
| "grad_norm": 4.382359027862549, |
| "learning_rate": 8.697771233082744e-06, |
| "loss": 42.4424, |
| "step": 1282 |
| }, |
| { |
| "epoch": 4.396263677608753, |
| "grad_norm": 3.6488263607025146, |
| "learning_rate": 8.628481651367876e-06, |
| "loss": 43.8516, |
| "step": 1284 |
| }, |
| { |
| "epoch": 4.403095809981318, |
| "grad_norm": 3.2983975410461426, |
| "learning_rate": 8.55941155909968e-06, |
| "loss": 43.3322, |
| "step": 1286 |
| }, |
| { |
| "epoch": 4.4099279423538835, |
| "grad_norm": 3.5116684436798096, |
| "learning_rate": 8.490561882286136e-06, |
| "loss": 41.4651, |
| "step": 1288 |
| }, |
| { |
| "epoch": 4.416760074726448, |
| "grad_norm": 3.5123932361602783, |
| "learning_rate": 8.421933543980126e-06, |
| "loss": 43.1034, |
| "step": 1290 |
| }, |
| { |
| "epoch": 4.423592207099013, |
| "grad_norm": 4.123583793640137, |
| "learning_rate": 8.353527464267104e-06, |
| "loss": 43.566, |
| "step": 1292 |
| }, |
| { |
| "epoch": 4.430424339471577, |
| "grad_norm": 3.6427931785583496, |
| "learning_rate": 8.285344560252777e-06, |
| "loss": 42.0333, |
| "step": 1294 |
| }, |
| { |
| "epoch": 4.437256471844142, |
| "grad_norm": 3.8917388916015625, |
| "learning_rate": 8.217385746050742e-06, |
| "loss": 42.0382, |
| "step": 1296 |
| }, |
| { |
| "epoch": 4.444088604216707, |
| "grad_norm": 4.964122772216797, |
| "learning_rate": 8.149651932770308e-06, |
| "loss": 43.6584, |
| "step": 1298 |
| }, |
| { |
| "epoch": 4.450920736589271, |
| "grad_norm": 4.227240085601807, |
| "learning_rate": 8.082144028504233e-06, |
| "loss": 42.4086, |
| "step": 1300 |
| }, |
| { |
| "epoch": 4.450920736589271, |
| "eval_loss": 0.6897044777870178, |
| "eval_runtime": 131.8148, |
| "eval_samples_per_second": 29.928, |
| "eval_steps_per_second": 7.488, |
| "step": 1300 |
| }, |
| { |
| "epoch": 4.457752868961836, |
| "grad_norm": 4.605757713317871, |
| "learning_rate": 8.014862938316542e-06, |
| "loss": 43.7962, |
| "step": 1302 |
| }, |
| { |
| "epoch": 4.464585001334401, |
| "grad_norm": 4.2398176193237305, |
| "learning_rate": 7.947809564230445e-06, |
| "loss": 42.3544, |
| "step": 1304 |
| }, |
| { |
| "epoch": 4.471417133706965, |
| "grad_norm": 5.234216213226318, |
| "learning_rate": 7.880984805216185e-06, |
| "loss": 41.9833, |
| "step": 1306 |
| }, |
| { |
| "epoch": 4.47824926607953, |
| "grad_norm": 3.9220240116119385, |
| "learning_rate": 7.814389557179017e-06, |
| "loss": 42.0345, |
| "step": 1308 |
| }, |
| { |
| "epoch": 4.485081398452095, |
| "grad_norm": 5.44996976852417, |
| "learning_rate": 7.748024712947205e-06, |
| "loss": 42.0309, |
| "step": 1310 |
| }, |
| { |
| "epoch": 4.49191353082466, |
| "grad_norm": 5.07472038269043, |
| "learning_rate": 7.681891162260015e-06, |
| "loss": 42.6996, |
| "step": 1312 |
| }, |
| { |
| "epoch": 4.498745663197225, |
| "grad_norm": 3.818120241165161, |
| "learning_rate": 7.615989791755834e-06, |
| "loss": 42.8775, |
| "step": 1314 |
| }, |
| { |
| "epoch": 4.505577795569789, |
| "grad_norm": 4.252802848815918, |
| "learning_rate": 7.5503214849602516e-06, |
| "loss": 42.4118, |
| "step": 1316 |
| }, |
| { |
| "epoch": 4.512409927942354, |
| "grad_norm": 4.17697286605835, |
| "learning_rate": 7.484887122274215e-06, |
| "loss": 41.2153, |
| "step": 1318 |
| }, |
| { |
| "epoch": 4.519242060314919, |
| "grad_norm": 3.7324466705322266, |
| "learning_rate": 7.419687580962223e-06, |
| "loss": 42.3343, |
| "step": 1320 |
| }, |
| { |
| "epoch": 4.526074192687483, |
| "grad_norm": 3.870089054107666, |
| "learning_rate": 7.354723735140609e-06, |
| "loss": 42.0028, |
| "step": 1322 |
| }, |
| { |
| "epoch": 4.532906325060048, |
| "grad_norm": 3.6424801349639893, |
| "learning_rate": 7.289996455765749e-06, |
| "loss": 43.5842, |
| "step": 1324 |
| }, |
| { |
| "epoch": 4.5397384574326125, |
| "grad_norm": 4.695961952209473, |
| "learning_rate": 7.225506610622456e-06, |
| "loss": 42.0951, |
| "step": 1326 |
| }, |
| { |
| "epoch": 4.546570589805177, |
| "grad_norm": 4.842666149139404, |
| "learning_rate": 7.161255064312283e-06, |
| "loss": 43.8668, |
| "step": 1328 |
| }, |
| { |
| "epoch": 4.553402722177742, |
| "grad_norm": 4.4085822105407715, |
| "learning_rate": 7.0972426782419884e-06, |
| "loss": 43.7836, |
| "step": 1330 |
| }, |
| { |
| "epoch": 4.560234854550307, |
| "grad_norm": 3.606607437133789, |
| "learning_rate": 7.033470310611945e-06, |
| "loss": 41.4304, |
| "step": 1332 |
| }, |
| { |
| "epoch": 4.567066986922872, |
| "grad_norm": 4.789222717285156, |
| "learning_rate": 6.969938816404639e-06, |
| "loss": 41.6355, |
| "step": 1334 |
| }, |
| { |
| "epoch": 4.573899119295437, |
| "grad_norm": 4.463109493255615, |
| "learning_rate": 6.906649047373246e-06, |
| "loss": 43.4969, |
| "step": 1336 |
| }, |
| { |
| "epoch": 4.580731251668001, |
| "grad_norm": 4.483322620391846, |
| "learning_rate": 6.843601852030171e-06, |
| "loss": 42.4094, |
| "step": 1338 |
| }, |
| { |
| "epoch": 4.587563384040566, |
| "grad_norm": 4.021024703979492, |
| "learning_rate": 6.780798075635675e-06, |
| "loss": 42.2893, |
| "step": 1340 |
| }, |
| { |
| "epoch": 4.5943955164131305, |
| "grad_norm": 3.9479868412017822, |
| "learning_rate": 6.718238560186571e-06, |
| "loss": 40.8073, |
| "step": 1342 |
| }, |
| { |
| "epoch": 4.601227648785695, |
| "grad_norm": 4.778145790100098, |
| "learning_rate": 6.655924144404907e-06, |
| "loss": 42.0845, |
| "step": 1344 |
| }, |
| { |
| "epoch": 4.60805978115826, |
| "grad_norm": 3.555271863937378, |
| "learning_rate": 6.593855663726722e-06, |
| "loss": 41.1015, |
| "step": 1346 |
| }, |
| { |
| "epoch": 4.614891913530824, |
| "grad_norm": 4.007204532623291, |
| "learning_rate": 6.532033950290886e-06, |
| "loss": 42.9137, |
| "step": 1348 |
| }, |
| { |
| "epoch": 4.621724045903389, |
| "grad_norm": 4.328546524047852, |
| "learning_rate": 6.470459832927881e-06, |
| "loss": 41.274, |
| "step": 1350 |
| }, |
| { |
| "epoch": 4.621724045903389, |
| "eval_loss": 0.6830974221229553, |
| "eval_runtime": 135.2812, |
| "eval_samples_per_second": 29.161, |
| "eval_steps_per_second": 7.296, |
| "step": 1350 |
| }, |
| { |
| "epoch": 4.628556178275954, |
| "grad_norm": 4.948083877563477, |
| "learning_rate": 6.409134137148737e-06, |
| "loss": 43.0462, |
| "step": 1352 |
| }, |
| { |
| "epoch": 4.635388310648519, |
| "grad_norm": 4.637773036956787, |
| "learning_rate": 6.3480576851339625e-06, |
| "loss": 42.6268, |
| "step": 1354 |
| }, |
| { |
| "epoch": 4.642220443021084, |
| "grad_norm": 3.72841215133667, |
| "learning_rate": 6.28723129572247e-06, |
| "loss": 41.0574, |
| "step": 1356 |
| }, |
| { |
| "epoch": 4.6490525753936485, |
| "grad_norm": 4.539714813232422, |
| "learning_rate": 6.226655784400684e-06, |
| "loss": 43.5752, |
| "step": 1358 |
| }, |
| { |
| "epoch": 4.655884707766213, |
| "grad_norm": 5.519583225250244, |
| "learning_rate": 6.166331963291519e-06, |
| "loss": 43.3111, |
| "step": 1360 |
| }, |
| { |
| "epoch": 4.662716840138778, |
| "grad_norm": 4.942199230194092, |
| "learning_rate": 6.106260641143546e-06, |
| "loss": 43.6514, |
| "step": 1362 |
| }, |
| { |
| "epoch": 4.669548972511342, |
| "grad_norm": 5.164299011230469, |
| "learning_rate": 6.046442623320145e-06, |
| "loss": 40.8611, |
| "step": 1364 |
| }, |
| { |
| "epoch": 4.676381104883907, |
| "grad_norm": 4.309698581695557, |
| "learning_rate": 5.986878711788702e-06, |
| "loss": 41.3937, |
| "step": 1366 |
| }, |
| { |
| "epoch": 4.683213237256472, |
| "grad_norm": 4.105101585388184, |
| "learning_rate": 5.927569705109828e-06, |
| "loss": 40.3001, |
| "step": 1368 |
| }, |
| { |
| "epoch": 4.690045369629036, |
| "grad_norm": 3.571514368057251, |
| "learning_rate": 5.868516398426716e-06, |
| "loss": 41.6858, |
| "step": 1370 |
| }, |
| { |
| "epoch": 4.696877502001601, |
| "grad_norm": 5.120858192443848, |
| "learning_rate": 5.809719583454415e-06, |
| "loss": 41.4156, |
| "step": 1372 |
| }, |
| { |
| "epoch": 4.703709634374166, |
| "grad_norm": 4.679799556732178, |
| "learning_rate": 5.751180048469243e-06, |
| "loss": 43.1858, |
| "step": 1374 |
| }, |
| { |
| "epoch": 4.710541766746731, |
| "grad_norm": 3.0465521812438965, |
| "learning_rate": 5.692898578298253e-06, |
| "loss": 41.213, |
| "step": 1376 |
| }, |
| { |
| "epoch": 4.717373899119296, |
| "grad_norm": 4.835347652435303, |
| "learning_rate": 5.634875954308638e-06, |
| "loss": 44.0938, |
| "step": 1378 |
| }, |
| { |
| "epoch": 4.72420603149186, |
| "grad_norm": 6.645193099975586, |
| "learning_rate": 5.577112954397321e-06, |
| "loss": 41.7528, |
| "step": 1380 |
| }, |
| { |
| "epoch": 4.731038163864425, |
| "grad_norm": 4.592052936553955, |
| "learning_rate": 5.519610352980501e-06, |
| "loss": 42.566, |
| "step": 1382 |
| }, |
| { |
| "epoch": 4.73787029623699, |
| "grad_norm": 3.7620317935943604, |
| "learning_rate": 5.462368920983249e-06, |
| "loss": 41.7184, |
| "step": 1384 |
| }, |
| { |
| "epoch": 4.744702428609554, |
| "grad_norm": 4.0445027351379395, |
| "learning_rate": 5.405389425829219e-06, |
| "loss": 41.6249, |
| "step": 1386 |
| }, |
| { |
| "epoch": 4.751534560982119, |
| "grad_norm": 3.744433641433716, |
| "learning_rate": 5.348672631430318e-06, |
| "loss": 43.0626, |
| "step": 1388 |
| }, |
| { |
| "epoch": 4.7583666933546835, |
| "grad_norm": 3.12141489982605, |
| "learning_rate": 5.292219298176476e-06, |
| "loss": 42.1533, |
| "step": 1390 |
| }, |
| { |
| "epoch": 4.765198825727248, |
| "grad_norm": 6.73304557800293, |
| "learning_rate": 5.236030182925475e-06, |
| "loss": 41.6015, |
| "step": 1392 |
| }, |
| { |
| "epoch": 4.772030958099813, |
| "grad_norm": 4.076465129852295, |
| "learning_rate": 5.1801060389927606e-06, |
| "loss": 43.2645, |
| "step": 1394 |
| }, |
| { |
| "epoch": 4.7788630904723775, |
| "grad_norm": 4.178272247314453, |
| "learning_rate": 5.124447616141381e-06, |
| "loss": 43.0354, |
| "step": 1396 |
| }, |
| { |
| "epoch": 4.785695222844943, |
| "grad_norm": 4.555927276611328, |
| "learning_rate": 5.06905566057192e-06, |
| "loss": 42.1086, |
| "step": 1398 |
| }, |
| { |
| "epoch": 4.792527355217508, |
| "grad_norm": 4.799075126647949, |
| "learning_rate": 5.013930914912476e-06, |
| "loss": 40.7555, |
| "step": 1400 |
| }, |
| { |
| "epoch": 4.792527355217508, |
| "eval_loss": 0.6814665198326111, |
| "eval_runtime": 134.9461, |
| "eval_samples_per_second": 29.234, |
| "eval_steps_per_second": 7.314, |
| "step": 1400 |
| }, |
| { |
| "epoch": 4.799359487590072, |
| "grad_norm": 3.7408673763275146, |
| "learning_rate": 4.959074118208726e-06, |
| "loss": 40.9295, |
| "step": 1402 |
| }, |
| { |
| "epoch": 4.806191619962637, |
| "grad_norm": 3.9520747661590576, |
| "learning_rate": 4.9044860059140275e-06, |
| "loss": 43.4186, |
| "step": 1404 |
| }, |
| { |
| "epoch": 4.8130237523352015, |
| "grad_norm": 4.115049839019775, |
| "learning_rate": 4.850167309879519e-06, |
| "loss": 42.2491, |
| "step": 1406 |
| }, |
| { |
| "epoch": 4.819855884707766, |
| "grad_norm": 5.181631088256836, |
| "learning_rate": 4.796118758344354e-06, |
| "loss": 41.583, |
| "step": 1408 |
| }, |
| { |
| "epoch": 4.826688017080331, |
| "grad_norm": 3.838186740875244, |
| "learning_rate": 4.742341075925916e-06, |
| "loss": 43.3278, |
| "step": 1410 |
| }, |
| { |
| "epoch": 4.833520149452895, |
| "grad_norm": 3.6494245529174805, |
| "learning_rate": 4.6888349836100825e-06, |
| "loss": 41.3961, |
| "step": 1412 |
| }, |
| { |
| "epoch": 4.84035228182546, |
| "grad_norm": 4.139842510223389, |
| "learning_rate": 4.6356011987416075e-06, |
| "loss": 43.4135, |
| "step": 1414 |
| }, |
| { |
| "epoch": 4.847184414198025, |
| "grad_norm": 4.385437965393066, |
| "learning_rate": 4.58264043501446e-06, |
| "loss": 42.1478, |
| "step": 1416 |
| }, |
| { |
| "epoch": 4.854016546570589, |
| "grad_norm": 3.691343307495117, |
| "learning_rate": 4.52995340246227e-06, |
| "loss": 42.4175, |
| "step": 1418 |
| }, |
| { |
| "epoch": 4.860848678943155, |
| "grad_norm": 4.149899482727051, |
| "learning_rate": 4.477540807448832e-06, |
| "loss": 42.4116, |
| "step": 1420 |
| }, |
| { |
| "epoch": 4.8676808113157195, |
| "grad_norm": 3.8960561752319336, |
| "learning_rate": 4.425403352658591e-06, |
| "loss": 41.2306, |
| "step": 1422 |
| }, |
| { |
| "epoch": 4.874512943688284, |
| "grad_norm": 3.6276168823242188, |
| "learning_rate": 4.373541737087264e-06, |
| "loss": 42.7317, |
| "step": 1424 |
| }, |
| { |
| "epoch": 4.881345076060849, |
| "grad_norm": 4.214303016662598, |
| "learning_rate": 4.32195665603245e-06, |
| "loss": 41.6166, |
| "step": 1426 |
| }, |
| { |
| "epoch": 4.888177208433413, |
| "grad_norm": 4.3136210441589355, |
| "learning_rate": 4.270648801084296e-06, |
| "loss": 42.3309, |
| "step": 1428 |
| }, |
| { |
| "epoch": 4.895009340805978, |
| "grad_norm": 5.340824604034424, |
| "learning_rate": 4.219618860116242e-06, |
| "loss": 40.6249, |
| "step": 1430 |
| }, |
| { |
| "epoch": 4.901841473178543, |
| "grad_norm": 3.750943183898926, |
| "learning_rate": 4.1688675172758064e-06, |
| "loss": 42.0754, |
| "step": 1432 |
| }, |
| { |
| "epoch": 4.908673605551107, |
| "grad_norm": 3.8021140098571777, |
| "learning_rate": 4.118395452975382e-06, |
| "loss": 42.8221, |
| "step": 1434 |
| }, |
| { |
| "epoch": 4.915505737923672, |
| "grad_norm": 5.09911584854126, |
| "learning_rate": 4.068203343883159e-06, |
| "loss": 42.3164, |
| "step": 1436 |
| }, |
| { |
| "epoch": 4.9223378702962375, |
| "grad_norm": 3.590981960296631, |
| "learning_rate": 4.018291862914001e-06, |
| "loss": 41.0773, |
| "step": 1438 |
| }, |
| { |
| "epoch": 4.929170002668801, |
| "grad_norm": 4.474262714385986, |
| "learning_rate": 3.968661679220468e-06, |
| "loss": 41.1827, |
| "step": 1440 |
| }, |
| { |
| "epoch": 4.936002135041367, |
| "grad_norm": 3.780853748321533, |
| "learning_rate": 3.919313458183838e-06, |
| "loss": 41.9009, |
| "step": 1442 |
| }, |
| { |
| "epoch": 4.942834267413931, |
| "grad_norm": 4.165524482727051, |
| "learning_rate": 3.8702478614051355e-06, |
| "loss": 41.6988, |
| "step": 1444 |
| }, |
| { |
| "epoch": 4.949666399786496, |
| "grad_norm": 4.537020683288574, |
| "learning_rate": 3.821465546696337e-06, |
| "loss": 42.6527, |
| "step": 1446 |
| }, |
| { |
| "epoch": 4.956498532159061, |
| "grad_norm": 5.992898941040039, |
| "learning_rate": 3.772967168071517e-06, |
| "loss": 42.3257, |
| "step": 1448 |
| }, |
| { |
| "epoch": 4.963330664531625, |
| "grad_norm": 5.681396007537842, |
| "learning_rate": 3.7247533757380603e-06, |
| "loss": 42.5366, |
| "step": 1450 |
| }, |
| { |
| "epoch": 4.963330664531625, |
| "eval_loss": 0.6770752668380737, |
| "eval_runtime": 133.8871, |
| "eval_samples_per_second": 29.465, |
| "eval_steps_per_second": 7.372, |
| "step": 1450 |
| }, |
| { |
| "epoch": 4.97016279690419, |
| "grad_norm": 4.46541166305542, |
| "learning_rate": 3.6768248160879787e-06, |
| "loss": 41.0476, |
| "step": 1452 |
| }, |
| { |
| "epoch": 4.976994929276755, |
| "grad_norm": 4.15000057220459, |
| "learning_rate": 3.6291821316892184e-06, |
| "loss": 40.7134, |
| "step": 1454 |
| }, |
| { |
| "epoch": 4.983827061649319, |
| "grad_norm": 4.230960369110107, |
| "learning_rate": 3.5818259612770744e-06, |
| "loss": 43.5967, |
| "step": 1456 |
| }, |
| { |
| "epoch": 4.990659194021884, |
| "grad_norm": 4.932849884033203, |
| "learning_rate": 3.53475693974559e-06, |
| "loss": 43.2516, |
| "step": 1458 |
| }, |
| { |
| "epoch": 4.997491326394449, |
| "grad_norm": 4.316704273223877, |
| "learning_rate": 3.487975698139084e-06, |
| "loss": 42.3811, |
| "step": 1460 |
| }, |
| { |
| "epoch": 5.003416066186283, |
| "grad_norm": 4.146729469299316, |
| "learning_rate": 3.4414828636436525e-06, |
| "loss": 36.1288, |
| "step": 1462 |
| }, |
| { |
| "epoch": 5.010248198558847, |
| "grad_norm": 5.610274791717529, |
| "learning_rate": 3.3952790595787987e-06, |
| "loss": 40.6556, |
| "step": 1464 |
| }, |
| { |
| "epoch": 5.017080330931412, |
| "grad_norm": 6.292807102203369, |
| "learning_rate": 3.3493649053890326e-06, |
| "loss": 42.2675, |
| "step": 1466 |
| }, |
| { |
| "epoch": 5.023912463303977, |
| "grad_norm": 4.371929168701172, |
| "learning_rate": 3.3037410166356143e-06, |
| "loss": 41.1544, |
| "step": 1468 |
| }, |
| { |
| "epoch": 5.030744595676541, |
| "grad_norm": 3.275562047958374, |
| "learning_rate": 3.258408004988278e-06, |
| "loss": 42.7401, |
| "step": 1470 |
| }, |
| { |
| "epoch": 5.037576728049106, |
| "grad_norm": 5.2857666015625, |
| "learning_rate": 3.2133664782169948e-06, |
| "loss": 39.4961, |
| "step": 1472 |
| }, |
| { |
| "epoch": 5.044408860421671, |
| "grad_norm": 3.9162814617156982, |
| "learning_rate": 3.168617040183897e-06, |
| "loss": 42.7691, |
| "step": 1474 |
| }, |
| { |
| "epoch": 5.051240992794235, |
| "grad_norm": 4.741237640380859, |
| "learning_rate": 3.1241602908351404e-06, |
| "loss": 39.9539, |
| "step": 1476 |
| }, |
| { |
| "epoch": 5.0580731251668, |
| "grad_norm": 4.904325008392334, |
| "learning_rate": 3.079996826192849e-06, |
| "loss": 40.999, |
| "step": 1478 |
| }, |
| { |
| "epoch": 5.0649052575393645, |
| "grad_norm": 3.9396679401397705, |
| "learning_rate": 3.036127238347164e-06, |
| "loss": 41.8233, |
| "step": 1480 |
| }, |
| { |
| "epoch": 5.071737389911929, |
| "grad_norm": 3.5699760913848877, |
| "learning_rate": 2.992552115448258e-06, |
| "loss": 41.4895, |
| "step": 1482 |
| }, |
| { |
| "epoch": 5.078569522284495, |
| "grad_norm": 4.227250099182129, |
| "learning_rate": 2.9492720416985e-06, |
| "loss": 41.7825, |
| "step": 1484 |
| }, |
| { |
| "epoch": 5.085401654657059, |
| "grad_norm": 3.8788514137268066, |
| "learning_rate": 2.9062875973445813e-06, |
| "loss": 41.4301, |
| "step": 1486 |
| }, |
| { |
| "epoch": 5.092233787029624, |
| "grad_norm": 3.7242729663848877, |
| "learning_rate": 2.8635993586697553e-06, |
| "loss": 40.2917, |
| "step": 1488 |
| }, |
| { |
| "epoch": 5.099065919402189, |
| "grad_norm": 5.645269870758057, |
| "learning_rate": 2.821207897986114e-06, |
| "loss": 41.1435, |
| "step": 1490 |
| }, |
| { |
| "epoch": 5.105898051774753, |
| "grad_norm": 3.9231839179992676, |
| "learning_rate": 2.779113783626916e-06, |
| "loss": 41.5506, |
| "step": 1492 |
| }, |
| { |
| "epoch": 5.112730184147318, |
| "grad_norm": 4.276205062866211, |
| "learning_rate": 2.7373175799389415e-06, |
| "loss": 40.4141, |
| "step": 1494 |
| }, |
| { |
| "epoch": 5.1195623165198825, |
| "grad_norm": 6.223433971405029, |
| "learning_rate": 2.6958198472749717e-06, |
| "loss": 42.1149, |
| "step": 1496 |
| }, |
| { |
| "epoch": 5.126394448892447, |
| "grad_norm": 4.167882442474365, |
| "learning_rate": 2.65462114198623e-06, |
| "loss": 40.7711, |
| "step": 1498 |
| }, |
| { |
| "epoch": 5.133226581265012, |
| "grad_norm": 3.588376998901367, |
| "learning_rate": 2.6137220164149435e-06, |
| "loss": 42.5513, |
| "step": 1500 |
| }, |
| { |
| "epoch": 5.133226581265012, |
| "eval_loss": 0.6761642694473267, |
| "eval_runtime": 137.9512, |
| "eval_samples_per_second": 28.597, |
| "eval_steps_per_second": 7.155, |
| "step": 1500 |
| }, |
| { |
| "epoch": 5.140058713637576, |
| "grad_norm": 4.149092674255371, |
| "learning_rate": 2.573123018886961e-06, |
| "loss": 40.5633, |
| "step": 1502 |
| }, |
| { |
| "epoch": 5.146890846010141, |
| "grad_norm": 3.9322760105133057, |
| "learning_rate": 2.5328246937043526e-06, |
| "loss": 41.3711, |
| "step": 1504 |
| }, |
| { |
| "epoch": 5.1537229783827065, |
| "grad_norm": 4.557422161102295, |
| "learning_rate": 2.492827581138149e-06, |
| "loss": 39.5696, |
| "step": 1506 |
| }, |
| { |
| "epoch": 5.160555110755271, |
| "grad_norm": 3.772927761077881, |
| "learning_rate": 2.4531322174210975e-06, |
| "loss": 42.9544, |
| "step": 1508 |
| }, |
| { |
| "epoch": 5.167387243127836, |
| "grad_norm": 4.051291465759277, |
| "learning_rate": 2.4137391347404476e-06, |
| "loss": 40.978, |
| "step": 1510 |
| }, |
| { |
| "epoch": 5.1742193755004005, |
| "grad_norm": 3.6557424068450928, |
| "learning_rate": 2.37464886123083e-06, |
| "loss": 41.606, |
| "step": 1512 |
| }, |
| { |
| "epoch": 5.181051507872965, |
| "grad_norm": 4.801413536071777, |
| "learning_rate": 2.3358619209672e-06, |
| "loss": 41.5917, |
| "step": 1514 |
| }, |
| { |
| "epoch": 5.18788364024553, |
| "grad_norm": 4.2001423835754395, |
| "learning_rate": 2.2973788339577613e-06, |
| "loss": 43.0596, |
| "step": 1516 |
| }, |
| { |
| "epoch": 5.194715772618094, |
| "grad_norm": 5.291867256164551, |
| "learning_rate": 2.2592001161370392e-06, |
| "loss": 40.3588, |
| "step": 1518 |
| }, |
| { |
| "epoch": 5.201547904990659, |
| "grad_norm": 3.7930984497070312, |
| "learning_rate": 2.2213262793589484e-06, |
| "loss": 42.0758, |
| "step": 1520 |
| }, |
| { |
| "epoch": 5.208380037363224, |
| "grad_norm": 4.888052940368652, |
| "learning_rate": 2.1837578313899098e-06, |
| "loss": 39.7415, |
| "step": 1522 |
| }, |
| { |
| "epoch": 5.215212169735788, |
| "grad_norm": 4.963688850402832, |
| "learning_rate": 2.1464952759020855e-06, |
| "loss": 42.05, |
| "step": 1524 |
| }, |
| { |
| "epoch": 5.222044302108353, |
| "grad_norm": 4.556923866271973, |
| "learning_rate": 2.109539112466588e-06, |
| "loss": 40.5828, |
| "step": 1526 |
| }, |
| { |
| "epoch": 5.228876434480918, |
| "grad_norm": 3.550285577774048, |
| "learning_rate": 2.0728898365467903e-06, |
| "loss": 41.4201, |
| "step": 1528 |
| }, |
| { |
| "epoch": 5.235708566853483, |
| "grad_norm": 4.290851593017578, |
| "learning_rate": 2.0365479394917147e-06, |
| "loss": 41.1988, |
| "step": 1530 |
| }, |
| { |
| "epoch": 5.242540699226048, |
| "grad_norm": 4.436618804931641, |
| "learning_rate": 2.0005139085293945e-06, |
| "loss": 41.1016, |
| "step": 1532 |
| }, |
| { |
| "epoch": 5.249372831598612, |
| "grad_norm": 6.221188068389893, |
| "learning_rate": 1.9647882267603862e-06, |
| "loss": 42.1538, |
| "step": 1534 |
| }, |
| { |
| "epoch": 5.256204963971177, |
| "grad_norm": 4.712629795074463, |
| "learning_rate": 1.9293713731512673e-06, |
| "loss": 41.1176, |
| "step": 1536 |
| }, |
| { |
| "epoch": 5.263037096343742, |
| "grad_norm": 4.693170070648193, |
| "learning_rate": 1.894263822528225e-06, |
| "loss": 41.3687, |
| "step": 1538 |
| }, |
| { |
| "epoch": 5.269869228716306, |
| "grad_norm": 4.854535102844238, |
| "learning_rate": 1.8594660455706763e-06, |
| "loss": 41.6856, |
| "step": 1540 |
| }, |
| { |
| "epoch": 5.276701361088871, |
| "grad_norm": 3.5167202949523926, |
| "learning_rate": 1.8249785088049893e-06, |
| "loss": 42.5848, |
| "step": 1542 |
| }, |
| { |
| "epoch": 5.2835334934614355, |
| "grad_norm": 4.029543399810791, |
| "learning_rate": 1.790801674598186e-06, |
| "loss": 41.8932, |
| "step": 1544 |
| }, |
| { |
| "epoch": 5.290365625834, |
| "grad_norm": 4.217826843261719, |
| "learning_rate": 1.7569360011517848e-06, |
| "loss": 41.478, |
| "step": 1546 |
| }, |
| { |
| "epoch": 5.297197758206565, |
| "grad_norm": 3.8237998485565186, |
| "learning_rate": 1.7233819424956248e-06, |
| "loss": 42.5394, |
| "step": 1548 |
| }, |
| { |
| "epoch": 5.30402989057913, |
| "grad_norm": 5.044140338897705, |
| "learning_rate": 1.6901399484818004e-06, |
| "loss": 41.0466, |
| "step": 1550 |
| }, |
| { |
| "epoch": 5.30402989057913, |
| "eval_loss": 0.6723917722702026, |
| "eval_runtime": 132.3674, |
| "eval_samples_per_second": 29.803, |
| "eval_steps_per_second": 7.457, |
| "step": 1550 |
| }, |
| { |
| "epoch": 5.310862022951695, |
| "grad_norm": 4.023882865905762, |
| "learning_rate": 1.6572104647786247e-06, |
| "loss": 40.4515, |
| "step": 1552 |
| }, |
| { |
| "epoch": 5.31769415532426, |
| "grad_norm": 5.667575836181641, |
| "learning_rate": 1.624593932864632e-06, |
| "loss": 42.2196, |
| "step": 1554 |
| }, |
| { |
| "epoch": 5.324526287696824, |
| "grad_norm": 3.771815299987793, |
| "learning_rate": 1.5922907900227018e-06, |
| "loss": 41.1018, |
| "step": 1556 |
| }, |
| { |
| "epoch": 5.331358420069389, |
| "grad_norm": 4.044847011566162, |
| "learning_rate": 1.5603014693341662e-06, |
| "loss": 40.8528, |
| "step": 1558 |
| }, |
| { |
| "epoch": 5.3381905524419535, |
| "grad_norm": 4.64625358581543, |
| "learning_rate": 1.5286263996730026e-06, |
| "loss": 41.612, |
| "step": 1560 |
| }, |
| { |
| "epoch": 5.345022684814518, |
| "grad_norm": 5.102336406707764, |
| "learning_rate": 1.497266005700107e-06, |
| "loss": 40.965, |
| "step": 1562 |
| }, |
| { |
| "epoch": 5.351854817187083, |
| "grad_norm": 3.1535797119140625, |
| "learning_rate": 1.4662207078575684e-06, |
| "loss": 40.5264, |
| "step": 1564 |
| }, |
| { |
| "epoch": 5.358686949559647, |
| "grad_norm": 3.740694522857666, |
| "learning_rate": 1.4354909223630669e-06, |
| "loss": 41.5863, |
| "step": 1566 |
| }, |
| { |
| "epoch": 5.365519081932212, |
| "grad_norm": 4.79527473449707, |
| "learning_rate": 1.40507706120426e-06, |
| "loss": 41.3632, |
| "step": 1568 |
| }, |
| { |
| "epoch": 5.372351214304777, |
| "grad_norm": 4.936699867248535, |
| "learning_rate": 1.3749795321332887e-06, |
| "loss": 41.898, |
| "step": 1570 |
| }, |
| { |
| "epoch": 5.379183346677342, |
| "grad_norm": 6.228104114532471, |
| "learning_rate": 1.3451987386612851e-06, |
| "loss": 41.3327, |
| "step": 1572 |
| }, |
| { |
| "epoch": 5.386015479049907, |
| "grad_norm": 3.9607808589935303, |
| "learning_rate": 1.3157350800529878e-06, |
| "loss": 39.3806, |
| "step": 1574 |
| }, |
| { |
| "epoch": 5.3928476114224715, |
| "grad_norm": 3.2485790252685547, |
| "learning_rate": 1.286588951321363e-06, |
| "loss": 39.292, |
| "step": 1576 |
| }, |
| { |
| "epoch": 5.399679743795036, |
| "grad_norm": 4.702234745025635, |
| "learning_rate": 1.2577607432223276e-06, |
| "loss": 40.3127, |
| "step": 1578 |
| }, |
| { |
| "epoch": 5.406511876167601, |
| "grad_norm": 4.465649127960205, |
| "learning_rate": 1.2292508422495158e-06, |
| "loss": 41.7889, |
| "step": 1580 |
| }, |
| { |
| "epoch": 5.413344008540165, |
| "grad_norm": 4.618641376495361, |
| "learning_rate": 1.2010596306290589e-06, |
| "loss": 41.2257, |
| "step": 1582 |
| }, |
| { |
| "epoch": 5.42017614091273, |
| "grad_norm": 4.093713283538818, |
| "learning_rate": 1.1731874863145143e-06, |
| "loss": 41.7067, |
| "step": 1584 |
| }, |
| { |
| "epoch": 5.427008273285295, |
| "grad_norm": 5.642305374145508, |
| "learning_rate": 1.145634782981761e-06, |
| "loss": 41.1947, |
| "step": 1586 |
| }, |
| { |
| "epoch": 5.433840405657859, |
| "grad_norm": 3.9637906551361084, |
| "learning_rate": 1.1184018900240011e-06, |
| "loss": 41.5425, |
| "step": 1588 |
| }, |
| { |
| "epoch": 5.440672538030424, |
| "grad_norm": 4.328593730926514, |
| "learning_rate": 1.0914891725468141e-06, |
| "loss": 41.7915, |
| "step": 1590 |
| }, |
| { |
| "epoch": 5.4475046704029895, |
| "grad_norm": 4.559619903564453, |
| "learning_rate": 1.06489699136324e-06, |
| "loss": 39.5462, |
| "step": 1592 |
| }, |
| { |
| "epoch": 5.454336802775554, |
| "grad_norm": 4.174973011016846, |
| "learning_rate": 1.0386257029889768e-06, |
| "loss": 40.6458, |
| "step": 1594 |
| }, |
| { |
| "epoch": 5.461168935148119, |
| "grad_norm": 3.249431610107422, |
| "learning_rate": 1.0126756596375686e-06, |
| "loss": 41.4128, |
| "step": 1596 |
| }, |
| { |
| "epoch": 5.468001067520683, |
| "grad_norm": 4.598479747772217, |
| "learning_rate": 9.87047209215694e-07, |
| "loss": 41.7854, |
| "step": 1598 |
| }, |
| { |
| "epoch": 5.474833199893248, |
| "grad_norm": 3.558709144592285, |
| "learning_rate": 9.617406953185138e-07, |
| "loss": 41.9632, |
| "step": 1600 |
| }, |
| { |
| "epoch": 5.474833199893248, |
| "eval_loss": 0.6698766350746155, |
| "eval_runtime": 133.9539, |
| "eval_samples_per_second": 29.45, |
| "eval_steps_per_second": 7.368, |
| "step": 1600 |
| }, |
| { |
| "epoch": 5.481665332265813, |
| "grad_norm": 5.397751331329346, |
| "learning_rate": 9.36756457225052e-07, |
| "loss": 40.2635, |
| "step": 1602 |
| }, |
| { |
| "epoch": 5.488497464638377, |
| "grad_norm": 5.443418502807617, |
| "learning_rate": 9.120948298936421e-07, |
| "loss": 40.6923, |
| "step": 1604 |
| }, |
| { |
| "epoch": 5.495329597010942, |
| "grad_norm": 3.991673707962036, |
| "learning_rate": 8.87756143957455e-07, |
| "loss": 40.0543, |
| "step": 1606 |
| }, |
| { |
| "epoch": 5.502161729383507, |
| "grad_norm": 4.649523735046387, |
| "learning_rate": 8.637407257200497e-07, |
| "loss": 41.3534, |
| "step": 1608 |
| }, |
| { |
| "epoch": 5.508993861756071, |
| "grad_norm": 4.675793170928955, |
| "learning_rate": 8.400488971509968e-07, |
| "loss": 39.8315, |
| "step": 1610 |
| }, |
| { |
| "epoch": 5.515825994128637, |
| "grad_norm": 3.273359775543213, |
| "learning_rate": 8.166809758815896e-07, |
| "loss": 39.9979, |
| "step": 1612 |
| }, |
| { |
| "epoch": 5.5226581265012005, |
| "grad_norm": 4.165469169616699, |
| "learning_rate": 7.936372752005399e-07, |
| "loss": 39.3362, |
| "step": 1614 |
| }, |
| { |
| "epoch": 5.529490258873766, |
| "grad_norm": 4.015806674957275, |
| "learning_rate": 7.709181040498254e-07, |
| "loss": 40.7772, |
| "step": 1616 |
| }, |
| { |
| "epoch": 5.536322391246331, |
| "grad_norm": 6.13747501373291, |
| "learning_rate": 7.485237670205175e-07, |
| "loss": 40.8463, |
| "step": 1618 |
| }, |
| { |
| "epoch": 5.543154523618895, |
| "grad_norm": 3.6014761924743652, |
| "learning_rate": 7.264545643486997e-07, |
| "loss": 40.231, |
| "step": 1620 |
| }, |
| { |
| "epoch": 5.54998665599146, |
| "grad_norm": 4.055222034454346, |
| "learning_rate": 7.047107919114588e-07, |
| "loss": 42.5435, |
| "step": 1622 |
| }, |
| { |
| "epoch": 5.5568187883640245, |
| "grad_norm": 5.444411277770996, |
| "learning_rate": 6.832927412229018e-07, |
| "loss": 41.0914, |
| "step": 1624 |
| }, |
| { |
| "epoch": 5.563650920736589, |
| "grad_norm": 3.4832520484924316, |
| "learning_rate": 6.622006994302543e-07, |
| "loss": 42.297, |
| "step": 1626 |
| }, |
| { |
| "epoch": 5.570483053109154, |
| "grad_norm": 5.123753547668457, |
| "learning_rate": 6.41434949310013e-07, |
| "loss": 40.4283, |
| "step": 1628 |
| }, |
| { |
| "epoch": 5.5773151854817185, |
| "grad_norm": 5.2065277099609375, |
| "learning_rate": 6.209957692641544e-07, |
| "loss": 40.5581, |
| "step": 1630 |
| }, |
| { |
| "epoch": 5.584147317854283, |
| "grad_norm": 4.573667049407959, |
| "learning_rate": 6.008834333163876e-07, |
| "loss": 39.4126, |
| "step": 1632 |
| }, |
| { |
| "epoch": 5.590979450226849, |
| "grad_norm": 5.208593368530273, |
| "learning_rate": 5.810982111085106e-07, |
| "loss": 40.7202, |
| "step": 1634 |
| }, |
| { |
| "epoch": 5.597811582599413, |
| "grad_norm": 4.341737747192383, |
| "learning_rate": 5.616403678967624e-07, |
| "loss": 40.9683, |
| "step": 1636 |
| }, |
| { |
| "epoch": 5.604643714971978, |
| "grad_norm": 4.836015701293945, |
| "learning_rate": 5.42510164548285e-07, |
| "loss": 40.4273, |
| "step": 1638 |
| }, |
| { |
| "epoch": 5.6114758473445425, |
| "grad_norm": 4.308472633361816, |
| "learning_rate": 5.237078575376336e-07, |
| "loss": 41.0492, |
| "step": 1640 |
| }, |
| { |
| "epoch": 5.618307979717107, |
| "grad_norm": 4.316090106964111, |
| "learning_rate": 5.052336989433082e-07, |
| "loss": 40.6806, |
| "step": 1642 |
| }, |
| { |
| "epoch": 5.625140112089672, |
| "grad_norm": 3.6825830936431885, |
| "learning_rate": 4.870879364444109e-07, |
| "loss": 40.5467, |
| "step": 1644 |
| }, |
| { |
| "epoch": 5.631972244462236, |
| "grad_norm": 5.199794769287109, |
| "learning_rate": 4.692708133172991e-07, |
| "loss": 39.4587, |
| "step": 1646 |
| }, |
| { |
| "epoch": 5.638804376834801, |
| "grad_norm": 3.3388471603393555, |
| "learning_rate": 4.517825684323324e-07, |
| "loss": 39.1098, |
| "step": 1648 |
| }, |
| { |
| "epoch": 5.645636509207366, |
| "grad_norm": 4.200729846954346, |
| "learning_rate": 4.346234362506724e-07, |
| "loss": 40.122, |
| "step": 1650 |
| }, |
| { |
| "epoch": 5.645636509207366, |
| "eval_loss": 0.6662212014198303, |
| "eval_runtime": 137.6293, |
| "eval_samples_per_second": 28.664, |
| "eval_steps_per_second": 7.171, |
| "step": 1650 |
| }, |
| { |
| "epoch": 5.65246864157993, |
| "grad_norm": 3.9246127605438232, |
| "learning_rate": 4.1779364682113796e-07, |
| "loss": 40.0725, |
| "step": 1652 |
| }, |
| { |
| "epoch": 5.659300773952495, |
| "grad_norm": 4.904084205627441, |
| "learning_rate": 4.012934257771134e-07, |
| "loss": 40.0188, |
| "step": 1654 |
| }, |
| { |
| "epoch": 5.6661329063250605, |
| "grad_norm": 4.436688423156738, |
| "learning_rate": 3.851229943335394e-07, |
| "loss": 39.9216, |
| "step": 1656 |
| }, |
| { |
| "epoch": 5.672965038697625, |
| "grad_norm": 4.027088642120361, |
| "learning_rate": 3.6928256928393247e-07, |
| "loss": 41.4124, |
| "step": 1658 |
| }, |
| { |
| "epoch": 5.67979717107019, |
| "grad_norm": 3.796221971511841, |
| "learning_rate": 3.537723629974815e-07, |
| "loss": 39.8851, |
| "step": 1660 |
| }, |
| { |
| "epoch": 5.686629303442754, |
| "grad_norm": 4.7540130615234375, |
| "learning_rate": 3.3859258341621125e-07, |
| "loss": 40.1716, |
| "step": 1662 |
| }, |
| { |
| "epoch": 5.693461435815319, |
| "grad_norm": 4.521333694458008, |
| "learning_rate": 3.237434340521789e-07, |
| "loss": 41.4182, |
| "step": 1664 |
| }, |
| { |
| "epoch": 5.700293568187884, |
| "grad_norm": 4.776477336883545, |
| "learning_rate": 3.0922511398475683e-07, |
| "loss": 41.2698, |
| "step": 1666 |
| }, |
| { |
| "epoch": 5.707125700560448, |
| "grad_norm": 4.749114990234375, |
| "learning_rate": 2.9503781785795713e-07, |
| "loss": 42.4175, |
| "step": 1668 |
| }, |
| { |
| "epoch": 5.713957832933013, |
| "grad_norm": 4.831925392150879, |
| "learning_rate": 2.8118173587782516e-07, |
| "loss": 40.593, |
| "step": 1670 |
| }, |
| { |
| "epoch": 5.720789965305578, |
| "grad_norm": 4.17523193359375, |
| "learning_rate": 2.6765705380989437e-07, |
| "loss": 39.8755, |
| "step": 1672 |
| }, |
| { |
| "epoch": 5.727622097678142, |
| "grad_norm": 4.183824062347412, |
| "learning_rate": 2.544639529766829e-07, |
| "loss": 40.7682, |
| "step": 1674 |
| }, |
| { |
| "epoch": 5.734454230050707, |
| "grad_norm": 4.203549385070801, |
| "learning_rate": 2.416026102552732e-07, |
| "loss": 40.1932, |
| "step": 1676 |
| }, |
| { |
| "epoch": 5.741286362423272, |
| "grad_norm": 4.252909183502197, |
| "learning_rate": 2.290731980749361e-07, |
| "loss": 41.4024, |
| "step": 1678 |
| }, |
| { |
| "epoch": 5.748118494795837, |
| "grad_norm": 4.110680103302002, |
| "learning_rate": 2.168758844148272e-07, |
| "loss": 40.8089, |
| "step": 1680 |
| }, |
| { |
| "epoch": 5.754950627168402, |
| "grad_norm": 4.860687732696533, |
| "learning_rate": 2.050108328017164e-07, |
| "loss": 41.278, |
| "step": 1682 |
| }, |
| { |
| "epoch": 5.761782759540966, |
| "grad_norm": 7.037466526031494, |
| "learning_rate": 1.93478202307823e-07, |
| "loss": 42.0162, |
| "step": 1684 |
| }, |
| { |
| "epoch": 5.768614891913531, |
| "grad_norm": 4.048498630523682, |
| "learning_rate": 1.8227814754865068e-07, |
| "loss": 41.2187, |
| "step": 1686 |
| }, |
| { |
| "epoch": 5.775447024286096, |
| "grad_norm": 3.721379518508911, |
| "learning_rate": 1.7141081868094212e-07, |
| "loss": 41.8383, |
| "step": 1688 |
| }, |
| { |
| "epoch": 5.78227915665866, |
| "grad_norm": 6.793107509613037, |
| "learning_rate": 1.6087636140065532e-07, |
| "loss": 40.5894, |
| "step": 1690 |
| }, |
| { |
| "epoch": 5.789111289031225, |
| "grad_norm": 4.424513339996338, |
| "learning_rate": 1.5067491694100154e-07, |
| "loss": 41.2666, |
| "step": 1692 |
| }, |
| { |
| "epoch": 5.7959434214037895, |
| "grad_norm": 4.707203388214111, |
| "learning_rate": 1.4080662207056894e-07, |
| "loss": 41.2405, |
| "step": 1694 |
| }, |
| { |
| "epoch": 5.802775553776354, |
| "grad_norm": 2.994469165802002, |
| "learning_rate": 1.3127160909147672e-07, |
| "loss": 42.6466, |
| "step": 1696 |
| }, |
| { |
| "epoch": 5.809607686148919, |
| "grad_norm": 3.029481887817383, |
| "learning_rate": 1.220700058376073e-07, |
| "loss": 40.642, |
| "step": 1698 |
| }, |
| { |
| "epoch": 5.816439818521484, |
| "grad_norm": 3.4690332412719727, |
| "learning_rate": 1.1320193567288529e-07, |
| "loss": 41.02, |
| "step": 1700 |
| }, |
| { |
| "epoch": 5.816439818521484, |
| "eval_loss": 0.6652334928512573, |
| "eval_runtime": 134.4616, |
| "eval_samples_per_second": 29.339, |
| "eval_steps_per_second": 7.34, |
| "step": 1700 |
| }, |
| { |
| "epoch": 5.823271950894049, |
| "grad_norm": 5.008721828460693, |
| "learning_rate": 1.0466751748963444e-07, |
| "loss": 40.1855, |
| "step": 1702 |
| }, |
| { |
| "epoch": 5.830104083266614, |
| "grad_norm": 5.638387680053711, |
| "learning_rate": 9.646686570697061e-08, |
| "loss": 40.6194, |
| "step": 1704 |
| }, |
| { |
| "epoch": 5.836936215639178, |
| "grad_norm": 5.234898567199707, |
| "learning_rate": 8.860009026928629e-08, |
| "loss": 40.6608, |
| "step": 1706 |
| }, |
| { |
| "epoch": 5.843768348011743, |
| "grad_norm": 4.212846279144287, |
| "learning_rate": 8.106729664475176e-08, |
| "loss": 41.4097, |
| "step": 1708 |
| }, |
| { |
| "epoch": 5.8506004803843075, |
| "grad_norm": 3.5884008407592773, |
| "learning_rate": 7.386858582392187e-08, |
| "loss": 39.4515, |
| "step": 1710 |
| }, |
| { |
| "epoch": 5.857432612756872, |
| "grad_norm": 4.441662788391113, |
| "learning_rate": 6.700405431837587e-08, |
| "loss": 41.8026, |
| "step": 1712 |
| }, |
| { |
| "epoch": 5.864264745129437, |
| "grad_norm": 5.290170192718506, |
| "learning_rate": 6.047379415941856e-08, |
| "loss": 40.8839, |
| "step": 1714 |
| }, |
| { |
| "epoch": 5.871096877502001, |
| "grad_norm": 3.4507861137390137, |
| "learning_rate": 5.4277892896853476e-08, |
| "loss": 40.574, |
| "step": 1716 |
| }, |
| { |
| "epoch": 5.877929009874566, |
| "grad_norm": 3.869871139526367, |
| "learning_rate": 4.8416433597803234e-08, |
| "loss": 41.8288, |
| "step": 1718 |
| }, |
| { |
| "epoch": 5.884761142247131, |
| "grad_norm": 4.644185543060303, |
| "learning_rate": 4.2889494845599344e-08, |
| "loss": 41.318, |
| "step": 1720 |
| }, |
| { |
| "epoch": 5.891593274619696, |
| "grad_norm": 3.191018581390381, |
| "learning_rate": 3.769715073872748e-08, |
| "loss": 41.1112, |
| "step": 1722 |
| }, |
| { |
| "epoch": 5.898425406992261, |
| "grad_norm": 3.394134998321533, |
| "learning_rate": 3.283947088983663e-08, |
| "loss": 41.9932, |
| "step": 1724 |
| }, |
| { |
| "epoch": 5.9052575393648254, |
| "grad_norm": 4.62444543838501, |
| "learning_rate": 2.831652042480093e-08, |
| "loss": 39.9583, |
| "step": 1726 |
| }, |
| { |
| "epoch": 5.91208967173739, |
| "grad_norm": 4.27966833114624, |
| "learning_rate": 2.4128359981850924e-08, |
| "loss": 39.915, |
| "step": 1728 |
| }, |
| { |
| "epoch": 5.918921804109955, |
| "grad_norm": 3.7036333084106445, |
| "learning_rate": 2.0275045710760334e-08, |
| "loss": 40.0384, |
| "step": 1730 |
| }, |
| { |
| "epoch": 5.925753936482519, |
| "grad_norm": 5.249677658081055, |
| "learning_rate": 1.6756629272085545e-08, |
| "loss": 40.1564, |
| "step": 1732 |
| }, |
| { |
| "epoch": 5.932586068855084, |
| "grad_norm": 4.477707862854004, |
| "learning_rate": 1.3573157836485606e-08, |
| "loss": 40.6008, |
| "step": 1734 |
| }, |
| { |
| "epoch": 5.939418201227649, |
| "grad_norm": 4.939481258392334, |
| "learning_rate": 1.0724674084083841e-08, |
| "loss": 40.9639, |
| "step": 1736 |
| }, |
| { |
| "epoch": 5.946250333600213, |
| "grad_norm": 2.9428999423980713, |
| "learning_rate": 8.211216203890537e-09, |
| "loss": 40.9722, |
| "step": 1738 |
| }, |
| { |
| "epoch": 5.953082465972778, |
| "grad_norm": 4.589330673217773, |
| "learning_rate": 6.032817893297793e-09, |
| "loss": 41.4832, |
| "step": 1740 |
| }, |
| { |
| "epoch": 5.9599145983453425, |
| "grad_norm": 5.4429450035095215, |
| "learning_rate": 4.1895083576271035e-09, |
| "loss": 41.8059, |
| "step": 1742 |
| }, |
| { |
| "epoch": 5.966746730717908, |
| "grad_norm": 3.5152432918548584, |
| "learning_rate": 2.681312309735229e-09, |
| "loss": 41.2228, |
| "step": 1744 |
| }, |
| { |
| "epoch": 5.973578863090473, |
| "grad_norm": 4.573424339294434, |
| "learning_rate": 1.5082499696839059e-09, |
| "loss": 41.9849, |
| "step": 1746 |
| }, |
| { |
| "epoch": 5.980410995463037, |
| "grad_norm": 4.099581718444824, |
| "learning_rate": 6.703370644706164e-10, |
| "loss": 40.6948, |
| "step": 1748 |
| }, |
| { |
| "epoch": 5.987243127835602, |
| "grad_norm": 4.090056896209717, |
| "learning_rate": 1.6758482781209507e-10, |
| "loss": 40.9226, |
| "step": 1750 |
| }, |
| { |
| "epoch": 5.987243127835602, |
| "eval_loss": 0.6658891439437866, |
| "eval_runtime": 134.1369, |
| "eval_samples_per_second": 29.41, |
| "eval_steps_per_second": 7.358, |
| "step": 1750 |
| }, |
| { |
| "epoch": 5.994075260208167, |
| "grad_norm": 4.494061470031738, |
| "learning_rate": 0.0, |
| "loss": 41.0993, |
| "step": 1752 |
| } |
| ], |
| "logging_steps": 2, |
| "max_steps": 1752, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 3, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 1 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.616163439072248e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|