{ "best_global_step": 1800, "best_metric": 0.74, "best_model_checkpoint": "/mnt/parscratch/users/acr24wz/etu/topcon/qwen3_4B/cpt_model/balanced/finetuned/all/checkpoint-1800", "epoch": 2.0642662458757712, "eval_steps": 100, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011476115334959117, "grad_norm": 201.0, "learning_rate": 0.0, "loss": 18.5701, "step": 1 }, { "epoch": 0.0022952230669918234, "grad_norm": 1392.0, "learning_rate": 4.587155963302753e-08, "loss": 12.0441, "step": 2 }, { "epoch": 0.0034428346004877347, "grad_norm": 161.0, "learning_rate": 9.174311926605506e-08, "loss": 14.3223, "step": 3 }, { "epoch": 0.004590446133983647, "grad_norm": 150.0, "learning_rate": 1.376146788990826e-07, "loss": 10.3759, "step": 4 }, { "epoch": 0.005738057667479558, "grad_norm": 141.0, "learning_rate": 1.8348623853211012e-07, "loss": 11.4624, "step": 5 }, { "epoch": 0.006885669200975469, "grad_norm": 159.0, "learning_rate": 2.2935779816513764e-07, "loss": 10.5127, "step": 6 }, { "epoch": 0.008033280734471382, "grad_norm": 166.0, "learning_rate": 2.752293577981652e-07, "loss": 15.7339, "step": 7 }, { "epoch": 0.009180892267967294, "grad_norm": 310.0, "learning_rate": 3.211009174311927e-07, "loss": 21.2237, "step": 8 }, { "epoch": 0.010328503801463204, "grad_norm": 153.0, "learning_rate": 3.6697247706422023e-07, "loss": 11.4438, "step": 9 }, { "epoch": 0.011476115334959116, "grad_norm": 146.0, "learning_rate": 4.128440366972478e-07, "loss": 14.27, "step": 10 }, { "epoch": 0.012623726868455028, "grad_norm": 185.0, "learning_rate": 4.587155963302753e-07, "loss": 17.4331, "step": 11 }, { "epoch": 0.013771338401950939, "grad_norm": 157.0, "learning_rate": 5.045871559633028e-07, "loss": 16.0972, "step": 12 }, { "epoch": 0.014918949935446851, "grad_norm": 146.0, "learning_rate": 5.504587155963304e-07, "loss": 10.9198, "step": 13 }, { "epoch": 0.016066561468942763, "grad_norm": 194.0, "learning_rate": 5.963302752293579e-07, "loss": 14.1635, "step": 14 }, { "epoch": 0.017214173002438674, "grad_norm": 162.0, "learning_rate": 6.422018348623854e-07, "loss": 11.853, "step": 15 }, { "epoch": 0.018361784535934587, "grad_norm": 160.0, "learning_rate": 6.880733944954129e-07, "loss": 12.7435, "step": 16 }, { "epoch": 0.019509396069430498, "grad_norm": 158.0, "learning_rate": 7.339449541284405e-07, "loss": 11.8396, "step": 17 }, { "epoch": 0.020657007602926408, "grad_norm": 165.0, "learning_rate": 7.79816513761468e-07, "loss": 11.5206, "step": 18 }, { "epoch": 0.021804619136422322, "grad_norm": 177.0, "learning_rate": 8.256880733944956e-07, "loss": 11.5111, "step": 19 }, { "epoch": 0.022952230669918233, "grad_norm": 157.0, "learning_rate": 8.71559633027523e-07, "loss": 14.7131, "step": 20 }, { "epoch": 0.024099842203414143, "grad_norm": 172.0, "learning_rate": 9.174311926605506e-07, "loss": 11.1313, "step": 21 }, { "epoch": 0.025247453736910057, "grad_norm": 194.0, "learning_rate": 9.633027522935782e-07, "loss": 16.6221, "step": 22 }, { "epoch": 0.026395065270405967, "grad_norm": 268.0, "learning_rate": 1.0091743119266057e-06, "loss": 15.6201, "step": 23 }, { "epoch": 0.027542676803901878, "grad_norm": 218.0, "learning_rate": 1.055045871559633e-06, "loss": 11.6275, "step": 24 }, { "epoch": 0.02869028833739779, "grad_norm": 158.0, "learning_rate": 1.1009174311926608e-06, "loss": 15.5678, "step": 25 }, { "epoch": 0.029837899870893702, "grad_norm": 236.0, "learning_rate": 1.1467889908256882e-06, "loss": 15.4133, "step": 26 }, { "epoch": 0.030985511404389616, "grad_norm": 131.0, "learning_rate": 1.1926605504587159e-06, "loss": 12.6406, "step": 27 }, { "epoch": 0.032133122937885526, "grad_norm": 139.0, "learning_rate": 1.2385321100917433e-06, "loss": 15.0131, "step": 28 }, { "epoch": 0.03328073447138144, "grad_norm": 150.0, "learning_rate": 1.2844036697247707e-06, "loss": 13.4583, "step": 29 }, { "epoch": 0.03442834600487735, "grad_norm": 166.0, "learning_rate": 1.3302752293577984e-06, "loss": 15.6894, "step": 30 }, { "epoch": 0.03557595753837326, "grad_norm": 168.0, "learning_rate": 1.3761467889908258e-06, "loss": 13.8435, "step": 31 }, { "epoch": 0.036723569071869175, "grad_norm": 191.0, "learning_rate": 1.4220183486238535e-06, "loss": 8.6607, "step": 32 }, { "epoch": 0.03787118060536508, "grad_norm": 140.0, "learning_rate": 1.467889908256881e-06, "loss": 12.4132, "step": 33 }, { "epoch": 0.039018792138860996, "grad_norm": 201.0, "learning_rate": 1.5137614678899084e-06, "loss": 13.8843, "step": 34 }, { "epoch": 0.04016640367235691, "grad_norm": 164.0, "learning_rate": 1.559633027522936e-06, "loss": 15.1008, "step": 35 }, { "epoch": 0.041314015205852817, "grad_norm": 284.0, "learning_rate": 1.6055045871559635e-06, "loss": 21.6636, "step": 36 }, { "epoch": 0.04246162673934873, "grad_norm": 394.0, "learning_rate": 1.6513761467889911e-06, "loss": 14.0359, "step": 37 }, { "epoch": 0.043609238272844644, "grad_norm": 161.0, "learning_rate": 1.6972477064220186e-06, "loss": 13.4677, "step": 38 }, { "epoch": 0.04475684980634055, "grad_norm": 164.0, "learning_rate": 1.743119266055046e-06, "loss": 8.3447, "step": 39 }, { "epoch": 0.045904461339836465, "grad_norm": 148.0, "learning_rate": 1.7889908256880737e-06, "loss": 6.5679, "step": 40 }, { "epoch": 0.04705207287333238, "grad_norm": 168.0, "learning_rate": 1.8348623853211011e-06, "loss": 15.6762, "step": 41 }, { "epoch": 0.048199684406828286, "grad_norm": 164.0, "learning_rate": 1.8807339449541288e-06, "loss": 6.2052, "step": 42 }, { "epoch": 0.0493472959403242, "grad_norm": 103.5, "learning_rate": 1.9266055045871564e-06, "loss": 8.8464, "step": 43 }, { "epoch": 0.050494907473820114, "grad_norm": 372.0, "learning_rate": 1.9724770642201837e-06, "loss": 15.5934, "step": 44 }, { "epoch": 0.05164251900731602, "grad_norm": 210.0, "learning_rate": 2.0183486238532113e-06, "loss": 11.6431, "step": 45 }, { "epoch": 0.052790130540811935, "grad_norm": 125.5, "learning_rate": 2.064220183486239e-06, "loss": 8.5935, "step": 46 }, { "epoch": 0.05393774207430785, "grad_norm": 102.5, "learning_rate": 2.110091743119266e-06, "loss": 9.1192, "step": 47 }, { "epoch": 0.055085353607803755, "grad_norm": 143.0, "learning_rate": 2.155963302752294e-06, "loss": 8.8696, "step": 48 }, { "epoch": 0.05623296514129967, "grad_norm": 137.0, "learning_rate": 2.2018348623853215e-06, "loss": 11.2497, "step": 49 }, { "epoch": 0.05738057667479558, "grad_norm": 129.0, "learning_rate": 2.2477064220183487e-06, "loss": 11.6115, "step": 50 }, { "epoch": 0.05852818820829149, "grad_norm": 149.0, "learning_rate": 2.2935779816513764e-06, "loss": 13.9466, "step": 51 }, { "epoch": 0.059675799741787404, "grad_norm": 120.5, "learning_rate": 2.339449541284404e-06, "loss": 9.2116, "step": 52 }, { "epoch": 0.06082341127528332, "grad_norm": 240.0, "learning_rate": 2.3853211009174317e-06, "loss": 21.5138, "step": 53 }, { "epoch": 0.06197102280877923, "grad_norm": 112.0, "learning_rate": 2.431192660550459e-06, "loss": 9.7849, "step": 54 }, { "epoch": 0.06311863434227515, "grad_norm": 135.0, "learning_rate": 2.4770642201834866e-06, "loss": 12.795, "step": 55 }, { "epoch": 0.06426624587577105, "grad_norm": 222.0, "learning_rate": 2.522935779816514e-06, "loss": 15.822, "step": 56 }, { "epoch": 0.06541385740926696, "grad_norm": 137.0, "learning_rate": 2.5688073394495415e-06, "loss": 14.4134, "step": 57 }, { "epoch": 0.06656146894276288, "grad_norm": 100.0, "learning_rate": 2.6146788990825687e-06, "loss": 10.1907, "step": 58 }, { "epoch": 0.06770908047625879, "grad_norm": 154.0, "learning_rate": 2.6605504587155968e-06, "loss": 14.2824, "step": 59 }, { "epoch": 0.0688566920097547, "grad_norm": 109.0, "learning_rate": 2.706422018348624e-06, "loss": 8.6855, "step": 60 }, { "epoch": 0.07000430354325061, "grad_norm": 198.0, "learning_rate": 2.7522935779816517e-06, "loss": 14.2233, "step": 61 }, { "epoch": 0.07115191507674652, "grad_norm": 134.0, "learning_rate": 2.798165137614679e-06, "loss": 11.8776, "step": 62 }, { "epoch": 0.07229952661024243, "grad_norm": 112.5, "learning_rate": 2.844036697247707e-06, "loss": 9.321, "step": 63 }, { "epoch": 0.07344713814373835, "grad_norm": 118.0, "learning_rate": 2.8899082568807342e-06, "loss": 10.727, "step": 64 }, { "epoch": 0.07459474967723426, "grad_norm": 167.0, "learning_rate": 2.935779816513762e-06, "loss": 13.9879, "step": 65 }, { "epoch": 0.07574236121073016, "grad_norm": 100.0, "learning_rate": 2.981651376146789e-06, "loss": 7.9334, "step": 66 }, { "epoch": 0.07688997274422608, "grad_norm": 153.0, "learning_rate": 3.0275229357798168e-06, "loss": 14.9343, "step": 67 }, { "epoch": 0.07803758427772199, "grad_norm": 119.0, "learning_rate": 3.073394495412844e-06, "loss": 8.6676, "step": 68 }, { "epoch": 0.0791851958112179, "grad_norm": 117.5, "learning_rate": 3.119266055045872e-06, "loss": 10.101, "step": 69 }, { "epoch": 0.08033280734471382, "grad_norm": 85.5, "learning_rate": 3.1651376146788993e-06, "loss": 7.5899, "step": 70 }, { "epoch": 0.08148041887820973, "grad_norm": 174.0, "learning_rate": 3.211009174311927e-06, "loss": 15.9673, "step": 71 }, { "epoch": 0.08262803041170563, "grad_norm": 221.0, "learning_rate": 3.256880733944954e-06, "loss": 14.1455, "step": 72 }, { "epoch": 0.08377564194520155, "grad_norm": 172.0, "learning_rate": 3.3027522935779823e-06, "loss": 15.9228, "step": 73 }, { "epoch": 0.08492325347869746, "grad_norm": 144.0, "learning_rate": 3.3486238532110095e-06, "loss": 12.6043, "step": 74 }, { "epoch": 0.08607086501219337, "grad_norm": 118.5, "learning_rate": 3.394495412844037e-06, "loss": 9.2068, "step": 75 }, { "epoch": 0.08721847654568929, "grad_norm": 147.0, "learning_rate": 3.4403669724770644e-06, "loss": 11.8722, "step": 76 }, { "epoch": 0.0883660880791852, "grad_norm": 119.5, "learning_rate": 3.486238532110092e-06, "loss": 10.4207, "step": 77 }, { "epoch": 0.0895136996126811, "grad_norm": 170.0, "learning_rate": 3.5321100917431193e-06, "loss": 14.4936, "step": 78 }, { "epoch": 0.09066131114617702, "grad_norm": 183.0, "learning_rate": 3.5779816513761473e-06, "loss": 14.2192, "step": 79 }, { "epoch": 0.09180892267967293, "grad_norm": 128.0, "learning_rate": 3.6238532110091746e-06, "loss": 12.4628, "step": 80 }, { "epoch": 0.09295653421316884, "grad_norm": 100.0, "learning_rate": 3.6697247706422022e-06, "loss": 5.9004, "step": 81 }, { "epoch": 0.09410414574666476, "grad_norm": 163.0, "learning_rate": 3.7155963302752295e-06, "loss": 12.766, "step": 82 }, { "epoch": 0.09525175728016066, "grad_norm": 202.0, "learning_rate": 3.7614678899082575e-06, "loss": 14.3118, "step": 83 }, { "epoch": 0.09639936881365657, "grad_norm": 314.0, "learning_rate": 3.8073394495412848e-06, "loss": 12.7559, "step": 84 }, { "epoch": 0.09754698034715249, "grad_norm": 100.0, "learning_rate": 3.853211009174313e-06, "loss": 7.6448, "step": 85 }, { "epoch": 0.0986945918806484, "grad_norm": 135.0, "learning_rate": 3.89908256880734e-06, "loss": 11.1222, "step": 86 }, { "epoch": 0.0998422034141443, "grad_norm": 176.0, "learning_rate": 3.944954128440367e-06, "loss": 11.0153, "step": 87 }, { "epoch": 0.10098981494764023, "grad_norm": 130.0, "learning_rate": 3.9908256880733945e-06, "loss": 11.3109, "step": 88 }, { "epoch": 0.10213742648113613, "grad_norm": 176.0, "learning_rate": 4.036697247706423e-06, "loss": 11.3729, "step": 89 }, { "epoch": 0.10328503801463204, "grad_norm": 132.0, "learning_rate": 4.08256880733945e-06, "loss": 10.5579, "step": 90 }, { "epoch": 0.10443264954812796, "grad_norm": 126.0, "learning_rate": 4.128440366972478e-06, "loss": 9.2442, "step": 91 }, { "epoch": 0.10558026108162387, "grad_norm": 151.0, "learning_rate": 4.174311926605505e-06, "loss": 13.6998, "step": 92 }, { "epoch": 0.10672787261511978, "grad_norm": 99.5, "learning_rate": 4.220183486238532e-06, "loss": 8.6871, "step": 93 }, { "epoch": 0.1078754841486157, "grad_norm": 128.0, "learning_rate": 4.26605504587156e-06, "loss": 8.4919, "step": 94 }, { "epoch": 0.1090230956821116, "grad_norm": 132.0, "learning_rate": 4.311926605504588e-06, "loss": 8.9568, "step": 95 }, { "epoch": 0.11017070721560751, "grad_norm": 135.0, "learning_rate": 4.357798165137615e-06, "loss": 11.2536, "step": 96 }, { "epoch": 0.11131831874910343, "grad_norm": 141.0, "learning_rate": 4.403669724770643e-06, "loss": 10.4686, "step": 97 }, { "epoch": 0.11246593028259934, "grad_norm": 78.5, "learning_rate": 4.44954128440367e-06, "loss": 4.7855, "step": 98 }, { "epoch": 0.11361354181609525, "grad_norm": 126.5, "learning_rate": 4.4954128440366975e-06, "loss": 8.6237, "step": 99 }, { "epoch": 0.11476115334959117, "grad_norm": 104.5, "learning_rate": 4.541284403669725e-06, "loss": 6.5662, "step": 100 }, { "epoch": 0.11476115334959117, "eval_accuracy": 0.46, "eval_loss": 10.765486717224121, "eval_runtime": 49.6485, "eval_samples_per_second": 2.014, "eval_steps_per_second": 2.014, "step": 100 }, { "epoch": 0.11590876488308707, "grad_norm": 103.0, "learning_rate": 4.587155963302753e-06, "loss": 6.5649, "step": 101 }, { "epoch": 0.11705637641658298, "grad_norm": 144.0, "learning_rate": 4.63302752293578e-06, "loss": 8.2535, "step": 102 }, { "epoch": 0.1182039879500789, "grad_norm": 135.0, "learning_rate": 4.678899082568808e-06, "loss": 11.0001, "step": 103 }, { "epoch": 0.11935159948357481, "grad_norm": 109.0, "learning_rate": 4.724770642201835e-06, "loss": 8.2321, "step": 104 }, { "epoch": 0.12049921101707071, "grad_norm": 134.0, "learning_rate": 4.770642201834863e-06, "loss": 10.8236, "step": 105 }, { "epoch": 0.12164682255056664, "grad_norm": 133.0, "learning_rate": 4.816513761467891e-06, "loss": 10.03, "step": 106 }, { "epoch": 0.12279443408406254, "grad_norm": 148.0, "learning_rate": 4.862385321100918e-06, "loss": 13.1908, "step": 107 }, { "epoch": 0.12394204561755846, "grad_norm": 64.0, "learning_rate": 4.908256880733945e-06, "loss": 3.6086, "step": 108 }, { "epoch": 0.12508965715105436, "grad_norm": 139.0, "learning_rate": 4.954128440366973e-06, "loss": 10.9146, "step": 109 }, { "epoch": 0.1262372686845503, "grad_norm": 100.5, "learning_rate": 5e-06, "loss": 7.6266, "step": 110 }, { "epoch": 0.1273848802180462, "grad_norm": 102.0, "learning_rate": 5.045871559633028e-06, "loss": 8.3553, "step": 111 }, { "epoch": 0.1285324917515421, "grad_norm": 145.0, "learning_rate": 5.091743119266055e-06, "loss": 8.7646, "step": 112 }, { "epoch": 0.129680103285038, "grad_norm": 178.0, "learning_rate": 5.137614678899083e-06, "loss": 12.6374, "step": 113 }, { "epoch": 0.13082771481853392, "grad_norm": 91.5, "learning_rate": 5.18348623853211e-06, "loss": 5.8455, "step": 114 }, { "epoch": 0.13197532635202983, "grad_norm": 122.0, "learning_rate": 5.229357798165137e-06, "loss": 9.7438, "step": 115 }, { "epoch": 0.13312293788552576, "grad_norm": 192.0, "learning_rate": 5.275229357798165e-06, "loss": 9.9915, "step": 116 }, { "epoch": 0.13427054941902167, "grad_norm": 108.0, "learning_rate": 5.3211009174311936e-06, "loss": 7.6686, "step": 117 }, { "epoch": 0.13541816095251757, "grad_norm": 153.0, "learning_rate": 5.366972477064221e-06, "loss": 10.4111, "step": 118 }, { "epoch": 0.13656577248601348, "grad_norm": 139.0, "learning_rate": 5.412844036697248e-06, "loss": 6.46, "step": 119 }, { "epoch": 0.1377133840195094, "grad_norm": 79.0, "learning_rate": 5.458715596330275e-06, "loss": 5.2337, "step": 120 }, { "epoch": 0.1388609955530053, "grad_norm": 114.5, "learning_rate": 5.504587155963303e-06, "loss": 5.2836, "step": 121 }, { "epoch": 0.14000860708650123, "grad_norm": 99.5, "learning_rate": 5.5504587155963306e-06, "loss": 7.6412, "step": 122 }, { "epoch": 0.14115621861999714, "grad_norm": 147.0, "learning_rate": 5.596330275229358e-06, "loss": 9.4328, "step": 123 }, { "epoch": 0.14230383015349304, "grad_norm": 114.5, "learning_rate": 5.642201834862385e-06, "loss": 7.6121, "step": 124 }, { "epoch": 0.14345144168698895, "grad_norm": 131.0, "learning_rate": 5.688073394495414e-06, "loss": 8.1481, "step": 125 }, { "epoch": 0.14459905322048486, "grad_norm": 124.5, "learning_rate": 5.733944954128441e-06, "loss": 6.9154, "step": 126 }, { "epoch": 0.14574666475398076, "grad_norm": 125.0, "learning_rate": 5.7798165137614684e-06, "loss": 7.5579, "step": 127 }, { "epoch": 0.1468942762874767, "grad_norm": 69.0, "learning_rate": 5.825688073394496e-06, "loss": 4.5767, "step": 128 }, { "epoch": 0.1480418878209726, "grad_norm": 136.0, "learning_rate": 5.871559633027524e-06, "loss": 8.226, "step": 129 }, { "epoch": 0.1491894993544685, "grad_norm": 592.0, "learning_rate": 5.917431192660551e-06, "loss": 4.7686, "step": 130 }, { "epoch": 0.15033711088796442, "grad_norm": 199.0, "learning_rate": 5.963302752293578e-06, "loss": 8.1914, "step": 131 }, { "epoch": 0.15148472242146033, "grad_norm": 99.0, "learning_rate": 6.0091743119266054e-06, "loss": 4.6827, "step": 132 }, { "epoch": 0.15263233395495623, "grad_norm": 79.0, "learning_rate": 6.0550458715596335e-06, "loss": 4.0036, "step": 133 }, { "epoch": 0.15377994548845217, "grad_norm": 104.0, "learning_rate": 6.100917431192661e-06, "loss": 5.5383, "step": 134 }, { "epoch": 0.15492755702194808, "grad_norm": 89.5, "learning_rate": 6.146788990825688e-06, "loss": 5.6737, "step": 135 }, { "epoch": 0.15607516855544398, "grad_norm": 126.0, "learning_rate": 6.192660550458715e-06, "loss": 6.3379, "step": 136 }, { "epoch": 0.1572227800889399, "grad_norm": 106.0, "learning_rate": 6.238532110091744e-06, "loss": 5.8609, "step": 137 }, { "epoch": 0.1583703916224358, "grad_norm": 74.0, "learning_rate": 6.284403669724771e-06, "loss": 2.5903, "step": 138 }, { "epoch": 0.1595180031559317, "grad_norm": 166.0, "learning_rate": 6.330275229357799e-06, "loss": 6.5836, "step": 139 }, { "epoch": 0.16066561468942764, "grad_norm": 132.0, "learning_rate": 6.376146788990826e-06, "loss": 4.7941, "step": 140 }, { "epoch": 0.16181322622292355, "grad_norm": 79.5, "learning_rate": 6.422018348623854e-06, "loss": 3.4315, "step": 141 }, { "epoch": 0.16296083775641945, "grad_norm": 90.0, "learning_rate": 6.467889908256881e-06, "loss": 2.8439, "step": 142 }, { "epoch": 0.16410844928991536, "grad_norm": 147.0, "learning_rate": 6.513761467889908e-06, "loss": 5.9459, "step": 143 }, { "epoch": 0.16525606082341127, "grad_norm": 127.5, "learning_rate": 6.559633027522936e-06, "loss": 5.9421, "step": 144 }, { "epoch": 0.1664036723569072, "grad_norm": 108.5, "learning_rate": 6.6055045871559645e-06, "loss": 4.3347, "step": 145 }, { "epoch": 0.1675512838904031, "grad_norm": 110.0, "learning_rate": 6.651376146788992e-06, "loss": 2.862, "step": 146 }, { "epoch": 0.16869889542389901, "grad_norm": 91.5, "learning_rate": 6.697247706422019e-06, "loss": 3.0382, "step": 147 }, { "epoch": 0.16984650695739492, "grad_norm": 90.0, "learning_rate": 6.743119266055046e-06, "loss": 2.4137, "step": 148 }, { "epoch": 0.17099411849089083, "grad_norm": 205.0, "learning_rate": 6.788990825688074e-06, "loss": 3.6585, "step": 149 }, { "epoch": 0.17214173002438674, "grad_norm": 132.0, "learning_rate": 6.8348623853211015e-06, "loss": 3.3452, "step": 150 }, { "epoch": 0.17328934155788267, "grad_norm": 102.5, "learning_rate": 6.880733944954129e-06, "loss": 2.6872, "step": 151 }, { "epoch": 0.17443695309137858, "grad_norm": 92.5, "learning_rate": 6.926605504587156e-06, "loss": 2.7081, "step": 152 }, { "epoch": 0.17558456462487448, "grad_norm": 97.0, "learning_rate": 6.972477064220184e-06, "loss": 1.789, "step": 153 }, { "epoch": 0.1767321761583704, "grad_norm": 96.5, "learning_rate": 7.018348623853211e-06, "loss": 2.1933, "step": 154 }, { "epoch": 0.1778797876918663, "grad_norm": 117.0, "learning_rate": 7.0642201834862385e-06, "loss": 1.5972, "step": 155 }, { "epoch": 0.1790273992253622, "grad_norm": 70.0, "learning_rate": 7.110091743119267e-06, "loss": 1.6302, "step": 156 }, { "epoch": 0.18017501075885814, "grad_norm": 50.0, "learning_rate": 7.155963302752295e-06, "loss": 1.1936, "step": 157 }, { "epoch": 0.18132262229235405, "grad_norm": 71.5, "learning_rate": 7.201834862385322e-06, "loss": 1.2134, "step": 158 }, { "epoch": 0.18247023382584995, "grad_norm": 37.75, "learning_rate": 7.247706422018349e-06, "loss": 0.8042, "step": 159 }, { "epoch": 0.18361784535934586, "grad_norm": 54.75, "learning_rate": 7.293577981651376e-06, "loss": 0.7016, "step": 160 }, { "epoch": 0.18476545689284177, "grad_norm": 141.0, "learning_rate": 7.3394495412844045e-06, "loss": 1.6214, "step": 161 }, { "epoch": 0.18591306842633767, "grad_norm": 118.0, "learning_rate": 7.385321100917432e-06, "loss": 1.4091, "step": 162 }, { "epoch": 0.1870606799598336, "grad_norm": 42.0, "learning_rate": 7.431192660550459e-06, "loss": 0.636, "step": 163 }, { "epoch": 0.18820829149332952, "grad_norm": 165.0, "learning_rate": 7.477064220183486e-06, "loss": 1.8344, "step": 164 }, { "epoch": 0.18935590302682542, "grad_norm": 62.0, "learning_rate": 7.522935779816515e-06, "loss": 0.5202, "step": 165 }, { "epoch": 0.19050351456032133, "grad_norm": 87.5, "learning_rate": 7.568807339449542e-06, "loss": 1.1639, "step": 166 }, { "epoch": 0.19165112609381724, "grad_norm": 53.0, "learning_rate": 7.6146788990825695e-06, "loss": 0.907, "step": 167 }, { "epoch": 0.19279873762731314, "grad_norm": 59.5, "learning_rate": 7.660550458715596e-06, "loss": 1.0624, "step": 168 }, { "epoch": 0.19394634916080908, "grad_norm": 37.0, "learning_rate": 7.706422018348626e-06, "loss": 0.6051, "step": 169 }, { "epoch": 0.19509396069430499, "grad_norm": 50.75, "learning_rate": 7.752293577981652e-06, "loss": 0.9568, "step": 170 }, { "epoch": 0.1962415722278009, "grad_norm": 99.5, "learning_rate": 7.79816513761468e-06, "loss": 1.0009, "step": 171 }, { "epoch": 0.1973891837612968, "grad_norm": 58.75, "learning_rate": 7.844036697247707e-06, "loss": 1.2179, "step": 172 }, { "epoch": 0.1985367952947927, "grad_norm": 30.375, "learning_rate": 7.889908256880735e-06, "loss": 0.2789, "step": 173 }, { "epoch": 0.1996844068282886, "grad_norm": 48.5, "learning_rate": 7.935779816513763e-06, "loss": 0.7911, "step": 174 }, { "epoch": 0.20083201836178455, "grad_norm": 42.25, "learning_rate": 7.981651376146789e-06, "loss": 0.8686, "step": 175 }, { "epoch": 0.20197962989528045, "grad_norm": 141.0, "learning_rate": 8.027522935779817e-06, "loss": 1.1276, "step": 176 }, { "epoch": 0.20312724142877636, "grad_norm": 156.0, "learning_rate": 8.073394495412845e-06, "loss": 0.8758, "step": 177 }, { "epoch": 0.20427485296227227, "grad_norm": 32.75, "learning_rate": 8.119266055045872e-06, "loss": 0.6642, "step": 178 }, { "epoch": 0.20542246449576818, "grad_norm": 56.25, "learning_rate": 8.1651376146789e-06, "loss": 1.0594, "step": 179 }, { "epoch": 0.20657007602926408, "grad_norm": 34.5, "learning_rate": 8.211009174311926e-06, "loss": 0.6556, "step": 180 }, { "epoch": 0.20771768756276002, "grad_norm": 80.5, "learning_rate": 8.256880733944956e-06, "loss": 0.8868, "step": 181 }, { "epoch": 0.20886529909625592, "grad_norm": 47.5, "learning_rate": 8.302752293577982e-06, "loss": 0.7725, "step": 182 }, { "epoch": 0.21001291062975183, "grad_norm": 23.0, "learning_rate": 8.34862385321101e-06, "loss": 0.6719, "step": 183 }, { "epoch": 0.21116052216324774, "grad_norm": 72.0, "learning_rate": 8.394495412844037e-06, "loss": 0.8492, "step": 184 }, { "epoch": 0.21230813369674364, "grad_norm": 73.0, "learning_rate": 8.440366972477065e-06, "loss": 0.7163, "step": 185 }, { "epoch": 0.21345574523023955, "grad_norm": 82.0, "learning_rate": 8.486238532110093e-06, "loss": 0.7227, "step": 186 }, { "epoch": 0.2146033567637355, "grad_norm": 48.75, "learning_rate": 8.53211009174312e-06, "loss": 0.8237, "step": 187 }, { "epoch": 0.2157509682972314, "grad_norm": 30.75, "learning_rate": 8.577981651376147e-06, "loss": 0.7007, "step": 188 }, { "epoch": 0.2168985798307273, "grad_norm": 76.0, "learning_rate": 8.623853211009175e-06, "loss": 0.7568, "step": 189 }, { "epoch": 0.2180461913642232, "grad_norm": 126.0, "learning_rate": 8.669724770642203e-06, "loss": 0.8139, "step": 190 }, { "epoch": 0.21919380289771911, "grad_norm": 67.5, "learning_rate": 8.71559633027523e-06, "loss": 0.7062, "step": 191 }, { "epoch": 0.22034141443121502, "grad_norm": 26.375, "learning_rate": 8.761467889908258e-06, "loss": 0.5425, "step": 192 }, { "epoch": 0.22148902596471096, "grad_norm": 105.5, "learning_rate": 8.807339449541286e-06, "loss": 0.8822, "step": 193 }, { "epoch": 0.22263663749820686, "grad_norm": 131.0, "learning_rate": 8.853211009174312e-06, "loss": 0.9047, "step": 194 }, { "epoch": 0.22378424903170277, "grad_norm": 56.5, "learning_rate": 8.89908256880734e-06, "loss": 0.5039, "step": 195 }, { "epoch": 0.22493186056519868, "grad_norm": 73.5, "learning_rate": 8.944954128440367e-06, "loss": 0.7597, "step": 196 }, { "epoch": 0.22607947209869458, "grad_norm": 56.25, "learning_rate": 8.990825688073395e-06, "loss": 0.742, "step": 197 }, { "epoch": 0.2272270836321905, "grad_norm": 71.5, "learning_rate": 9.036697247706423e-06, "loss": 0.892, "step": 198 }, { "epoch": 0.22837469516568643, "grad_norm": 33.0, "learning_rate": 9.08256880733945e-06, "loss": 0.6746, "step": 199 }, { "epoch": 0.22952230669918233, "grad_norm": 95.0, "learning_rate": 9.128440366972477e-06, "loss": 0.8428, "step": 200 }, { "epoch": 0.22952230669918233, "eval_accuracy": 0.23, "eval_loss": 0.7526699900627136, "eval_runtime": 49.2923, "eval_samples_per_second": 2.029, "eval_steps_per_second": 2.029, "step": 200 }, { "epoch": 0.23066991823267824, "grad_norm": 43.0, "learning_rate": 9.174311926605506e-06, "loss": 0.6504, "step": 201 }, { "epoch": 0.23181752976617415, "grad_norm": 46.75, "learning_rate": 9.220183486238534e-06, "loss": 0.7568, "step": 202 }, { "epoch": 0.23296514129967005, "grad_norm": 76.5, "learning_rate": 9.26605504587156e-06, "loss": 0.5601, "step": 203 }, { "epoch": 0.23411275283316596, "grad_norm": 82.0, "learning_rate": 9.311926605504588e-06, "loss": 0.6661, "step": 204 }, { "epoch": 0.2352603643666619, "grad_norm": 63.75, "learning_rate": 9.357798165137616e-06, "loss": 0.7619, "step": 205 }, { "epoch": 0.2364079759001578, "grad_norm": 28.5, "learning_rate": 9.403669724770643e-06, "loss": 0.6332, "step": 206 }, { "epoch": 0.2375555874336537, "grad_norm": 48.75, "learning_rate": 9.44954128440367e-06, "loss": 0.8103, "step": 207 }, { "epoch": 0.23870319896714962, "grad_norm": 32.25, "learning_rate": 9.495412844036697e-06, "loss": 0.8623, "step": 208 }, { "epoch": 0.23985081050064552, "grad_norm": 51.25, "learning_rate": 9.541284403669727e-06, "loss": 0.6734, "step": 209 }, { "epoch": 0.24099842203414143, "grad_norm": 106.0, "learning_rate": 9.587155963302753e-06, "loss": 0.7637, "step": 210 }, { "epoch": 0.24214603356763736, "grad_norm": 43.5, "learning_rate": 9.633027522935781e-06, "loss": 0.6827, "step": 211 }, { "epoch": 0.24329364510113327, "grad_norm": 56.25, "learning_rate": 9.678899082568808e-06, "loss": 0.9193, "step": 212 }, { "epoch": 0.24444125663462918, "grad_norm": 67.5, "learning_rate": 9.724770642201836e-06, "loss": 0.8784, "step": 213 }, { "epoch": 0.24558886816812509, "grad_norm": 61.0, "learning_rate": 9.770642201834864e-06, "loss": 0.6853, "step": 214 }, { "epoch": 0.246736479701621, "grad_norm": 33.5, "learning_rate": 9.81651376146789e-06, "loss": 0.6893, "step": 215 }, { "epoch": 0.24788409123511693, "grad_norm": 20.5, "learning_rate": 9.862385321100918e-06, "loss": 0.6858, "step": 216 }, { "epoch": 0.24903170276861283, "grad_norm": 51.0, "learning_rate": 9.908256880733946e-06, "loss": 0.5894, "step": 217 }, { "epoch": 0.2501793143021087, "grad_norm": 61.0, "learning_rate": 9.954128440366973e-06, "loss": 0.9096, "step": 218 }, { "epoch": 0.2513269258356047, "grad_norm": 28.625, "learning_rate": 1e-05, "loss": 0.801, "step": 219 }, { "epoch": 0.2524745373691006, "grad_norm": 41.0, "learning_rate": 1.0045871559633029e-05, "loss": 0.6585, "step": 220 }, { "epoch": 0.2536221489025965, "grad_norm": 39.75, "learning_rate": 1.0091743119266055e-05, "loss": 0.7587, "step": 221 }, { "epoch": 0.2547697604360924, "grad_norm": 40.0, "learning_rate": 1.0137614678899083e-05, "loss": 0.7094, "step": 222 }, { "epoch": 0.2559173719695883, "grad_norm": 684.0, "learning_rate": 1.018348623853211e-05, "loss": 0.7388, "step": 223 }, { "epoch": 0.2570649835030842, "grad_norm": 54.5, "learning_rate": 1.0229357798165138e-05, "loss": 0.7495, "step": 224 }, { "epoch": 0.2582125950365801, "grad_norm": 65.5, "learning_rate": 1.0275229357798166e-05, "loss": 0.834, "step": 225 }, { "epoch": 0.259360206570076, "grad_norm": 68.5, "learning_rate": 1.0321100917431192e-05, "loss": 0.9911, "step": 226 }, { "epoch": 0.26050781810357193, "grad_norm": 59.75, "learning_rate": 1.036697247706422e-05, "loss": 0.7996, "step": 227 }, { "epoch": 0.26165542963706784, "grad_norm": 39.0, "learning_rate": 1.041284403669725e-05, "loss": 0.7586, "step": 228 }, { "epoch": 0.26280304117056374, "grad_norm": 41.0, "learning_rate": 1.0458715596330275e-05, "loss": 0.6575, "step": 229 }, { "epoch": 0.26395065270405965, "grad_norm": 25.75, "learning_rate": 1.0504587155963305e-05, "loss": 0.5676, "step": 230 }, { "epoch": 0.2650982642375556, "grad_norm": 38.5, "learning_rate": 1.055045871559633e-05, "loss": 0.7107, "step": 231 }, { "epoch": 0.2662458757710515, "grad_norm": 29.0, "learning_rate": 1.0596330275229359e-05, "loss": 0.5768, "step": 232 }, { "epoch": 0.26739348730454743, "grad_norm": 67.0, "learning_rate": 1.0642201834862387e-05, "loss": 0.8002, "step": 233 }, { "epoch": 0.26854109883804334, "grad_norm": 92.0, "learning_rate": 1.0688073394495414e-05, "loss": 0.9373, "step": 234 }, { "epoch": 0.26968871037153924, "grad_norm": 95.0, "learning_rate": 1.0733944954128442e-05, "loss": 0.9883, "step": 235 }, { "epoch": 0.27083632190503515, "grad_norm": 32.25, "learning_rate": 1.077981651376147e-05, "loss": 0.3327, "step": 236 }, { "epoch": 0.27198393343853106, "grad_norm": 38.75, "learning_rate": 1.0825688073394496e-05, "loss": 0.9128, "step": 237 }, { "epoch": 0.27313154497202696, "grad_norm": 113.5, "learning_rate": 1.0871559633027524e-05, "loss": 0.7185, "step": 238 }, { "epoch": 0.27427915650552287, "grad_norm": 78.5, "learning_rate": 1.091743119266055e-05, "loss": 0.7406, "step": 239 }, { "epoch": 0.2754267680390188, "grad_norm": 54.25, "learning_rate": 1.0963302752293579e-05, "loss": 0.5355, "step": 240 }, { "epoch": 0.2765743795725147, "grad_norm": 88.0, "learning_rate": 1.1009174311926607e-05, "loss": 0.7876, "step": 241 }, { "epoch": 0.2777219911060106, "grad_norm": 25.125, "learning_rate": 1.1055045871559633e-05, "loss": 0.7005, "step": 242 }, { "epoch": 0.27886960263950655, "grad_norm": 62.0, "learning_rate": 1.1100917431192661e-05, "loss": 0.6772, "step": 243 }, { "epoch": 0.28001721417300246, "grad_norm": 88.5, "learning_rate": 1.114678899082569e-05, "loss": 0.7296, "step": 244 }, { "epoch": 0.28116482570649837, "grad_norm": 29.375, "learning_rate": 1.1192660550458716e-05, "loss": 0.7339, "step": 245 }, { "epoch": 0.2823124372399943, "grad_norm": 21.75, "learning_rate": 1.1238532110091744e-05, "loss": 0.5743, "step": 246 }, { "epoch": 0.2834600487734902, "grad_norm": 127.5, "learning_rate": 1.128440366972477e-05, "loss": 0.9532, "step": 247 }, { "epoch": 0.2846076603069861, "grad_norm": 97.0, "learning_rate": 1.1330275229357798e-05, "loss": 0.9855, "step": 248 }, { "epoch": 0.285755271840482, "grad_norm": 54.25, "learning_rate": 1.1376146788990828e-05, "loss": 0.6011, "step": 249 }, { "epoch": 0.2869028833739779, "grad_norm": 27.125, "learning_rate": 1.1422018348623853e-05, "loss": 0.4934, "step": 250 }, { "epoch": 0.2880504949074738, "grad_norm": 156.0, "learning_rate": 1.1467889908256882e-05, "loss": 1.0312, "step": 251 }, { "epoch": 0.2891981064409697, "grad_norm": 31.5, "learning_rate": 1.151376146788991e-05, "loss": 0.6735, "step": 252 }, { "epoch": 0.2903457179744656, "grad_norm": 26.0, "learning_rate": 1.1559633027522937e-05, "loss": 0.5176, "step": 253 }, { "epoch": 0.29149332950796153, "grad_norm": 28.0, "learning_rate": 1.1605504587155965e-05, "loss": 0.7067, "step": 254 }, { "epoch": 0.2926409410414575, "grad_norm": 50.75, "learning_rate": 1.1651376146788991e-05, "loss": 0.5816, "step": 255 }, { "epoch": 0.2937885525749534, "grad_norm": 33.0, "learning_rate": 1.169724770642202e-05, "loss": 0.5099, "step": 256 }, { "epoch": 0.2949361641084493, "grad_norm": 63.25, "learning_rate": 1.1743119266055047e-05, "loss": 0.6038, "step": 257 }, { "epoch": 0.2960837756419452, "grad_norm": 152.0, "learning_rate": 1.1788990825688074e-05, "loss": 1.2612, "step": 258 }, { "epoch": 0.2972313871754411, "grad_norm": 55.5, "learning_rate": 1.1834862385321102e-05, "loss": 0.8309, "step": 259 }, { "epoch": 0.298378998708937, "grad_norm": 49.75, "learning_rate": 1.188073394495413e-05, "loss": 0.7434, "step": 260 }, { "epoch": 0.29952661024243293, "grad_norm": 38.25, "learning_rate": 1.1926605504587156e-05, "loss": 0.6988, "step": 261 }, { "epoch": 0.30067422177592884, "grad_norm": 31.25, "learning_rate": 1.1972477064220184e-05, "loss": 0.674, "step": 262 }, { "epoch": 0.30182183330942475, "grad_norm": 61.25, "learning_rate": 1.2018348623853211e-05, "loss": 0.8105, "step": 263 }, { "epoch": 0.30296944484292065, "grad_norm": 67.0, "learning_rate": 1.2064220183486239e-05, "loss": 0.7834, "step": 264 }, { "epoch": 0.30411705637641656, "grad_norm": 34.0, "learning_rate": 1.2110091743119267e-05, "loss": 0.6694, "step": 265 }, { "epoch": 0.30526466790991247, "grad_norm": 48.75, "learning_rate": 1.2155963302752293e-05, "loss": 0.4389, "step": 266 }, { "epoch": 0.30641227944340843, "grad_norm": 45.0, "learning_rate": 1.2201834862385321e-05, "loss": 0.9619, "step": 267 }, { "epoch": 0.30755989097690434, "grad_norm": 27.25, "learning_rate": 1.2247706422018351e-05, "loss": 0.8181, "step": 268 }, { "epoch": 0.30870750251040024, "grad_norm": 78.5, "learning_rate": 1.2293577981651376e-05, "loss": 0.8289, "step": 269 }, { "epoch": 0.30985511404389615, "grad_norm": 29.625, "learning_rate": 1.2339449541284406e-05, "loss": 0.66, "step": 270 }, { "epoch": 0.31100272557739206, "grad_norm": 51.25, "learning_rate": 1.238532110091743e-05, "loss": 0.6833, "step": 271 }, { "epoch": 0.31215033711088797, "grad_norm": 45.0, "learning_rate": 1.243119266055046e-05, "loss": 0.6545, "step": 272 }, { "epoch": 0.3132979486443839, "grad_norm": 35.5, "learning_rate": 1.2477064220183488e-05, "loss": 0.6642, "step": 273 }, { "epoch": 0.3144455601778798, "grad_norm": 27.75, "learning_rate": 1.2522935779816515e-05, "loss": 0.7786, "step": 274 }, { "epoch": 0.3155931717113757, "grad_norm": 103.0, "learning_rate": 1.2568807339449543e-05, "loss": 0.9578, "step": 275 }, { "epoch": 0.3167407832448716, "grad_norm": 61.75, "learning_rate": 1.261467889908257e-05, "loss": 0.5513, "step": 276 }, { "epoch": 0.3178883947783675, "grad_norm": 86.5, "learning_rate": 1.2660550458715597e-05, "loss": 0.855, "step": 277 }, { "epoch": 0.3190360063118634, "grad_norm": 47.0, "learning_rate": 1.2706422018348625e-05, "loss": 0.7903, "step": 278 }, { "epoch": 0.32018361784535937, "grad_norm": 21.125, "learning_rate": 1.2752293577981652e-05, "loss": 0.6084, "step": 279 }, { "epoch": 0.3213312293788553, "grad_norm": 53.0, "learning_rate": 1.279816513761468e-05, "loss": 0.7655, "step": 280 }, { "epoch": 0.3224788409123512, "grad_norm": 69.0, "learning_rate": 1.2844036697247708e-05, "loss": 0.7763, "step": 281 }, { "epoch": 0.3236264524458471, "grad_norm": 98.0, "learning_rate": 1.2889908256880734e-05, "loss": 0.8355, "step": 282 }, { "epoch": 0.324774063979343, "grad_norm": 65.0, "learning_rate": 1.2935779816513762e-05, "loss": 0.7071, "step": 283 }, { "epoch": 0.3259216755128389, "grad_norm": 25.75, "learning_rate": 1.298165137614679e-05, "loss": 0.8358, "step": 284 }, { "epoch": 0.3270692870463348, "grad_norm": 48.25, "learning_rate": 1.3027522935779817e-05, "loss": 0.7069, "step": 285 }, { "epoch": 0.3282168985798307, "grad_norm": 27.75, "learning_rate": 1.3073394495412845e-05, "loss": 0.601, "step": 286 }, { "epoch": 0.3293645101133266, "grad_norm": 44.25, "learning_rate": 1.3119266055045871e-05, "loss": 0.6844, "step": 287 }, { "epoch": 0.33051212164682253, "grad_norm": 73.0, "learning_rate": 1.31651376146789e-05, "loss": 1.5458, "step": 288 }, { "epoch": 0.33165973318031844, "grad_norm": 36.0, "learning_rate": 1.3211009174311929e-05, "loss": 0.8631, "step": 289 }, { "epoch": 0.3328073447138144, "grad_norm": 60.25, "learning_rate": 1.3256880733944954e-05, "loss": 0.7894, "step": 290 }, { "epoch": 0.3339549562473103, "grad_norm": 46.75, "learning_rate": 1.3302752293577984e-05, "loss": 0.7715, "step": 291 }, { "epoch": 0.3351025677808062, "grad_norm": 17.5, "learning_rate": 1.3348623853211012e-05, "loss": 0.5756, "step": 292 }, { "epoch": 0.3362501793143021, "grad_norm": 32.0, "learning_rate": 1.3394495412844038e-05, "loss": 0.7155, "step": 293 }, { "epoch": 0.33739779084779803, "grad_norm": 96.5, "learning_rate": 1.3440366972477066e-05, "loss": 0.6518, "step": 294 }, { "epoch": 0.33854540238129394, "grad_norm": 37.75, "learning_rate": 1.3486238532110092e-05, "loss": 0.6962, "step": 295 }, { "epoch": 0.33969301391478984, "grad_norm": 57.5, "learning_rate": 1.353211009174312e-05, "loss": 0.5687, "step": 296 }, { "epoch": 0.34084062544828575, "grad_norm": 20.125, "learning_rate": 1.3577981651376149e-05, "loss": 0.7647, "step": 297 }, { "epoch": 0.34198823698178166, "grad_norm": 22.75, "learning_rate": 1.3623853211009175e-05, "loss": 0.7313, "step": 298 }, { "epoch": 0.34313584851527756, "grad_norm": 71.0, "learning_rate": 1.3669724770642203e-05, "loss": 0.8702, "step": 299 }, { "epoch": 0.34428346004877347, "grad_norm": 70.0, "learning_rate": 1.3715596330275231e-05, "loss": 0.7895, "step": 300 }, { "epoch": 0.34428346004877347, "eval_accuracy": 0.22, "eval_loss": 0.6987403631210327, "eval_runtime": 49.3136, "eval_samples_per_second": 2.028, "eval_steps_per_second": 2.028, "step": 300 }, { "epoch": 0.3454310715822694, "grad_norm": 32.0, "learning_rate": 1.3761467889908258e-05, "loss": 0.6857, "step": 301 }, { "epoch": 0.34657868311576534, "grad_norm": 58.25, "learning_rate": 1.3807339449541286e-05, "loss": 0.6662, "step": 302 }, { "epoch": 0.34772629464926125, "grad_norm": 26.875, "learning_rate": 1.3853211009174312e-05, "loss": 0.5594, "step": 303 }, { "epoch": 0.34887390618275715, "grad_norm": 36.5, "learning_rate": 1.389908256880734e-05, "loss": 0.6889, "step": 304 }, { "epoch": 0.35002151771625306, "grad_norm": 49.0, "learning_rate": 1.3944954128440368e-05, "loss": 0.6969, "step": 305 }, { "epoch": 0.35116912924974897, "grad_norm": 173.0, "learning_rate": 1.3990825688073395e-05, "loss": 0.6462, "step": 306 }, { "epoch": 0.3523167407832449, "grad_norm": 60.0, "learning_rate": 1.4036697247706423e-05, "loss": 0.539, "step": 307 }, { "epoch": 0.3534643523167408, "grad_norm": 27.125, "learning_rate": 1.4082568807339452e-05, "loss": 0.855, "step": 308 }, { "epoch": 0.3546119638502367, "grad_norm": 61.5, "learning_rate": 1.4128440366972477e-05, "loss": 0.7295, "step": 309 }, { "epoch": 0.3557595753837326, "grad_norm": 56.25, "learning_rate": 1.4174311926605507e-05, "loss": 0.8013, "step": 310 }, { "epoch": 0.3569071869172285, "grad_norm": 19.375, "learning_rate": 1.4220183486238533e-05, "loss": 0.6061, "step": 311 }, { "epoch": 0.3580547984507244, "grad_norm": 22.75, "learning_rate": 1.4266055045871561e-05, "loss": 0.2982, "step": 312 }, { "epoch": 0.3592024099842203, "grad_norm": 102.0, "learning_rate": 1.431192660550459e-05, "loss": 1.1087, "step": 313 }, { "epoch": 0.3603500215177163, "grad_norm": 42.25, "learning_rate": 1.4357798165137616e-05, "loss": 0.8645, "step": 314 }, { "epoch": 0.3614976330512122, "grad_norm": 112.0, "learning_rate": 1.4403669724770644e-05, "loss": 1.2745, "step": 315 }, { "epoch": 0.3626452445847081, "grad_norm": 21.125, "learning_rate": 1.4449541284403672e-05, "loss": 0.3714, "step": 316 }, { "epoch": 0.363792856118204, "grad_norm": 87.0, "learning_rate": 1.4495412844036698e-05, "loss": 0.9653, "step": 317 }, { "epoch": 0.3649404676516999, "grad_norm": 82.0, "learning_rate": 1.4541284403669726e-05, "loss": 0.982, "step": 318 }, { "epoch": 0.3660880791851958, "grad_norm": 27.375, "learning_rate": 1.4587155963302753e-05, "loss": 0.4173, "step": 319 }, { "epoch": 0.3672356907186917, "grad_norm": 74.0, "learning_rate": 1.463302752293578e-05, "loss": 0.9199, "step": 320 }, { "epoch": 0.3683833022521876, "grad_norm": 57.0, "learning_rate": 1.4678899082568809e-05, "loss": 0.6555, "step": 321 }, { "epoch": 0.36953091378568353, "grad_norm": 40.25, "learning_rate": 1.4724770642201835e-05, "loss": 0.5512, "step": 322 }, { "epoch": 0.37067852531917944, "grad_norm": 51.0, "learning_rate": 1.4770642201834863e-05, "loss": 0.8541, "step": 323 }, { "epoch": 0.37182613685267535, "grad_norm": 112.0, "learning_rate": 1.4816513761467891e-05, "loss": 1.031, "step": 324 }, { "epoch": 0.37297374838617126, "grad_norm": 17.625, "learning_rate": 1.4862385321100918e-05, "loss": 0.448, "step": 325 }, { "epoch": 0.3741213599196672, "grad_norm": 109.0, "learning_rate": 1.4908256880733946e-05, "loss": 1.0731, "step": 326 }, { "epoch": 0.3752689714531631, "grad_norm": 76.0, "learning_rate": 1.4954128440366972e-05, "loss": 0.9293, "step": 327 }, { "epoch": 0.37641658298665903, "grad_norm": 142.0, "learning_rate": 1.5000000000000002e-05, "loss": 1.2647, "step": 328 }, { "epoch": 0.37756419452015494, "grad_norm": 17.0, "learning_rate": 1.504587155963303e-05, "loss": 0.5858, "step": 329 }, { "epoch": 0.37871180605365085, "grad_norm": 63.5, "learning_rate": 1.5091743119266057e-05, "loss": 1.1073, "step": 330 }, { "epoch": 0.37985941758714675, "grad_norm": 23.125, "learning_rate": 1.5137614678899085e-05, "loss": 0.6616, "step": 331 }, { "epoch": 0.38100702912064266, "grad_norm": 23.5, "learning_rate": 1.5183486238532111e-05, "loss": 0.81, "step": 332 }, { "epoch": 0.38215464065413857, "grad_norm": 44.5, "learning_rate": 1.5229357798165139e-05, "loss": 0.7774, "step": 333 }, { "epoch": 0.3833022521876345, "grad_norm": 53.0, "learning_rate": 1.5275229357798167e-05, "loss": 0.7527, "step": 334 }, { "epoch": 0.3844498637211304, "grad_norm": 26.0, "learning_rate": 1.5321100917431192e-05, "loss": 0.5953, "step": 335 }, { "epoch": 0.3855974752546263, "grad_norm": 81.5, "learning_rate": 1.536697247706422e-05, "loss": 1.1549, "step": 336 }, { "epoch": 0.3867450867881222, "grad_norm": 40.25, "learning_rate": 1.541284403669725e-05, "loss": 0.6953, "step": 337 }, { "epoch": 0.38789269832161816, "grad_norm": 59.5, "learning_rate": 1.5458715596330276e-05, "loss": 0.9157, "step": 338 }, { "epoch": 0.38904030985511406, "grad_norm": 37.25, "learning_rate": 1.5504587155963304e-05, "loss": 0.6101, "step": 339 }, { "epoch": 0.39018792138860997, "grad_norm": 47.75, "learning_rate": 1.555045871559633e-05, "loss": 0.6971, "step": 340 }, { "epoch": 0.3913355329221059, "grad_norm": 35.5, "learning_rate": 1.559633027522936e-05, "loss": 0.6038, "step": 341 }, { "epoch": 0.3924831444556018, "grad_norm": 258.0, "learning_rate": 1.564220183486239e-05, "loss": 0.7838, "step": 342 }, { "epoch": 0.3936307559890977, "grad_norm": 19.625, "learning_rate": 1.5688073394495413e-05, "loss": 0.6458, "step": 343 }, { "epoch": 0.3947783675225936, "grad_norm": 78.5, "learning_rate": 1.573394495412844e-05, "loss": 0.8405, "step": 344 }, { "epoch": 0.3959259790560895, "grad_norm": 118.5, "learning_rate": 1.577981651376147e-05, "loss": 1.0364, "step": 345 }, { "epoch": 0.3970735905895854, "grad_norm": 30.0, "learning_rate": 1.5825688073394497e-05, "loss": 0.5703, "step": 346 }, { "epoch": 0.3982212021230813, "grad_norm": 60.75, "learning_rate": 1.5871559633027525e-05, "loss": 0.8595, "step": 347 }, { "epoch": 0.3993688136565772, "grad_norm": 78.5, "learning_rate": 1.591743119266055e-05, "loss": 0.8161, "step": 348 }, { "epoch": 0.40051642519007313, "grad_norm": 33.75, "learning_rate": 1.5963302752293578e-05, "loss": 0.7062, "step": 349 }, { "epoch": 0.4016640367235691, "grad_norm": 20.75, "learning_rate": 1.6009174311926606e-05, "loss": 0.825, "step": 350 }, { "epoch": 0.402811648257065, "grad_norm": 20.375, "learning_rate": 1.6055045871559634e-05, "loss": 0.5635, "step": 351 }, { "epoch": 0.4039592597905609, "grad_norm": 26.0, "learning_rate": 1.6100917431192662e-05, "loss": 0.7392, "step": 352 }, { "epoch": 0.4051068713240568, "grad_norm": 39.75, "learning_rate": 1.614678899082569e-05, "loss": 0.6261, "step": 353 }, { "epoch": 0.4062544828575527, "grad_norm": 40.0, "learning_rate": 1.6192660550458715e-05, "loss": 0.6046, "step": 354 }, { "epoch": 0.40740209439104863, "grad_norm": 106.0, "learning_rate": 1.6238532110091743e-05, "loss": 0.9682, "step": 355 }, { "epoch": 0.40854970592454454, "grad_norm": 51.25, "learning_rate": 1.628440366972477e-05, "loss": 0.7811, "step": 356 }, { "epoch": 0.40969731745804044, "grad_norm": 38.0, "learning_rate": 1.63302752293578e-05, "loss": 0.8129, "step": 357 }, { "epoch": 0.41084492899153635, "grad_norm": 25.375, "learning_rate": 1.6376146788990827e-05, "loss": 0.5273, "step": 358 }, { "epoch": 0.41199254052503226, "grad_norm": 34.0, "learning_rate": 1.6422018348623852e-05, "loss": 0.5413, "step": 359 }, { "epoch": 0.41314015205852817, "grad_norm": 19.25, "learning_rate": 1.6467889908256884e-05, "loss": 0.5278, "step": 360 }, { "epoch": 0.4142877635920241, "grad_norm": 24.0, "learning_rate": 1.6513761467889912e-05, "loss": 0.6991, "step": 361 }, { "epoch": 0.41543537512552003, "grad_norm": 180.0, "learning_rate": 1.6559633027522936e-05, "loss": 0.7462, "step": 362 }, { "epoch": 0.41658298665901594, "grad_norm": 27.125, "learning_rate": 1.6605504587155964e-05, "loss": 0.7057, "step": 363 }, { "epoch": 0.41773059819251185, "grad_norm": 75.0, "learning_rate": 1.6651376146788993e-05, "loss": 0.8487, "step": 364 }, { "epoch": 0.41887820972600776, "grad_norm": 52.5, "learning_rate": 1.669724770642202e-05, "loss": 0.83, "step": 365 }, { "epoch": 0.42002582125950366, "grad_norm": 18.5, "learning_rate": 1.674311926605505e-05, "loss": 0.4915, "step": 366 }, { "epoch": 0.42117343279299957, "grad_norm": 76.0, "learning_rate": 1.6788990825688073e-05, "loss": 0.6684, "step": 367 }, { "epoch": 0.4223210443264955, "grad_norm": 21.625, "learning_rate": 1.68348623853211e-05, "loss": 0.5262, "step": 368 }, { "epoch": 0.4234686558599914, "grad_norm": 13.375, "learning_rate": 1.688073394495413e-05, "loss": 0.5778, "step": 369 }, { "epoch": 0.4246162673934873, "grad_norm": 20.375, "learning_rate": 1.6926605504587158e-05, "loss": 0.5191, "step": 370 }, { "epoch": 0.4257638789269832, "grad_norm": 91.5, "learning_rate": 1.6972477064220186e-05, "loss": 1.0079, "step": 371 }, { "epoch": 0.4269114904604791, "grad_norm": 55.25, "learning_rate": 1.701834862385321e-05, "loss": 0.6378, "step": 372 }, { "epoch": 0.42805910199397507, "grad_norm": 39.5, "learning_rate": 1.706422018348624e-05, "loss": 0.836, "step": 373 }, { "epoch": 0.429206713527471, "grad_norm": 42.75, "learning_rate": 1.7110091743119267e-05, "loss": 0.5683, "step": 374 }, { "epoch": 0.4303543250609669, "grad_norm": 60.25, "learning_rate": 1.7155963302752295e-05, "loss": 0.4543, "step": 375 }, { "epoch": 0.4315019365944628, "grad_norm": 21.375, "learning_rate": 1.7201834862385323e-05, "loss": 0.5242, "step": 376 }, { "epoch": 0.4326495481279587, "grad_norm": 17.625, "learning_rate": 1.724770642201835e-05, "loss": 0.6393, "step": 377 }, { "epoch": 0.4337971596614546, "grad_norm": 21.875, "learning_rate": 1.7293577981651376e-05, "loss": 0.5476, "step": 378 }, { "epoch": 0.4349447711949505, "grad_norm": 56.25, "learning_rate": 1.7339449541284407e-05, "loss": 0.7973, "step": 379 }, { "epoch": 0.4360923827284464, "grad_norm": 80.0, "learning_rate": 1.738532110091743e-05, "loss": 0.8487, "step": 380 }, { "epoch": 0.4372399942619423, "grad_norm": 46.5, "learning_rate": 1.743119266055046e-05, "loss": 0.8605, "step": 381 }, { "epoch": 0.43838760579543823, "grad_norm": 65.0, "learning_rate": 1.7477064220183488e-05, "loss": 0.8858, "step": 382 }, { "epoch": 0.43953521732893414, "grad_norm": 87.0, "learning_rate": 1.7522935779816516e-05, "loss": 0.7342, "step": 383 }, { "epoch": 0.44068282886243004, "grad_norm": 108.5, "learning_rate": 1.7568807339449544e-05, "loss": 0.8372, "step": 384 }, { "epoch": 0.441830440395926, "grad_norm": 38.5, "learning_rate": 1.7614678899082572e-05, "loss": 1.0963, "step": 385 }, { "epoch": 0.4429780519294219, "grad_norm": 21.875, "learning_rate": 1.7660550458715597e-05, "loss": 0.737, "step": 386 }, { "epoch": 0.4441256634629178, "grad_norm": 34.25, "learning_rate": 1.7706422018348625e-05, "loss": 0.7902, "step": 387 }, { "epoch": 0.4452732749964137, "grad_norm": 116.0, "learning_rate": 1.7752293577981653e-05, "loss": 0.875, "step": 388 }, { "epoch": 0.44642088652990963, "grad_norm": 66.5, "learning_rate": 1.779816513761468e-05, "loss": 0.9535, "step": 389 }, { "epoch": 0.44756849806340554, "grad_norm": 46.25, "learning_rate": 1.784403669724771e-05, "loss": 0.7879, "step": 390 }, { "epoch": 0.44871610959690145, "grad_norm": 48.75, "learning_rate": 1.7889908256880734e-05, "loss": 0.6081, "step": 391 }, { "epoch": 0.44986372113039735, "grad_norm": 32.5, "learning_rate": 1.7935779816513762e-05, "loss": 0.6908, "step": 392 }, { "epoch": 0.45101133266389326, "grad_norm": 34.0, "learning_rate": 1.798165137614679e-05, "loss": 0.6664, "step": 393 }, { "epoch": 0.45215894419738917, "grad_norm": 74.5, "learning_rate": 1.8027522935779818e-05, "loss": 0.6012, "step": 394 }, { "epoch": 0.4533065557308851, "grad_norm": 33.25, "learning_rate": 1.8073394495412846e-05, "loss": 0.6278, "step": 395 }, { "epoch": 0.454454167264381, "grad_norm": 19.25, "learning_rate": 1.811926605504587e-05, "loss": 0.6279, "step": 396 }, { "epoch": 0.45560177879787694, "grad_norm": 33.5, "learning_rate": 1.81651376146789e-05, "loss": 0.689, "step": 397 }, { "epoch": 0.45674939033137285, "grad_norm": 34.25, "learning_rate": 1.821100917431193e-05, "loss": 0.4764, "step": 398 }, { "epoch": 0.45789700186486876, "grad_norm": 144.0, "learning_rate": 1.8256880733944955e-05, "loss": 1.3598, "step": 399 }, { "epoch": 0.45904461339836466, "grad_norm": 105.5, "learning_rate": 1.8302752293577983e-05, "loss": 0.9441, "step": 400 }, { "epoch": 0.45904461339836466, "eval_accuracy": 0.37, "eval_loss": 0.7958357334136963, "eval_runtime": 49.9294, "eval_samples_per_second": 2.003, "eval_steps_per_second": 2.003, "step": 400 }, { "epoch": 0.46019222493186057, "grad_norm": 125.5, "learning_rate": 1.834862385321101e-05, "loss": 1.1444, "step": 401 }, { "epoch": 0.4613398364653565, "grad_norm": 67.0, "learning_rate": 1.839449541284404e-05, "loss": 0.694, "step": 402 }, { "epoch": 0.4624874479988524, "grad_norm": 48.75, "learning_rate": 1.8440366972477067e-05, "loss": 0.7125, "step": 403 }, { "epoch": 0.4636350595323483, "grad_norm": 60.5, "learning_rate": 1.8486238532110092e-05, "loss": 0.6703, "step": 404 }, { "epoch": 0.4647826710658442, "grad_norm": 51.0, "learning_rate": 1.853211009174312e-05, "loss": 0.6631, "step": 405 }, { "epoch": 0.4659302825993401, "grad_norm": 51.75, "learning_rate": 1.8577981651376148e-05, "loss": 0.7814, "step": 406 }, { "epoch": 0.467077894132836, "grad_norm": 22.875, "learning_rate": 1.8623853211009176e-05, "loss": 0.5642, "step": 407 }, { "epoch": 0.4682255056663319, "grad_norm": 105.0, "learning_rate": 1.8669724770642204e-05, "loss": 0.884, "step": 408 }, { "epoch": 0.4693731171998279, "grad_norm": 48.5, "learning_rate": 1.8715596330275232e-05, "loss": 0.5543, "step": 409 }, { "epoch": 0.4705207287333238, "grad_norm": 26.875, "learning_rate": 1.8761467889908257e-05, "loss": 0.6461, "step": 410 }, { "epoch": 0.4716683402668197, "grad_norm": 23.125, "learning_rate": 1.8807339449541285e-05, "loss": 0.5786, "step": 411 }, { "epoch": 0.4728159518003156, "grad_norm": 37.75, "learning_rate": 1.8853211009174313e-05, "loss": 0.6921, "step": 412 }, { "epoch": 0.4739635633338115, "grad_norm": 23.625, "learning_rate": 1.889908256880734e-05, "loss": 0.4189, "step": 413 }, { "epoch": 0.4751111748673074, "grad_norm": 108.5, "learning_rate": 1.894495412844037e-05, "loss": 1.0126, "step": 414 }, { "epoch": 0.4762587864008033, "grad_norm": 138.0, "learning_rate": 1.8990825688073394e-05, "loss": 1.2399, "step": 415 }, { "epoch": 0.47740639793429923, "grad_norm": 31.375, "learning_rate": 1.9036697247706422e-05, "loss": 0.4347, "step": 416 }, { "epoch": 0.47855400946779514, "grad_norm": 120.5, "learning_rate": 1.9082568807339454e-05, "loss": 1.1874, "step": 417 }, { "epoch": 0.47970162100129105, "grad_norm": 25.5, "learning_rate": 1.912844036697248e-05, "loss": 0.6172, "step": 418 }, { "epoch": 0.48084923253478695, "grad_norm": 29.0, "learning_rate": 1.9174311926605506e-05, "loss": 0.7072, "step": 419 }, { "epoch": 0.48199684406828286, "grad_norm": 40.75, "learning_rate": 1.9220183486238534e-05, "loss": 0.8408, "step": 420 }, { "epoch": 0.4831444556017788, "grad_norm": 17.5, "learning_rate": 1.9266055045871563e-05, "loss": 0.6384, "step": 421 }, { "epoch": 0.48429206713527473, "grad_norm": 26.375, "learning_rate": 1.931192660550459e-05, "loss": 0.7132, "step": 422 }, { "epoch": 0.48543967866877064, "grad_norm": 41.0, "learning_rate": 1.9357798165137615e-05, "loss": 0.6823, "step": 423 }, { "epoch": 0.48658729020226654, "grad_norm": 36.0, "learning_rate": 1.9403669724770643e-05, "loss": 0.5629, "step": 424 }, { "epoch": 0.48773490173576245, "grad_norm": 93.5, "learning_rate": 1.944954128440367e-05, "loss": 1.028, "step": 425 }, { "epoch": 0.48888251326925836, "grad_norm": 70.5, "learning_rate": 1.94954128440367e-05, "loss": 0.7085, "step": 426 }, { "epoch": 0.49003012480275426, "grad_norm": 17.875, "learning_rate": 1.9541284403669728e-05, "loss": 0.5299, "step": 427 }, { "epoch": 0.49117773633625017, "grad_norm": 58.75, "learning_rate": 1.9587155963302752e-05, "loss": 0.9028, "step": 428 }, { "epoch": 0.4923253478697461, "grad_norm": 45.25, "learning_rate": 1.963302752293578e-05, "loss": 0.8021, "step": 429 }, { "epoch": 0.493472959403242, "grad_norm": 69.0, "learning_rate": 1.967889908256881e-05, "loss": 0.696, "step": 430 }, { "epoch": 0.4946205709367379, "grad_norm": 44.0, "learning_rate": 1.9724770642201837e-05, "loss": 0.5913, "step": 431 }, { "epoch": 0.49576818247023385, "grad_norm": 71.5, "learning_rate": 1.9770642201834865e-05, "loss": 0.8661, "step": 432 }, { "epoch": 0.49691579400372976, "grad_norm": 80.5, "learning_rate": 1.9816513761467893e-05, "loss": 1.109, "step": 433 }, { "epoch": 0.49806340553722567, "grad_norm": 86.0, "learning_rate": 1.9862385321100917e-05, "loss": 1.0316, "step": 434 }, { "epoch": 0.4992110170707216, "grad_norm": 30.375, "learning_rate": 1.9908256880733945e-05, "loss": 0.7336, "step": 435 }, { "epoch": 0.5003586286042174, "grad_norm": 42.75, "learning_rate": 1.9954128440366974e-05, "loss": 0.7081, "step": 436 }, { "epoch": 0.5015062401377134, "grad_norm": 20.75, "learning_rate": 2e-05, "loss": 0.5407, "step": 437 }, { "epoch": 0.5026538516712094, "grad_norm": 117.0, "learning_rate": 1.999490316004078e-05, "loss": 1.1049, "step": 438 }, { "epoch": 0.5038014632047052, "grad_norm": 126.5, "learning_rate": 1.998980632008155e-05, "loss": 1.1828, "step": 439 }, { "epoch": 0.5049490747382012, "grad_norm": 120.5, "learning_rate": 1.9984709480122327e-05, "loss": 1.3274, "step": 440 }, { "epoch": 0.506096686271697, "grad_norm": 80.0, "learning_rate": 1.99796126401631e-05, "loss": 0.9327, "step": 441 }, { "epoch": 0.507244297805193, "grad_norm": 105.0, "learning_rate": 1.9974515800203875e-05, "loss": 1.1003, "step": 442 }, { "epoch": 0.5083919093386888, "grad_norm": 106.5, "learning_rate": 1.9969418960244652e-05, "loss": 1.0261, "step": 443 }, { "epoch": 0.5095395208721848, "grad_norm": 36.5, "learning_rate": 1.9964322120285426e-05, "loss": 0.695, "step": 444 }, { "epoch": 0.5106871324056806, "grad_norm": 15.8125, "learning_rate": 1.99592252803262e-05, "loss": 0.6983, "step": 445 }, { "epoch": 0.5118347439391766, "grad_norm": 53.5, "learning_rate": 1.9954128440366974e-05, "loss": 0.7186, "step": 446 }, { "epoch": 0.5129823554726725, "grad_norm": 12.25, "learning_rate": 1.9949031600407747e-05, "loss": 0.6272, "step": 447 }, { "epoch": 0.5141299670061684, "grad_norm": 16.125, "learning_rate": 1.9943934760448525e-05, "loss": 0.5849, "step": 448 }, { "epoch": 0.5152775785396643, "grad_norm": 36.25, "learning_rate": 1.99388379204893e-05, "loss": 0.5905, "step": 449 }, { "epoch": 0.5164251900731602, "grad_norm": 24.25, "learning_rate": 1.9933741080530073e-05, "loss": 0.4751, "step": 450 }, { "epoch": 0.5175728016066562, "grad_norm": 15.9375, "learning_rate": 1.9928644240570846e-05, "loss": 0.4372, "step": 451 }, { "epoch": 0.518720413140152, "grad_norm": 18.375, "learning_rate": 1.9923547400611624e-05, "loss": 0.6552, "step": 452 }, { "epoch": 0.519868024673648, "grad_norm": 13.6875, "learning_rate": 1.9918450560652398e-05, "loss": 0.6515, "step": 453 }, { "epoch": 0.5210156362071439, "grad_norm": 13.6875, "learning_rate": 1.991335372069317e-05, "loss": 0.5219, "step": 454 }, { "epoch": 0.5221632477406398, "grad_norm": 70.0, "learning_rate": 1.9908256880733945e-05, "loss": 0.694, "step": 455 }, { "epoch": 0.5233108592741357, "grad_norm": 47.75, "learning_rate": 1.990316004077472e-05, "loss": 1.0051, "step": 456 }, { "epoch": 0.5244584708076316, "grad_norm": 98.5, "learning_rate": 1.9898063200815497e-05, "loss": 0.8809, "step": 457 }, { "epoch": 0.5256060823411275, "grad_norm": 22.875, "learning_rate": 1.989296636085627e-05, "loss": 0.6882, "step": 458 }, { "epoch": 0.5267536938746235, "grad_norm": 103.0, "learning_rate": 1.9887869520897044e-05, "loss": 0.8227, "step": 459 }, { "epoch": 0.5279013054081193, "grad_norm": 41.5, "learning_rate": 1.9882772680937822e-05, "loss": 0.5851, "step": 460 }, { "epoch": 0.5290489169416153, "grad_norm": 16.125, "learning_rate": 1.9877675840978596e-05, "loss": 0.6286, "step": 461 }, { "epoch": 0.5301965284751112, "grad_norm": 40.0, "learning_rate": 1.987257900101937e-05, "loss": 0.3909, "step": 462 }, { "epoch": 0.5313441400086071, "grad_norm": 180.0, "learning_rate": 1.9867482161060147e-05, "loss": 1.4089, "step": 463 }, { "epoch": 0.532491751542103, "grad_norm": 67.5, "learning_rate": 1.9862385321100917e-05, "loss": 0.7342, "step": 464 }, { "epoch": 0.5336393630755989, "grad_norm": 146.0, "learning_rate": 1.9857288481141695e-05, "loss": 1.1024, "step": 465 }, { "epoch": 0.5347869746090949, "grad_norm": 112.0, "learning_rate": 1.985219164118247e-05, "loss": 0.7631, "step": 466 }, { "epoch": 0.5359345861425907, "grad_norm": 51.25, "learning_rate": 1.9847094801223243e-05, "loss": 0.8592, "step": 467 }, { "epoch": 0.5370821976760867, "grad_norm": 53.5, "learning_rate": 1.984199796126402e-05, "loss": 0.772, "step": 468 }, { "epoch": 0.5382298092095825, "grad_norm": 168.0, "learning_rate": 1.9836901121304794e-05, "loss": 1.3238, "step": 469 }, { "epoch": 0.5393774207430785, "grad_norm": 56.25, "learning_rate": 1.9831804281345568e-05, "loss": 0.686, "step": 470 }, { "epoch": 0.5405250322765743, "grad_norm": 140.0, "learning_rate": 1.982670744138634e-05, "loss": 1.3487, "step": 471 }, { "epoch": 0.5416726438100703, "grad_norm": 26.25, "learning_rate": 1.9821610601427115e-05, "loss": 0.654, "step": 472 }, { "epoch": 0.5428202553435661, "grad_norm": 96.0, "learning_rate": 1.9816513761467893e-05, "loss": 0.8932, "step": 473 }, { "epoch": 0.5439678668770621, "grad_norm": 118.5, "learning_rate": 1.9811416921508667e-05, "loss": 0.9886, "step": 474 }, { "epoch": 0.5451154784105581, "grad_norm": 34.75, "learning_rate": 1.980632008154944e-05, "loss": 0.5023, "step": 475 }, { "epoch": 0.5462630899440539, "grad_norm": 28.125, "learning_rate": 1.9801223241590214e-05, "loss": 0.4678, "step": 476 }, { "epoch": 0.5474107014775499, "grad_norm": 27.875, "learning_rate": 1.9796126401630992e-05, "loss": 0.5802, "step": 477 }, { "epoch": 0.5485583130110457, "grad_norm": 9.5625, "learning_rate": 1.9791029561671766e-05, "loss": 0.3969, "step": 478 }, { "epoch": 0.5497059245445417, "grad_norm": 30.875, "learning_rate": 1.978593272171254e-05, "loss": 0.4119, "step": 479 }, { "epoch": 0.5508535360780376, "grad_norm": 17.25, "learning_rate": 1.9780835881753317e-05, "loss": 0.5645, "step": 480 }, { "epoch": 0.5520011476115335, "grad_norm": 47.75, "learning_rate": 1.9775739041794087e-05, "loss": 0.6168, "step": 481 }, { "epoch": 0.5531487591450294, "grad_norm": 14.1875, "learning_rate": 1.9770642201834865e-05, "loss": 0.5325, "step": 482 }, { "epoch": 0.5542963706785253, "grad_norm": 41.25, "learning_rate": 1.976554536187564e-05, "loss": 0.5013, "step": 483 }, { "epoch": 0.5554439822120212, "grad_norm": 18.75, "learning_rate": 1.9760448521916412e-05, "loss": 0.4441, "step": 484 }, { "epoch": 0.5565915937455171, "grad_norm": 147.0, "learning_rate": 1.975535168195719e-05, "loss": 1.585, "step": 485 }, { "epoch": 0.5577392052790131, "grad_norm": 64.0, "learning_rate": 1.9750254841997964e-05, "loss": 0.958, "step": 486 }, { "epoch": 0.558886816812509, "grad_norm": 135.0, "learning_rate": 1.9745158002038738e-05, "loss": 1.4838, "step": 487 }, { "epoch": 0.5600344283460049, "grad_norm": 141.0, "learning_rate": 1.974006116207951e-05, "loss": 1.6651, "step": 488 }, { "epoch": 0.5611820398795008, "grad_norm": 108.0, "learning_rate": 1.9734964322120285e-05, "loss": 0.9729, "step": 489 }, { "epoch": 0.5623296514129967, "grad_norm": 35.25, "learning_rate": 1.9729867482161063e-05, "loss": 0.5966, "step": 490 }, { "epoch": 0.5634772629464926, "grad_norm": 34.5, "learning_rate": 1.9724770642201837e-05, "loss": 0.7337, "step": 491 }, { "epoch": 0.5646248744799885, "grad_norm": 19.875, "learning_rate": 1.971967380224261e-05, "loss": 0.4022, "step": 492 }, { "epoch": 0.5657724860134844, "grad_norm": 29.375, "learning_rate": 1.9714576962283384e-05, "loss": 0.618, "step": 493 }, { "epoch": 0.5669200975469804, "grad_norm": 75.5, "learning_rate": 1.970948012232416e-05, "loss": 0.6627, "step": 494 }, { "epoch": 0.5680677090804762, "grad_norm": 104.5, "learning_rate": 1.9704383282364936e-05, "loss": 0.9524, "step": 495 }, { "epoch": 0.5692153206139722, "grad_norm": 91.0, "learning_rate": 1.969928644240571e-05, "loss": 0.7282, "step": 496 }, { "epoch": 0.570362932147468, "grad_norm": 95.5, "learning_rate": 1.9694189602446487e-05, "loss": 0.9184, "step": 497 }, { "epoch": 0.571510543680964, "grad_norm": 23.625, "learning_rate": 1.9689092762487257e-05, "loss": 0.6252, "step": 498 }, { "epoch": 0.57265815521446, "grad_norm": 55.25, "learning_rate": 1.9683995922528035e-05, "loss": 0.77, "step": 499 }, { "epoch": 0.5738057667479558, "grad_norm": 49.75, "learning_rate": 1.967889908256881e-05, "loss": 0.5024, "step": 500 }, { "epoch": 0.5738057667479558, "eval_accuracy": 0.56, "eval_loss": 0.5818310379981995, "eval_runtime": 49.317, "eval_samples_per_second": 2.028, "eval_steps_per_second": 2.028, "step": 500 }, { "epoch": 0.5749533782814518, "grad_norm": 72.0, "learning_rate": 1.9673802242609582e-05, "loss": 0.5565, "step": 501 }, { "epoch": 0.5761009898149476, "grad_norm": 17.0, "learning_rate": 1.966870540265036e-05, "loss": 0.5465, "step": 502 }, { "epoch": 0.5772486013484436, "grad_norm": 16.5, "learning_rate": 1.9663608562691134e-05, "loss": 0.7208, "step": 503 }, { "epoch": 0.5783962128819394, "grad_norm": 13.75, "learning_rate": 1.9658511722731907e-05, "loss": 0.5784, "step": 504 }, { "epoch": 0.5795438244154354, "grad_norm": 17.875, "learning_rate": 1.9653414882772685e-05, "loss": 0.7433, "step": 505 }, { "epoch": 0.5806914359489312, "grad_norm": 55.5, "learning_rate": 1.9648318042813455e-05, "loss": 0.5593, "step": 506 }, { "epoch": 0.5818390474824272, "grad_norm": 26.0, "learning_rate": 1.9643221202854233e-05, "loss": 0.4981, "step": 507 }, { "epoch": 0.5829866590159231, "grad_norm": 45.5, "learning_rate": 1.9638124362895006e-05, "loss": 0.6998, "step": 508 }, { "epoch": 0.584134270549419, "grad_norm": 64.5, "learning_rate": 1.963302752293578e-05, "loss": 0.331, "step": 509 }, { "epoch": 0.585281882082915, "grad_norm": 15.75, "learning_rate": 1.9627930682976558e-05, "loss": 0.5757, "step": 510 }, { "epoch": 0.5864294936164108, "grad_norm": 78.0, "learning_rate": 1.962283384301733e-05, "loss": 0.5458, "step": 511 }, { "epoch": 0.5875771051499068, "grad_norm": 12.6875, "learning_rate": 1.9617737003058106e-05, "loss": 0.4577, "step": 512 }, { "epoch": 0.5887247166834026, "grad_norm": 94.5, "learning_rate": 1.961264016309888e-05, "loss": 1.0295, "step": 513 }, { "epoch": 0.5898723282168986, "grad_norm": 62.75, "learning_rate": 1.9607543323139657e-05, "loss": 0.6586, "step": 514 }, { "epoch": 0.5910199397503945, "grad_norm": 12.8125, "learning_rate": 1.960244648318043e-05, "loss": 0.4499, "step": 515 }, { "epoch": 0.5921675512838904, "grad_norm": 41.25, "learning_rate": 1.9597349643221205e-05, "loss": 0.6115, "step": 516 }, { "epoch": 0.5933151628173863, "grad_norm": 33.5, "learning_rate": 1.959225280326198e-05, "loss": 0.6823, "step": 517 }, { "epoch": 0.5944627743508822, "grad_norm": 67.5, "learning_rate": 1.9587155963302752e-05, "loss": 0.7254, "step": 518 }, { "epoch": 0.5956103858843781, "grad_norm": 21.75, "learning_rate": 1.958205912334353e-05, "loss": 0.6258, "step": 519 }, { "epoch": 0.596757997417874, "grad_norm": 20.125, "learning_rate": 1.9576962283384304e-05, "loss": 0.782, "step": 520 }, { "epoch": 0.59790560895137, "grad_norm": 55.0, "learning_rate": 1.9571865443425077e-05, "loss": 0.6427, "step": 521 }, { "epoch": 0.5990532204848659, "grad_norm": 21.375, "learning_rate": 1.9566768603465855e-05, "loss": 0.5042, "step": 522 }, { "epoch": 0.6002008320183618, "grad_norm": 84.0, "learning_rate": 1.9561671763506625e-05, "loss": 0.7413, "step": 523 }, { "epoch": 0.6013484435518577, "grad_norm": 32.25, "learning_rate": 1.9556574923547403e-05, "loss": 0.4809, "step": 524 }, { "epoch": 0.6024960550853536, "grad_norm": 29.25, "learning_rate": 1.9551478083588176e-05, "loss": 0.5245, "step": 525 }, { "epoch": 0.6036436666188495, "grad_norm": 115.0, "learning_rate": 1.954638124362895e-05, "loss": 1.0439, "step": 526 }, { "epoch": 0.6047912781523455, "grad_norm": 155.0, "learning_rate": 1.9541284403669728e-05, "loss": 1.6477, "step": 527 }, { "epoch": 0.6059388896858413, "grad_norm": 39.25, "learning_rate": 1.95361875637105e-05, "loss": 0.555, "step": 528 }, { "epoch": 0.6070865012193373, "grad_norm": 23.0, "learning_rate": 1.9531090723751275e-05, "loss": 0.478, "step": 529 }, { "epoch": 0.6082341127528331, "grad_norm": 43.25, "learning_rate": 1.9525993883792053e-05, "loss": 0.6068, "step": 530 }, { "epoch": 0.6093817242863291, "grad_norm": 44.5, "learning_rate": 1.9520897043832823e-05, "loss": 0.248, "step": 531 }, { "epoch": 0.6105293358198249, "grad_norm": 20.75, "learning_rate": 1.95158002038736e-05, "loss": 0.4505, "step": 532 }, { "epoch": 0.6116769473533209, "grad_norm": 41.5, "learning_rate": 1.9510703363914374e-05, "loss": 0.3902, "step": 533 }, { "epoch": 0.6128245588868169, "grad_norm": 38.75, "learning_rate": 1.950560652395515e-05, "loss": 0.5029, "step": 534 }, { "epoch": 0.6139721704203127, "grad_norm": 131.0, "learning_rate": 1.9500509683995926e-05, "loss": 1.2225, "step": 535 }, { "epoch": 0.6151197819538087, "grad_norm": 85.5, "learning_rate": 1.94954128440367e-05, "loss": 0.8337, "step": 536 }, { "epoch": 0.6162673934873045, "grad_norm": 43.25, "learning_rate": 1.9490316004077473e-05, "loss": 0.5878, "step": 537 }, { "epoch": 0.6174150050208005, "grad_norm": 12.875, "learning_rate": 1.9485219164118247e-05, "loss": 0.4961, "step": 538 }, { "epoch": 0.6185626165542963, "grad_norm": 77.0, "learning_rate": 1.9480122324159025e-05, "loss": 0.9027, "step": 539 }, { "epoch": 0.6197102280877923, "grad_norm": 46.5, "learning_rate": 1.94750254841998e-05, "loss": 0.7113, "step": 540 }, { "epoch": 0.6208578396212882, "grad_norm": 77.5, "learning_rate": 1.9469928644240572e-05, "loss": 0.7001, "step": 541 }, { "epoch": 0.6220054511547841, "grad_norm": 68.5, "learning_rate": 1.9464831804281346e-05, "loss": 0.6916, "step": 542 }, { "epoch": 0.62315306268828, "grad_norm": 77.0, "learning_rate": 1.945973496432212e-05, "loss": 0.7548, "step": 543 }, { "epoch": 0.6243006742217759, "grad_norm": 85.0, "learning_rate": 1.9454638124362898e-05, "loss": 0.8164, "step": 544 }, { "epoch": 0.6254482857552719, "grad_norm": 15.0625, "learning_rate": 1.944954128440367e-05, "loss": 0.603, "step": 545 }, { "epoch": 0.6265958972887677, "grad_norm": 23.25, "learning_rate": 1.9444444444444445e-05, "loss": 0.4311, "step": 546 }, { "epoch": 0.6277435088222637, "grad_norm": 10.1875, "learning_rate": 1.9439347604485223e-05, "loss": 0.3436, "step": 547 }, { "epoch": 0.6288911203557596, "grad_norm": 75.0, "learning_rate": 1.9434250764525993e-05, "loss": 0.8814, "step": 548 }, { "epoch": 0.6300387318892555, "grad_norm": 62.0, "learning_rate": 1.942915392456677e-05, "loss": 0.939, "step": 549 }, { "epoch": 0.6311863434227514, "grad_norm": 52.0, "learning_rate": 1.9424057084607544e-05, "loss": 0.411, "step": 550 }, { "epoch": 0.6323339549562473, "grad_norm": 127.5, "learning_rate": 1.9418960244648318e-05, "loss": 1.655, "step": 551 }, { "epoch": 0.6334815664897432, "grad_norm": 96.0, "learning_rate": 1.9413863404689096e-05, "loss": 1.4065, "step": 552 }, { "epoch": 0.6346291780232391, "grad_norm": 52.5, "learning_rate": 1.940876656472987e-05, "loss": 0.7391, "step": 553 }, { "epoch": 0.635776789556735, "grad_norm": 78.0, "learning_rate": 1.9403669724770643e-05, "loss": 0.9576, "step": 554 }, { "epoch": 0.636924401090231, "grad_norm": 91.0, "learning_rate": 1.9398572884811417e-05, "loss": 1.0132, "step": 555 }, { "epoch": 0.6380720126237268, "grad_norm": 14.8125, "learning_rate": 1.9393476044852195e-05, "loss": 0.734, "step": 556 }, { "epoch": 0.6392196241572228, "grad_norm": 63.75, "learning_rate": 1.938837920489297e-05, "loss": 0.6127, "step": 557 }, { "epoch": 0.6403672356907187, "grad_norm": 19.625, "learning_rate": 1.9383282364933742e-05, "loss": 0.5999, "step": 558 }, { "epoch": 0.6415148472242146, "grad_norm": 21.625, "learning_rate": 1.9378185524974516e-05, "loss": 0.7446, "step": 559 }, { "epoch": 0.6426624587577106, "grad_norm": 26.375, "learning_rate": 1.937308868501529e-05, "loss": 0.6067, "step": 560 }, { "epoch": 0.6438100702912064, "grad_norm": 130.0, "learning_rate": 1.9367991845056068e-05, "loss": 1.0849, "step": 561 }, { "epoch": 0.6449576818247024, "grad_norm": 26.875, "learning_rate": 1.936289500509684e-05, "loss": 0.4882, "step": 562 }, { "epoch": 0.6461052933581982, "grad_norm": 13.375, "learning_rate": 1.9357798165137615e-05, "loss": 0.6071, "step": 563 }, { "epoch": 0.6472529048916942, "grad_norm": 23.625, "learning_rate": 1.9352701325178393e-05, "loss": 0.7541, "step": 564 }, { "epoch": 0.64840051642519, "grad_norm": 22.5, "learning_rate": 1.9347604485219163e-05, "loss": 0.6343, "step": 565 }, { "epoch": 0.649548127958686, "grad_norm": 37.75, "learning_rate": 1.934250764525994e-05, "loss": 0.629, "step": 566 }, { "epoch": 0.6506957394921818, "grad_norm": 33.25, "learning_rate": 1.9337410805300714e-05, "loss": 0.6112, "step": 567 }, { "epoch": 0.6518433510256778, "grad_norm": 23.625, "learning_rate": 1.9332313965341488e-05, "loss": 0.6854, "step": 568 }, { "epoch": 0.6529909625591738, "grad_norm": 25.125, "learning_rate": 1.9327217125382266e-05, "loss": 0.5574, "step": 569 }, { "epoch": 0.6541385740926696, "grad_norm": 22.125, "learning_rate": 1.932212028542304e-05, "loss": 0.4604, "step": 570 }, { "epoch": 0.6552861856261656, "grad_norm": 10.625, "learning_rate": 1.9317023445463813e-05, "loss": 0.409, "step": 571 }, { "epoch": 0.6564337971596614, "grad_norm": 11.375, "learning_rate": 1.931192660550459e-05, "loss": 0.4102, "step": 572 }, { "epoch": 0.6575814086931574, "grad_norm": 74.5, "learning_rate": 1.9306829765545365e-05, "loss": 0.5481, "step": 573 }, { "epoch": 0.6587290202266533, "grad_norm": 109.5, "learning_rate": 1.930173292558614e-05, "loss": 0.886, "step": 574 }, { "epoch": 0.6598766317601492, "grad_norm": 48.75, "learning_rate": 1.9296636085626912e-05, "loss": 0.6536, "step": 575 }, { "epoch": 0.6610242432936451, "grad_norm": 57.75, "learning_rate": 1.9291539245667686e-05, "loss": 0.902, "step": 576 }, { "epoch": 0.662171854827141, "grad_norm": 61.5, "learning_rate": 1.9286442405708464e-05, "loss": 0.7151, "step": 577 }, { "epoch": 0.6633194663606369, "grad_norm": 36.25, "learning_rate": 1.9281345565749237e-05, "loss": 0.6232, "step": 578 }, { "epoch": 0.6644670778941328, "grad_norm": 11.6875, "learning_rate": 1.927624872579001e-05, "loss": 0.3918, "step": 579 }, { "epoch": 0.6656146894276288, "grad_norm": 13.9375, "learning_rate": 1.9271151885830785e-05, "loss": 0.7175, "step": 580 }, { "epoch": 0.6667623009611247, "grad_norm": 17.875, "learning_rate": 1.9266055045871563e-05, "loss": 0.7939, "step": 581 }, { "epoch": 0.6679099124946206, "grad_norm": 13.9375, "learning_rate": 1.9260958205912336e-05, "loss": 0.6663, "step": 582 }, { "epoch": 0.6690575240281165, "grad_norm": 17.75, "learning_rate": 1.925586136595311e-05, "loss": 0.512, "step": 583 }, { "epoch": 0.6702051355616124, "grad_norm": 24.625, "learning_rate": 1.9250764525993884e-05, "loss": 0.8056, "step": 584 }, { "epoch": 0.6713527470951083, "grad_norm": 46.75, "learning_rate": 1.9245667686034658e-05, "loss": 0.6661, "step": 585 }, { "epoch": 0.6725003586286042, "grad_norm": 23.5, "learning_rate": 1.9240570846075435e-05, "loss": 0.6705, "step": 586 }, { "epoch": 0.6736479701621001, "grad_norm": 55.5, "learning_rate": 1.923547400611621e-05, "loss": 0.7411, "step": 587 }, { "epoch": 0.6747955816955961, "grad_norm": 34.25, "learning_rate": 1.9230377166156983e-05, "loss": 0.6056, "step": 588 }, { "epoch": 0.6759431932290919, "grad_norm": 107.5, "learning_rate": 1.922528032619776e-05, "loss": 0.9975, "step": 589 }, { "epoch": 0.6770908047625879, "grad_norm": 14.1875, "learning_rate": 1.9220183486238534e-05, "loss": 0.5988, "step": 590 }, { "epoch": 0.6782384162960837, "grad_norm": 132.0, "learning_rate": 1.921508664627931e-05, "loss": 1.1598, "step": 591 }, { "epoch": 0.6793860278295797, "grad_norm": 15.6875, "learning_rate": 1.9209989806320086e-05, "loss": 0.4172, "step": 592 }, { "epoch": 0.6805336393630756, "grad_norm": 38.25, "learning_rate": 1.9204892966360856e-05, "loss": 0.5714, "step": 593 }, { "epoch": 0.6816812508965715, "grad_norm": 14.0, "learning_rate": 1.9199796126401633e-05, "loss": 0.4887, "step": 594 }, { "epoch": 0.6828288624300675, "grad_norm": 41.5, "learning_rate": 1.9194699286442407e-05, "loss": 0.6648, "step": 595 }, { "epoch": 0.6839764739635633, "grad_norm": 13.875, "learning_rate": 1.918960244648318e-05, "loss": 0.6589, "step": 596 }, { "epoch": 0.6851240854970593, "grad_norm": 35.0, "learning_rate": 1.918450560652396e-05, "loss": 1.0174, "step": 597 }, { "epoch": 0.6862716970305551, "grad_norm": 27.875, "learning_rate": 1.9179408766564732e-05, "loss": 0.7711, "step": 598 }, { "epoch": 0.6874193085640511, "grad_norm": 48.25, "learning_rate": 1.9174311926605506e-05, "loss": 0.4402, "step": 599 }, { "epoch": 0.6885669200975469, "grad_norm": 49.0, "learning_rate": 1.916921508664628e-05, "loss": 0.4483, "step": 600 }, { "epoch": 0.6885669200975469, "eval_accuracy": 0.63, "eval_loss": 0.6301568150520325, "eval_runtime": 49.333, "eval_samples_per_second": 2.027, "eval_steps_per_second": 2.027, "step": 600 }, { "epoch": 0.6897145316310429, "grad_norm": 30.25, "learning_rate": 1.9164118246687054e-05, "loss": 0.4694, "step": 601 }, { "epoch": 0.6908621431645388, "grad_norm": 29.875, "learning_rate": 1.915902140672783e-05, "loss": 0.374, "step": 602 }, { "epoch": 0.6920097546980347, "grad_norm": 70.5, "learning_rate": 1.9153924566768605e-05, "loss": 0.941, "step": 603 }, { "epoch": 0.6931573662315307, "grad_norm": 87.0, "learning_rate": 1.914882772680938e-05, "loss": 0.906, "step": 604 }, { "epoch": 0.6943049777650265, "grad_norm": 14.0625, "learning_rate": 1.9143730886850153e-05, "loss": 0.406, "step": 605 }, { "epoch": 0.6954525892985225, "grad_norm": 55.25, "learning_rate": 1.913863404689093e-05, "loss": 0.779, "step": 606 }, { "epoch": 0.6966002008320183, "grad_norm": 36.5, "learning_rate": 1.9133537206931704e-05, "loss": 0.5805, "step": 607 }, { "epoch": 0.6977478123655143, "grad_norm": 23.75, "learning_rate": 1.912844036697248e-05, "loss": 0.6406, "step": 608 }, { "epoch": 0.6988954238990102, "grad_norm": 39.25, "learning_rate": 1.9123343527013256e-05, "loss": 0.4615, "step": 609 }, { "epoch": 0.7000430354325061, "grad_norm": 67.5, "learning_rate": 1.9118246687054026e-05, "loss": 0.8637, "step": 610 }, { "epoch": 0.701190646966002, "grad_norm": 22.5, "learning_rate": 1.9113149847094803e-05, "loss": 0.4727, "step": 611 }, { "epoch": 0.7023382584994979, "grad_norm": 27.125, "learning_rate": 1.9108053007135577e-05, "loss": 0.6226, "step": 612 }, { "epoch": 0.7034858700329938, "grad_norm": 62.5, "learning_rate": 1.910295616717635e-05, "loss": 0.6596, "step": 613 }, { "epoch": 0.7046334815664897, "grad_norm": 40.0, "learning_rate": 1.909785932721713e-05, "loss": 0.5424, "step": 614 }, { "epoch": 0.7057810930999856, "grad_norm": 62.5, "learning_rate": 1.9092762487257902e-05, "loss": 0.6348, "step": 615 }, { "epoch": 0.7069287046334816, "grad_norm": 80.5, "learning_rate": 1.9087665647298676e-05, "loss": 0.9329, "step": 616 }, { "epoch": 0.7080763161669775, "grad_norm": 101.5, "learning_rate": 1.9082568807339454e-05, "loss": 1.0578, "step": 617 }, { "epoch": 0.7092239277004734, "grad_norm": 23.375, "learning_rate": 1.9077471967380224e-05, "loss": 0.6725, "step": 618 }, { "epoch": 0.7103715392339693, "grad_norm": 42.0, "learning_rate": 1.9072375127421e-05, "loss": 0.6087, "step": 619 }, { "epoch": 0.7115191507674652, "grad_norm": 37.0, "learning_rate": 1.9067278287461775e-05, "loss": 0.6237, "step": 620 }, { "epoch": 0.7126667623009612, "grad_norm": 18.5, "learning_rate": 1.906218144750255e-05, "loss": 0.6327, "step": 621 }, { "epoch": 0.713814373834457, "grad_norm": 27.375, "learning_rate": 1.9057084607543327e-05, "loss": 0.7476, "step": 622 }, { "epoch": 0.714961985367953, "grad_norm": 28.25, "learning_rate": 1.90519877675841e-05, "loss": 0.7312, "step": 623 }, { "epoch": 0.7161095969014488, "grad_norm": 57.75, "learning_rate": 1.9046890927624874e-05, "loss": 0.6385, "step": 624 }, { "epoch": 0.7172572084349448, "grad_norm": 61.75, "learning_rate": 1.9041794087665648e-05, "loss": 0.3454, "step": 625 }, { "epoch": 0.7184048199684406, "grad_norm": 39.0, "learning_rate": 1.9036697247706422e-05, "loss": 0.6641, "step": 626 }, { "epoch": 0.7195524315019366, "grad_norm": 62.5, "learning_rate": 1.9031600407747196e-05, "loss": 0.8142, "step": 627 }, { "epoch": 0.7207000430354326, "grad_norm": 16.375, "learning_rate": 1.9026503567787973e-05, "loss": 0.6127, "step": 628 }, { "epoch": 0.7218476545689284, "grad_norm": 57.75, "learning_rate": 1.9021406727828747e-05, "loss": 0.7968, "step": 629 }, { "epoch": 0.7229952661024244, "grad_norm": 25.375, "learning_rate": 1.901630988786952e-05, "loss": 0.4927, "step": 630 }, { "epoch": 0.7241428776359202, "grad_norm": 48.25, "learning_rate": 1.90112130479103e-05, "loss": 0.6833, "step": 631 }, { "epoch": 0.7252904891694162, "grad_norm": 54.75, "learning_rate": 1.9006116207951072e-05, "loss": 0.6494, "step": 632 }, { "epoch": 0.726438100702912, "grad_norm": 54.5, "learning_rate": 1.9001019367991846e-05, "loss": 0.2914, "step": 633 }, { "epoch": 0.727585712236408, "grad_norm": 10.125, "learning_rate": 1.8995922528032624e-05, "loss": 0.745, "step": 634 }, { "epoch": 0.7287333237699039, "grad_norm": 45.75, "learning_rate": 1.8990825688073394e-05, "loss": 0.8078, "step": 635 }, { "epoch": 0.7298809353033998, "grad_norm": 95.0, "learning_rate": 1.898572884811417e-05, "loss": 0.9361, "step": 636 }, { "epoch": 0.7310285468368957, "grad_norm": 64.5, "learning_rate": 1.8980632008154945e-05, "loss": 0.5982, "step": 637 }, { "epoch": 0.7321761583703916, "grad_norm": 22.25, "learning_rate": 1.897553516819572e-05, "loss": 0.5722, "step": 638 }, { "epoch": 0.7333237699038875, "grad_norm": 51.5, "learning_rate": 1.8970438328236496e-05, "loss": 0.7216, "step": 639 }, { "epoch": 0.7344713814373834, "grad_norm": 18.5, "learning_rate": 1.896534148827727e-05, "loss": 0.5961, "step": 640 }, { "epoch": 0.7356189929708794, "grad_norm": 47.75, "learning_rate": 1.8960244648318044e-05, "loss": 0.6, "step": 641 }, { "epoch": 0.7367666045043753, "grad_norm": 67.0, "learning_rate": 1.8955147808358818e-05, "loss": 0.7799, "step": 642 }, { "epoch": 0.7379142160378712, "grad_norm": 90.5, "learning_rate": 1.8950050968399592e-05, "loss": 0.979, "step": 643 }, { "epoch": 0.7390618275713671, "grad_norm": 47.0, "learning_rate": 1.894495412844037e-05, "loss": 0.6637, "step": 644 }, { "epoch": 0.740209439104863, "grad_norm": 28.875, "learning_rate": 1.8939857288481143e-05, "loss": 0.7095, "step": 645 }, { "epoch": 0.7413570506383589, "grad_norm": 70.5, "learning_rate": 1.8934760448521917e-05, "loss": 0.7767, "step": 646 }, { "epoch": 0.7425046621718548, "grad_norm": 67.0, "learning_rate": 1.892966360856269e-05, "loss": 0.8117, "step": 647 }, { "epoch": 0.7436522737053507, "grad_norm": 47.0, "learning_rate": 1.892456676860347e-05, "loss": 0.7253, "step": 648 }, { "epoch": 0.7447998852388467, "grad_norm": 42.25, "learning_rate": 1.8919469928644242e-05, "loss": 0.5712, "step": 649 }, { "epoch": 0.7459474967723425, "grad_norm": 10.3125, "learning_rate": 1.8914373088685016e-05, "loss": 0.4933, "step": 650 }, { "epoch": 0.7470951083058385, "grad_norm": 17.875, "learning_rate": 1.8909276248725793e-05, "loss": 0.468, "step": 651 }, { "epoch": 0.7482427198393344, "grad_norm": 61.75, "learning_rate": 1.8904179408766564e-05, "loss": 0.7518, "step": 652 }, { "epoch": 0.7493903313728303, "grad_norm": 71.0, "learning_rate": 1.889908256880734e-05, "loss": 0.8373, "step": 653 }, { "epoch": 0.7505379429063262, "grad_norm": 26.75, "learning_rate": 1.8893985728848115e-05, "loss": 0.7643, "step": 654 }, { "epoch": 0.7516855544398221, "grad_norm": 28.25, "learning_rate": 1.888888888888889e-05, "loss": 0.5331, "step": 655 }, { "epoch": 0.7528331659733181, "grad_norm": 8.375, "learning_rate": 1.8883792048929666e-05, "loss": 0.4439, "step": 656 }, { "epoch": 0.7539807775068139, "grad_norm": 17.875, "learning_rate": 1.887869520897044e-05, "loss": 0.5831, "step": 657 }, { "epoch": 0.7551283890403099, "grad_norm": 34.25, "learning_rate": 1.8873598369011214e-05, "loss": 0.5412, "step": 658 }, { "epoch": 0.7562760005738057, "grad_norm": 50.75, "learning_rate": 1.886850152905199e-05, "loss": 0.5549, "step": 659 }, { "epoch": 0.7574236121073017, "grad_norm": 56.0, "learning_rate": 1.8863404689092762e-05, "loss": 0.4515, "step": 660 }, { "epoch": 0.7585712236407975, "grad_norm": 76.5, "learning_rate": 1.885830784913354e-05, "loss": 0.8915, "step": 661 }, { "epoch": 0.7597188351742935, "grad_norm": 88.0, "learning_rate": 1.8853211009174313e-05, "loss": 0.7725, "step": 662 }, { "epoch": 0.7608664467077895, "grad_norm": 32.0, "learning_rate": 1.8848114169215087e-05, "loss": 0.674, "step": 663 }, { "epoch": 0.7620140582412853, "grad_norm": 37.0, "learning_rate": 1.8843017329255864e-05, "loss": 0.4771, "step": 664 }, { "epoch": 0.7631616697747813, "grad_norm": 47.5, "learning_rate": 1.883792048929664e-05, "loss": 0.665, "step": 665 }, { "epoch": 0.7643092813082771, "grad_norm": 49.0, "learning_rate": 1.8832823649337412e-05, "loss": 0.5971, "step": 666 }, { "epoch": 0.7654568928417731, "grad_norm": 17.875, "learning_rate": 1.8827726809378186e-05, "loss": 0.507, "step": 667 }, { "epoch": 0.766604504375269, "grad_norm": 17.5, "learning_rate": 1.8822629969418963e-05, "loss": 0.3678, "step": 668 }, { "epoch": 0.7677521159087649, "grad_norm": 39.5, "learning_rate": 1.8817533129459737e-05, "loss": 0.5468, "step": 669 }, { "epoch": 0.7688997274422608, "grad_norm": 26.375, "learning_rate": 1.881243628950051e-05, "loss": 0.4158, "step": 670 }, { "epoch": 0.7700473389757567, "grad_norm": 70.5, "learning_rate": 1.8807339449541285e-05, "loss": 0.8145, "step": 671 }, { "epoch": 0.7711949505092526, "grad_norm": 57.0, "learning_rate": 1.880224260958206e-05, "loss": 0.5283, "step": 672 }, { "epoch": 0.7723425620427485, "grad_norm": 15.875, "learning_rate": 1.8797145769622836e-05, "loss": 0.6116, "step": 673 }, { "epoch": 0.7734901735762444, "grad_norm": 10.6875, "learning_rate": 1.879204892966361e-05, "loss": 0.6081, "step": 674 }, { "epoch": 0.7746377851097404, "grad_norm": 41.0, "learning_rate": 1.8786952089704384e-05, "loss": 0.6481, "step": 675 }, { "epoch": 0.7757853966432363, "grad_norm": 13.375, "learning_rate": 1.878185524974516e-05, "loss": 0.4866, "step": 676 }, { "epoch": 0.7769330081767322, "grad_norm": 32.75, "learning_rate": 1.8776758409785932e-05, "loss": 0.7627, "step": 677 }, { "epoch": 0.7780806197102281, "grad_norm": 83.0, "learning_rate": 1.877166156982671e-05, "loss": 0.8497, "step": 678 }, { "epoch": 0.779228231243724, "grad_norm": 46.75, "learning_rate": 1.8766564729867483e-05, "loss": 0.3555, "step": 679 }, { "epoch": 0.7803758427772199, "grad_norm": 67.0, "learning_rate": 1.8761467889908257e-05, "loss": 0.7386, "step": 680 }, { "epoch": 0.7815234543107158, "grad_norm": 98.5, "learning_rate": 1.8756371049949034e-05, "loss": 0.6743, "step": 681 }, { "epoch": 0.7826710658442118, "grad_norm": 52.25, "learning_rate": 1.8751274209989808e-05, "loss": 0.6193, "step": 682 }, { "epoch": 0.7838186773777076, "grad_norm": 18.0, "learning_rate": 1.8746177370030582e-05, "loss": 0.4905, "step": 683 }, { "epoch": 0.7849662889112036, "grad_norm": 37.75, "learning_rate": 1.874108053007136e-05, "loss": 0.4043, "step": 684 }, { "epoch": 0.7861139004446994, "grad_norm": 22.25, "learning_rate": 1.8735983690112133e-05, "loss": 0.5243, "step": 685 }, { "epoch": 0.7872615119781954, "grad_norm": 21.75, "learning_rate": 1.8730886850152907e-05, "loss": 0.4342, "step": 686 }, { "epoch": 0.7884091235116913, "grad_norm": 31.375, "learning_rate": 1.872579001019368e-05, "loss": 0.5206, "step": 687 }, { "epoch": 0.7895567350451872, "grad_norm": 21.25, "learning_rate": 1.8720693170234455e-05, "loss": 0.6048, "step": 688 }, { "epoch": 0.7907043465786832, "grad_norm": 49.0, "learning_rate": 1.8715596330275232e-05, "loss": 0.5758, "step": 689 }, { "epoch": 0.791851958112179, "grad_norm": 26.75, "learning_rate": 1.8710499490316006e-05, "loss": 0.5446, "step": 690 }, { "epoch": 0.792999569645675, "grad_norm": 28.5, "learning_rate": 1.870540265035678e-05, "loss": 0.4504, "step": 691 }, { "epoch": 0.7941471811791708, "grad_norm": 28.375, "learning_rate": 1.8700305810397554e-05, "loss": 0.7349, "step": 692 }, { "epoch": 0.7952947927126668, "grad_norm": 102.0, "learning_rate": 1.869520897043833e-05, "loss": 0.6082, "step": 693 }, { "epoch": 0.7964424042461626, "grad_norm": 87.0, "learning_rate": 1.8690112130479105e-05, "loss": 0.7548, "step": 694 }, { "epoch": 0.7975900157796586, "grad_norm": 24.875, "learning_rate": 1.868501529051988e-05, "loss": 0.7732, "step": 695 }, { "epoch": 0.7987376273131545, "grad_norm": 44.0, "learning_rate": 1.8679918450560653e-05, "loss": 0.5928, "step": 696 }, { "epoch": 0.7998852388466504, "grad_norm": 24.375, "learning_rate": 1.8674821610601427e-05, "loss": 0.5727, "step": 697 }, { "epoch": 0.8010328503801463, "grad_norm": 72.5, "learning_rate": 1.8669724770642204e-05, "loss": 0.811, "step": 698 }, { "epoch": 0.8021804619136422, "grad_norm": 13.25, "learning_rate": 1.8664627930682978e-05, "loss": 0.2618, "step": 699 }, { "epoch": 0.8033280734471382, "grad_norm": 34.5, "learning_rate": 1.8659531090723752e-05, "loss": 0.8214, "step": 700 }, { "epoch": 0.8033280734471382, "eval_accuracy": 0.61, "eval_loss": 0.5829592347145081, "eval_runtime": 49.9174, "eval_samples_per_second": 2.003, "eval_steps_per_second": 2.003, "step": 700 }, { "epoch": 0.804475684980634, "grad_norm": 25.125, "learning_rate": 1.865443425076453e-05, "loss": 0.5568, "step": 701 }, { "epoch": 0.80562329651413, "grad_norm": 14.9375, "learning_rate": 1.86493374108053e-05, "loss": 0.3704, "step": 702 }, { "epoch": 0.8067709080476259, "grad_norm": 15.8125, "learning_rate": 1.8644240570846077e-05, "loss": 0.4246, "step": 703 }, { "epoch": 0.8079185195811218, "grad_norm": 16.125, "learning_rate": 1.863914373088685e-05, "loss": 0.3896, "step": 704 }, { "epoch": 0.8090661311146177, "grad_norm": 50.5, "learning_rate": 1.8634046890927625e-05, "loss": 0.3966, "step": 705 }, { "epoch": 0.8102137426481136, "grad_norm": 45.25, "learning_rate": 1.8628950050968402e-05, "loss": 0.3742, "step": 706 }, { "epoch": 0.8113613541816095, "grad_norm": 39.75, "learning_rate": 1.8623853211009176e-05, "loss": 0.4672, "step": 707 }, { "epoch": 0.8125089657151054, "grad_norm": 39.25, "learning_rate": 1.861875637104995e-05, "loss": 0.6046, "step": 708 }, { "epoch": 0.8136565772486013, "grad_norm": 38.25, "learning_rate": 1.8613659531090724e-05, "loss": 1.0867, "step": 709 }, { "epoch": 0.8148041887820973, "grad_norm": 26.875, "learning_rate": 1.86085626911315e-05, "loss": 0.3141, "step": 710 }, { "epoch": 0.8159518003155932, "grad_norm": 53.0, "learning_rate": 1.8603465851172275e-05, "loss": 0.8153, "step": 711 }, { "epoch": 0.8170994118490891, "grad_norm": 35.0, "learning_rate": 1.859836901121305e-05, "loss": 0.7676, "step": 712 }, { "epoch": 0.818247023382585, "grad_norm": 55.75, "learning_rate": 1.8593272171253823e-05, "loss": 0.5664, "step": 713 }, { "epoch": 0.8193946349160809, "grad_norm": 14.5, "learning_rate": 1.8588175331294597e-05, "loss": 0.6436, "step": 714 }, { "epoch": 0.8205422464495769, "grad_norm": 29.0, "learning_rate": 1.8583078491335374e-05, "loss": 0.4355, "step": 715 }, { "epoch": 0.8216898579830727, "grad_norm": 82.5, "learning_rate": 1.8577981651376148e-05, "loss": 1.2766, "step": 716 }, { "epoch": 0.8228374695165687, "grad_norm": 37.5, "learning_rate": 1.8572884811416922e-05, "loss": 0.4578, "step": 717 }, { "epoch": 0.8239850810500645, "grad_norm": 16.875, "learning_rate": 1.85677879714577e-05, "loss": 0.5334, "step": 718 }, { "epoch": 0.8251326925835605, "grad_norm": 22.875, "learning_rate": 1.856269113149847e-05, "loss": 0.5546, "step": 719 }, { "epoch": 0.8262803041170563, "grad_norm": 106.0, "learning_rate": 1.8557594291539247e-05, "loss": 0.9589, "step": 720 }, { "epoch": 0.8274279156505523, "grad_norm": 21.75, "learning_rate": 1.855249745158002e-05, "loss": 0.5682, "step": 721 }, { "epoch": 0.8285755271840483, "grad_norm": 52.75, "learning_rate": 1.8547400611620795e-05, "loss": 0.4809, "step": 722 }, { "epoch": 0.8297231387175441, "grad_norm": 19.0, "learning_rate": 1.8542303771661572e-05, "loss": 0.3774, "step": 723 }, { "epoch": 0.8308707502510401, "grad_norm": 25.125, "learning_rate": 1.8537206931702346e-05, "loss": 0.4828, "step": 724 }, { "epoch": 0.8320183617845359, "grad_norm": 34.0, "learning_rate": 1.853211009174312e-05, "loss": 0.4859, "step": 725 }, { "epoch": 0.8331659733180319, "grad_norm": 48.75, "learning_rate": 1.8527013251783897e-05, "loss": 0.5612, "step": 726 }, { "epoch": 0.8343135848515277, "grad_norm": 45.5, "learning_rate": 1.852191641182467e-05, "loss": 0.5976, "step": 727 }, { "epoch": 0.8354611963850237, "grad_norm": 52.0, "learning_rate": 1.8516819571865445e-05, "loss": 0.6551, "step": 728 }, { "epoch": 0.8366088079185195, "grad_norm": 23.75, "learning_rate": 1.851172273190622e-05, "loss": 0.5022, "step": 729 }, { "epoch": 0.8377564194520155, "grad_norm": 71.5, "learning_rate": 1.8506625891946993e-05, "loss": 0.8474, "step": 730 }, { "epoch": 0.8389040309855114, "grad_norm": 67.5, "learning_rate": 1.850152905198777e-05, "loss": 0.6511, "step": 731 }, { "epoch": 0.8400516425190073, "grad_norm": 36.5, "learning_rate": 1.8496432212028544e-05, "loss": 0.5485, "step": 732 }, { "epoch": 0.8411992540525032, "grad_norm": 63.25, "learning_rate": 1.8491335372069318e-05, "loss": 0.7833, "step": 733 }, { "epoch": 0.8423468655859991, "grad_norm": 11.625, "learning_rate": 1.8486238532110092e-05, "loss": 0.5295, "step": 734 }, { "epoch": 0.8434944771194951, "grad_norm": 44.25, "learning_rate": 1.848114169215087e-05, "loss": 0.4733, "step": 735 }, { "epoch": 0.844642088652991, "grad_norm": 6.90625, "learning_rate": 1.8476044852191643e-05, "loss": 0.2207, "step": 736 }, { "epoch": 0.8457897001864869, "grad_norm": 63.0, "learning_rate": 1.8470948012232417e-05, "loss": 0.5543, "step": 737 }, { "epoch": 0.8469373117199828, "grad_norm": 19.5, "learning_rate": 1.846585117227319e-05, "loss": 0.4689, "step": 738 }, { "epoch": 0.8480849232534787, "grad_norm": 14.6875, "learning_rate": 1.8460754332313965e-05, "loss": 0.5446, "step": 739 }, { "epoch": 0.8492325347869746, "grad_norm": 68.5, "learning_rate": 1.8455657492354742e-05, "loss": 0.6652, "step": 740 }, { "epoch": 0.8503801463204705, "grad_norm": 25.0, "learning_rate": 1.8450560652395516e-05, "loss": 0.3413, "step": 741 }, { "epoch": 0.8515277578539664, "grad_norm": 43.5, "learning_rate": 1.844546381243629e-05, "loss": 0.5552, "step": 742 }, { "epoch": 0.8526753693874624, "grad_norm": 50.25, "learning_rate": 1.8440366972477067e-05, "loss": 0.5051, "step": 743 }, { "epoch": 0.8538229809209582, "grad_norm": 30.625, "learning_rate": 1.843527013251784e-05, "loss": 0.5442, "step": 744 }, { "epoch": 0.8549705924544542, "grad_norm": 21.75, "learning_rate": 1.8430173292558615e-05, "loss": 0.724, "step": 745 }, { "epoch": 0.8561182039879501, "grad_norm": 23.0, "learning_rate": 1.8425076452599392e-05, "loss": 0.6982, "step": 746 }, { "epoch": 0.857265815521446, "grad_norm": 34.5, "learning_rate": 1.8419979612640163e-05, "loss": 0.4268, "step": 747 }, { "epoch": 0.858413427054942, "grad_norm": 12.0, "learning_rate": 1.841488277268094e-05, "loss": 0.5611, "step": 748 }, { "epoch": 0.8595610385884378, "grad_norm": 20.5, "learning_rate": 1.8409785932721714e-05, "loss": 0.552, "step": 749 }, { "epoch": 0.8607086501219338, "grad_norm": 35.0, "learning_rate": 1.8404689092762488e-05, "loss": 0.5549, "step": 750 }, { "epoch": 0.8618562616554296, "grad_norm": 43.5, "learning_rate": 1.8399592252803265e-05, "loss": 0.5242, "step": 751 }, { "epoch": 0.8630038731889256, "grad_norm": 13.875, "learning_rate": 1.839449541284404e-05, "loss": 0.3858, "step": 752 }, { "epoch": 0.8641514847224214, "grad_norm": 45.25, "learning_rate": 1.8389398572884813e-05, "loss": 0.6547, "step": 753 }, { "epoch": 0.8652990962559174, "grad_norm": 66.5, "learning_rate": 1.8384301732925587e-05, "loss": 0.5999, "step": 754 }, { "epoch": 0.8664467077894132, "grad_norm": 31.25, "learning_rate": 1.837920489296636e-05, "loss": 0.6402, "step": 755 }, { "epoch": 0.8675943193229092, "grad_norm": 16.125, "learning_rate": 1.8374108053007138e-05, "loss": 0.4183, "step": 756 }, { "epoch": 0.868741930856405, "grad_norm": 34.75, "learning_rate": 1.8369011213047912e-05, "loss": 0.46, "step": 757 }, { "epoch": 0.869889542389901, "grad_norm": 11.6875, "learning_rate": 1.8363914373088686e-05, "loss": 0.5179, "step": 758 }, { "epoch": 0.871037153923397, "grad_norm": 60.0, "learning_rate": 1.835881753312946e-05, "loss": 0.5217, "step": 759 }, { "epoch": 0.8721847654568928, "grad_norm": 9.75, "learning_rate": 1.8353720693170237e-05, "loss": 0.3772, "step": 760 }, { "epoch": 0.8733323769903888, "grad_norm": 11.75, "learning_rate": 1.834862385321101e-05, "loss": 0.5367, "step": 761 }, { "epoch": 0.8744799885238846, "grad_norm": 41.5, "learning_rate": 1.8343527013251785e-05, "loss": 0.573, "step": 762 }, { "epoch": 0.8756276000573806, "grad_norm": 34.25, "learning_rate": 1.8338430173292562e-05, "loss": 0.1937, "step": 763 }, { "epoch": 0.8767752115908765, "grad_norm": 23.125, "learning_rate": 1.8333333333333333e-05, "loss": 0.4614, "step": 764 }, { "epoch": 0.8779228231243724, "grad_norm": 46.5, "learning_rate": 1.832823649337411e-05, "loss": 0.442, "step": 765 }, { "epoch": 0.8790704346578683, "grad_norm": 74.5, "learning_rate": 1.8323139653414884e-05, "loss": 0.9426, "step": 766 }, { "epoch": 0.8802180461913642, "grad_norm": 65.5, "learning_rate": 1.8318042813455658e-05, "loss": 0.8281, "step": 767 }, { "epoch": 0.8813656577248601, "grad_norm": 30.375, "learning_rate": 1.8312945973496435e-05, "loss": 0.6035, "step": 768 }, { "epoch": 0.882513269258356, "grad_norm": 89.5, "learning_rate": 1.830784913353721e-05, "loss": 0.8813, "step": 769 }, { "epoch": 0.883660880791852, "grad_norm": 71.5, "learning_rate": 1.8302752293577983e-05, "loss": 0.7809, "step": 770 }, { "epoch": 0.8848084923253479, "grad_norm": 87.5, "learning_rate": 1.829765545361876e-05, "loss": 0.9051, "step": 771 }, { "epoch": 0.8859561038588438, "grad_norm": 78.5, "learning_rate": 1.829255861365953e-05, "loss": 0.8777, "step": 772 }, { "epoch": 0.8871037153923397, "grad_norm": 38.5, "learning_rate": 1.8287461773700308e-05, "loss": 0.3393, "step": 773 }, { "epoch": 0.8882513269258356, "grad_norm": 34.0, "learning_rate": 1.8282364933741082e-05, "loss": 0.4772, "step": 774 }, { "epoch": 0.8893989384593315, "grad_norm": 42.25, "learning_rate": 1.8277268093781856e-05, "loss": 0.5136, "step": 775 }, { "epoch": 0.8905465499928275, "grad_norm": 51.25, "learning_rate": 1.8272171253822633e-05, "loss": 0.4965, "step": 776 }, { "epoch": 0.8916941615263233, "grad_norm": 23.5, "learning_rate": 1.8267074413863407e-05, "loss": 0.6667, "step": 777 }, { "epoch": 0.8928417730598193, "grad_norm": 69.0, "learning_rate": 1.826197757390418e-05, "loss": 0.7309, "step": 778 }, { "epoch": 0.8939893845933151, "grad_norm": 13.5625, "learning_rate": 1.8256880733944955e-05, "loss": 0.5041, "step": 779 }, { "epoch": 0.8951369961268111, "grad_norm": 115.0, "learning_rate": 1.825178389398573e-05, "loss": 0.9974, "step": 780 }, { "epoch": 0.8962846076603069, "grad_norm": 19.125, "learning_rate": 1.8246687054026503e-05, "loss": 0.5556, "step": 781 }, { "epoch": 0.8974322191938029, "grad_norm": 52.5, "learning_rate": 1.824159021406728e-05, "loss": 0.7631, "step": 782 }, { "epoch": 0.8985798307272989, "grad_norm": 10.4375, "learning_rate": 1.8236493374108054e-05, "loss": 0.6217, "step": 783 }, { "epoch": 0.8997274422607947, "grad_norm": 20.625, "learning_rate": 1.8231396534148828e-05, "loss": 0.4404, "step": 784 }, { "epoch": 0.9008750537942907, "grad_norm": 81.0, "learning_rate": 1.8226299694189605e-05, "loss": 0.8382, "step": 785 }, { "epoch": 0.9020226653277865, "grad_norm": 25.125, "learning_rate": 1.822120285423038e-05, "loss": 0.465, "step": 786 }, { "epoch": 0.9031702768612825, "grad_norm": 4.90625, "learning_rate": 1.8216106014271153e-05, "loss": 0.2211, "step": 787 }, { "epoch": 0.9043178883947783, "grad_norm": 18.25, "learning_rate": 1.821100917431193e-05, "loss": 0.5354, "step": 788 }, { "epoch": 0.9054654999282743, "grad_norm": 22.625, "learning_rate": 1.82059123343527e-05, "loss": 0.4656, "step": 789 }, { "epoch": 0.9066131114617701, "grad_norm": 22.125, "learning_rate": 1.8200815494393478e-05, "loss": 0.7412, "step": 790 }, { "epoch": 0.9077607229952661, "grad_norm": 18.125, "learning_rate": 1.8195718654434252e-05, "loss": 0.5085, "step": 791 }, { "epoch": 0.908908334528762, "grad_norm": 17.625, "learning_rate": 1.8190621814475026e-05, "loss": 0.4275, "step": 792 }, { "epoch": 0.9100559460622579, "grad_norm": 74.5, "learning_rate": 1.8185524974515803e-05, "loss": 0.8506, "step": 793 }, { "epoch": 0.9112035575957539, "grad_norm": 59.75, "learning_rate": 1.8180428134556577e-05, "loss": 0.5863, "step": 794 }, { "epoch": 0.9123511691292497, "grad_norm": 17.875, "learning_rate": 1.817533129459735e-05, "loss": 0.5018, "step": 795 }, { "epoch": 0.9134987806627457, "grad_norm": 73.0, "learning_rate": 1.8170234454638125e-05, "loss": 0.7539, "step": 796 }, { "epoch": 0.9146463921962416, "grad_norm": 38.0, "learning_rate": 1.81651376146789e-05, "loss": 0.6021, "step": 797 }, { "epoch": 0.9157940037297375, "grad_norm": 34.75, "learning_rate": 1.8160040774719676e-05, "loss": 0.4989, "step": 798 }, { "epoch": 0.9169416152632334, "grad_norm": 29.25, "learning_rate": 1.815494393476045e-05, "loss": 0.5503, "step": 799 }, { "epoch": 0.9180892267967293, "grad_norm": 113.0, "learning_rate": 1.8149847094801224e-05, "loss": 0.7238, "step": 800 }, { "epoch": 0.9180892267967293, "eval_accuracy": 0.67, "eval_loss": 0.5950115323066711, "eval_runtime": 49.3005, "eval_samples_per_second": 2.028, "eval_steps_per_second": 2.028, "step": 800 }, { "epoch": 0.9192368383302252, "grad_norm": 12.9375, "learning_rate": 1.8144750254841998e-05, "loss": 0.632, "step": 801 }, { "epoch": 0.9203844498637211, "grad_norm": 49.75, "learning_rate": 1.8139653414882775e-05, "loss": 0.6413, "step": 802 }, { "epoch": 0.921532061397217, "grad_norm": 13.125, "learning_rate": 1.813455657492355e-05, "loss": 0.5482, "step": 803 }, { "epoch": 0.922679672930713, "grad_norm": 20.125, "learning_rate": 1.8129459734964323e-05, "loss": 0.5773, "step": 804 }, { "epoch": 0.9238272844642089, "grad_norm": 100.0, "learning_rate": 1.81243628950051e-05, "loss": 1.35, "step": 805 }, { "epoch": 0.9249748959977048, "grad_norm": 25.75, "learning_rate": 1.811926605504587e-05, "loss": 0.5234, "step": 806 }, { "epoch": 0.9261225075312007, "grad_norm": 37.0, "learning_rate": 1.8114169215086648e-05, "loss": 0.473, "step": 807 }, { "epoch": 0.9272701190646966, "grad_norm": 29.0, "learning_rate": 1.8109072375127422e-05, "loss": 0.4716, "step": 808 }, { "epoch": 0.9284177305981925, "grad_norm": 22.0, "learning_rate": 1.8103975535168196e-05, "loss": 0.5146, "step": 809 }, { "epoch": 0.9295653421316884, "grad_norm": 11.75, "learning_rate": 1.8098878695208973e-05, "loss": 0.6532, "step": 810 }, { "epoch": 0.9307129536651844, "grad_norm": 14.375, "learning_rate": 1.8093781855249747e-05, "loss": 0.5441, "step": 811 }, { "epoch": 0.9318605651986802, "grad_norm": 42.0, "learning_rate": 1.808868501529052e-05, "loss": 0.4905, "step": 812 }, { "epoch": 0.9330081767321762, "grad_norm": 64.0, "learning_rate": 1.8083588175331298e-05, "loss": 0.8364, "step": 813 }, { "epoch": 0.934155788265672, "grad_norm": 28.875, "learning_rate": 1.807849133537207e-05, "loss": 0.414, "step": 814 }, { "epoch": 0.935303399799168, "grad_norm": 7.6875, "learning_rate": 1.8073394495412846e-05, "loss": 0.3923, "step": 815 }, { "epoch": 0.9364510113326638, "grad_norm": 32.25, "learning_rate": 1.806829765545362e-05, "loss": 0.5358, "step": 816 }, { "epoch": 0.9375986228661598, "grad_norm": 57.5, "learning_rate": 1.8063200815494394e-05, "loss": 0.4813, "step": 817 }, { "epoch": 0.9387462343996558, "grad_norm": 33.5, "learning_rate": 1.805810397553517e-05, "loss": 0.4693, "step": 818 }, { "epoch": 0.9398938459331516, "grad_norm": 35.0, "learning_rate": 1.8053007135575945e-05, "loss": 0.3321, "step": 819 }, { "epoch": 0.9410414574666476, "grad_norm": 19.75, "learning_rate": 1.804791029561672e-05, "loss": 0.5709, "step": 820 }, { "epoch": 0.9421890690001434, "grad_norm": 133.0, "learning_rate": 1.8042813455657493e-05, "loss": 1.0803, "step": 821 }, { "epoch": 0.9433366805336394, "grad_norm": 7.90625, "learning_rate": 1.803771661569827e-05, "loss": 0.5182, "step": 822 }, { "epoch": 0.9444842920671352, "grad_norm": 25.375, "learning_rate": 1.8032619775739044e-05, "loss": 0.6489, "step": 823 }, { "epoch": 0.9456319036006312, "grad_norm": 24.125, "learning_rate": 1.8027522935779818e-05, "loss": 0.5298, "step": 824 }, { "epoch": 0.9467795151341271, "grad_norm": 26.5, "learning_rate": 1.8022426095820592e-05, "loss": 0.4967, "step": 825 }, { "epoch": 0.947927126667623, "grad_norm": 55.5, "learning_rate": 1.8017329255861366e-05, "loss": 0.7623, "step": 826 }, { "epoch": 0.9490747382011189, "grad_norm": 38.75, "learning_rate": 1.8012232415902143e-05, "loss": 0.3873, "step": 827 }, { "epoch": 0.9502223497346148, "grad_norm": 36.5, "learning_rate": 1.8007135575942917e-05, "loss": 0.4885, "step": 828 }, { "epoch": 0.9513699612681108, "grad_norm": 36.75, "learning_rate": 1.800203873598369e-05, "loss": 0.4786, "step": 829 }, { "epoch": 0.9525175728016066, "grad_norm": 15.9375, "learning_rate": 1.7996941896024468e-05, "loss": 0.7344, "step": 830 }, { "epoch": 0.9536651843351026, "grad_norm": 35.75, "learning_rate": 1.799184505606524e-05, "loss": 0.4364, "step": 831 }, { "epoch": 0.9548127958685985, "grad_norm": 60.5, "learning_rate": 1.7986748216106016e-05, "loss": 0.4018, "step": 832 }, { "epoch": 0.9559604074020944, "grad_norm": 29.5, "learning_rate": 1.798165137614679e-05, "loss": 0.5492, "step": 833 }, { "epoch": 0.9571080189355903, "grad_norm": 51.75, "learning_rate": 1.7976554536187564e-05, "loss": 0.5751, "step": 834 }, { "epoch": 0.9582556304690862, "grad_norm": 27.0, "learning_rate": 1.797145769622834e-05, "loss": 0.4943, "step": 835 }, { "epoch": 0.9594032420025821, "grad_norm": 40.75, "learning_rate": 1.7966360856269115e-05, "loss": 0.6622, "step": 836 }, { "epoch": 0.960550853536078, "grad_norm": 82.0, "learning_rate": 1.796126401630989e-05, "loss": 0.6737, "step": 837 }, { "epoch": 0.9616984650695739, "grad_norm": 11.125, "learning_rate": 1.7956167176350666e-05, "loss": 0.4544, "step": 838 }, { "epoch": 0.9628460766030699, "grad_norm": 18.125, "learning_rate": 1.795107033639144e-05, "loss": 0.5389, "step": 839 }, { "epoch": 0.9639936881365657, "grad_norm": 15.9375, "learning_rate": 1.7945973496432214e-05, "loss": 0.1783, "step": 840 }, { "epoch": 0.9651412996700617, "grad_norm": 53.5, "learning_rate": 1.7940876656472988e-05, "loss": 0.3035, "step": 841 }, { "epoch": 0.9662889112035576, "grad_norm": 52.75, "learning_rate": 1.7935779816513762e-05, "loss": 0.6946, "step": 842 }, { "epoch": 0.9674365227370535, "grad_norm": 34.75, "learning_rate": 1.793068297655454e-05, "loss": 0.5466, "step": 843 }, { "epoch": 0.9685841342705495, "grad_norm": 21.875, "learning_rate": 1.7925586136595313e-05, "loss": 0.4619, "step": 844 }, { "epoch": 0.9697317458040453, "grad_norm": 50.5, "learning_rate": 1.7920489296636087e-05, "loss": 0.6513, "step": 845 }, { "epoch": 0.9708793573375413, "grad_norm": 15.125, "learning_rate": 1.791539245667686e-05, "loss": 0.379, "step": 846 }, { "epoch": 0.9720269688710371, "grad_norm": 10.625, "learning_rate": 1.7910295616717638e-05, "loss": 0.5085, "step": 847 }, { "epoch": 0.9731745804045331, "grad_norm": 12.6875, "learning_rate": 1.7905198776758412e-05, "loss": 0.5272, "step": 848 }, { "epoch": 0.9743221919380289, "grad_norm": 27.5, "learning_rate": 1.7900101936799186e-05, "loss": 1.0062, "step": 849 }, { "epoch": 0.9754698034715249, "grad_norm": 127.5, "learning_rate": 1.789500509683996e-05, "loss": 1.0798, "step": 850 }, { "epoch": 0.9766174150050208, "grad_norm": 103.5, "learning_rate": 1.7889908256880734e-05, "loss": 1.1638, "step": 851 }, { "epoch": 0.9777650265385167, "grad_norm": 69.0, "learning_rate": 1.788481141692151e-05, "loss": 0.9011, "step": 852 }, { "epoch": 0.9789126380720127, "grad_norm": 86.5, "learning_rate": 1.7879714576962285e-05, "loss": 0.8197, "step": 853 }, { "epoch": 0.9800602496055085, "grad_norm": 15.5, "learning_rate": 1.787461773700306e-05, "loss": 0.6134, "step": 854 }, { "epoch": 0.9812078611390045, "grad_norm": 110.5, "learning_rate": 1.7869520897043836e-05, "loss": 0.919, "step": 855 }, { "epoch": 0.9823554726725003, "grad_norm": 30.125, "learning_rate": 1.786442405708461e-05, "loss": 0.5746, "step": 856 }, { "epoch": 0.9835030842059963, "grad_norm": 100.0, "learning_rate": 1.7859327217125384e-05, "loss": 0.3361, "step": 857 }, { "epoch": 0.9846506957394922, "grad_norm": 60.0, "learning_rate": 1.7854230377166158e-05, "loss": 0.6782, "step": 858 }, { "epoch": 0.9857983072729881, "grad_norm": 62.5, "learning_rate": 1.7849133537206932e-05, "loss": 0.8552, "step": 859 }, { "epoch": 0.986945918806484, "grad_norm": 38.75, "learning_rate": 1.784403669724771e-05, "loss": 0.7251, "step": 860 }, { "epoch": 0.9880935303399799, "grad_norm": 84.5, "learning_rate": 1.7838939857288483e-05, "loss": 0.9825, "step": 861 }, { "epoch": 0.9892411418734758, "grad_norm": 32.75, "learning_rate": 1.7833843017329257e-05, "loss": 0.2631, "step": 862 }, { "epoch": 0.9903887534069717, "grad_norm": 101.5, "learning_rate": 1.782874617737003e-05, "loss": 1.0281, "step": 863 }, { "epoch": 0.9915363649404677, "grad_norm": 36.75, "learning_rate": 1.7823649337410808e-05, "loss": 0.6591, "step": 864 }, { "epoch": 0.9926839764739636, "grad_norm": 23.75, "learning_rate": 1.7818552497451582e-05, "loss": 0.2017, "step": 865 }, { "epoch": 0.9938315880074595, "grad_norm": 11.0625, "learning_rate": 1.7813455657492356e-05, "loss": 0.6496, "step": 866 }, { "epoch": 0.9949791995409554, "grad_norm": 52.0, "learning_rate": 1.780835881753313e-05, "loss": 0.7726, "step": 867 }, { "epoch": 0.9961268110744513, "grad_norm": 8.9375, "learning_rate": 1.7803261977573904e-05, "loss": 0.2688, "step": 868 }, { "epoch": 0.9972744226079472, "grad_norm": 29.625, "learning_rate": 1.779816513761468e-05, "loss": 0.4991, "step": 869 }, { "epoch": 0.9984220341414431, "grad_norm": 6.59375, "learning_rate": 1.7793068297655455e-05, "loss": 0.3232, "step": 870 }, { "epoch": 0.999569645674939, "grad_norm": 39.25, "learning_rate": 1.778797145769623e-05, "loss": 0.4866, "step": 871 }, { "epoch": 1.0, "grad_norm": 31.25, "learning_rate": 1.7782874617737006e-05, "loss": 0.1617, "step": 872 }, { "epoch": 1.0011476115334959, "grad_norm": 54.25, "learning_rate": 1.7777777777777777e-05, "loss": 0.5081, "step": 873 }, { "epoch": 1.002295223066992, "grad_norm": 59.5, "learning_rate": 1.7772680937818554e-05, "loss": 0.6284, "step": 874 }, { "epoch": 1.0034428346004878, "grad_norm": 62.0, "learning_rate": 1.7767584097859328e-05, "loss": 0.6364, "step": 875 }, { "epoch": 1.0045904461339836, "grad_norm": 99.5, "learning_rate": 1.7762487257900102e-05, "loss": 1.521, "step": 876 }, { "epoch": 1.0057380576674795, "grad_norm": 105.5, "learning_rate": 1.775739041794088e-05, "loss": 1.0837, "step": 877 }, { "epoch": 1.0068856692009756, "grad_norm": 117.0, "learning_rate": 1.7752293577981653e-05, "loss": 1.0871, "step": 878 }, { "epoch": 1.0080332807344714, "grad_norm": 91.5, "learning_rate": 1.7747196738022427e-05, "loss": 0.7927, "step": 879 }, { "epoch": 1.0091808922679673, "grad_norm": 68.5, "learning_rate": 1.7742099898063204e-05, "loss": 0.6309, "step": 880 }, { "epoch": 1.010328503801463, "grad_norm": 11.3125, "learning_rate": 1.7737003058103978e-05, "loss": 0.3369, "step": 881 }, { "epoch": 1.0114761153349592, "grad_norm": 11.125, "learning_rate": 1.7731906218144752e-05, "loss": 0.2181, "step": 882 }, { "epoch": 1.012623726868455, "grad_norm": 23.75, "learning_rate": 1.7726809378185526e-05, "loss": 0.4936, "step": 883 }, { "epoch": 1.0137713384019509, "grad_norm": 26.25, "learning_rate": 1.77217125382263e-05, "loss": 0.5372, "step": 884 }, { "epoch": 1.014918949935447, "grad_norm": 23.375, "learning_rate": 1.7716615698267077e-05, "loss": 0.2898, "step": 885 }, { "epoch": 1.0160665614689428, "grad_norm": 61.25, "learning_rate": 1.771151885830785e-05, "loss": 1.0463, "step": 886 }, { "epoch": 1.0172141730024387, "grad_norm": 16.125, "learning_rate": 1.7706422018348625e-05, "loss": 0.3061, "step": 887 }, { "epoch": 1.0183617845359345, "grad_norm": 83.5, "learning_rate": 1.77013251783894e-05, "loss": 0.7545, "step": 888 }, { "epoch": 1.0195093960694306, "grad_norm": 57.75, "learning_rate": 1.7696228338430176e-05, "loss": 0.7643, "step": 889 }, { "epoch": 1.0206570076029264, "grad_norm": 19.125, "learning_rate": 1.769113149847095e-05, "loss": 0.6013, "step": 890 }, { "epoch": 1.0218046191364223, "grad_norm": 12.25, "learning_rate": 1.7686034658511724e-05, "loss": 0.4579, "step": 891 }, { "epoch": 1.0229522306699181, "grad_norm": 38.25, "learning_rate": 1.7680937818552498e-05, "loss": 0.4669, "step": 892 }, { "epoch": 1.0240998422034142, "grad_norm": 68.0, "learning_rate": 1.767584097859327e-05, "loss": 0.4824, "step": 893 }, { "epoch": 1.02524745373691, "grad_norm": 10.5625, "learning_rate": 1.767074413863405e-05, "loss": 0.5689, "step": 894 }, { "epoch": 1.026395065270406, "grad_norm": 8.875, "learning_rate": 1.7665647298674823e-05, "loss": 0.3161, "step": 895 }, { "epoch": 1.0275426768039018, "grad_norm": 23.625, "learning_rate": 1.7660550458715597e-05, "loss": 0.4443, "step": 896 }, { "epoch": 1.0286902883373978, "grad_norm": 15.75, "learning_rate": 1.7655453618756374e-05, "loss": 0.2331, "step": 897 }, { "epoch": 1.0298378998708937, "grad_norm": 8.25, "learning_rate": 1.7650356778797148e-05, "loss": 0.3554, "step": 898 }, { "epoch": 1.0309855114043895, "grad_norm": 14.5, "learning_rate": 1.7645259938837922e-05, "loss": 0.6107, "step": 899 }, { "epoch": 1.0321331229378856, "grad_norm": 42.25, "learning_rate": 1.76401630988787e-05, "loss": 0.3624, "step": 900 }, { "epoch": 1.0321331229378856, "eval_accuracy": 0.64, "eval_loss": 0.6176496744155884, "eval_runtime": 49.5336, "eval_samples_per_second": 2.019, "eval_steps_per_second": 2.019, "step": 900 }, { "epoch": 1.0332807344713815, "grad_norm": 23.0, "learning_rate": 1.763506625891947e-05, "loss": 0.4606, "step": 901 }, { "epoch": 1.0344283460048773, "grad_norm": 50.75, "learning_rate": 1.7629969418960247e-05, "loss": 0.5176, "step": 902 }, { "epoch": 1.0355759575383732, "grad_norm": 58.0, "learning_rate": 1.762487257900102e-05, "loss": 0.3688, "step": 903 }, { "epoch": 1.0367235690718692, "grad_norm": 36.0, "learning_rate": 1.7619775739041795e-05, "loss": 0.7414, "step": 904 }, { "epoch": 1.037871180605365, "grad_norm": 28.5, "learning_rate": 1.7614678899082572e-05, "loss": 0.8468, "step": 905 }, { "epoch": 1.039018792138861, "grad_norm": 26.25, "learning_rate": 1.7609582059123346e-05, "loss": 0.4338, "step": 906 }, { "epoch": 1.040166403672357, "grad_norm": 122.5, "learning_rate": 1.760448521916412e-05, "loss": 0.9431, "step": 907 }, { "epoch": 1.0413140152058529, "grad_norm": 15.375, "learning_rate": 1.7599388379204894e-05, "loss": 0.5602, "step": 908 }, { "epoch": 1.0424616267393487, "grad_norm": 96.5, "learning_rate": 1.7594291539245668e-05, "loss": 0.6268, "step": 909 }, { "epoch": 1.0436092382728446, "grad_norm": 59.25, "learning_rate": 1.7589194699286445e-05, "loss": 0.404, "step": 910 }, { "epoch": 1.0447568498063406, "grad_norm": 30.5, "learning_rate": 1.758409785932722e-05, "loss": 0.5772, "step": 911 }, { "epoch": 1.0459044613398365, "grad_norm": 15.875, "learning_rate": 1.7579001019367993e-05, "loss": 0.4666, "step": 912 }, { "epoch": 1.0470520728733324, "grad_norm": 20.25, "learning_rate": 1.7573904179408767e-05, "loss": 0.4576, "step": 913 }, { "epoch": 1.0481996844068282, "grad_norm": 33.5, "learning_rate": 1.7568807339449544e-05, "loss": 0.4427, "step": 914 }, { "epoch": 1.0493472959403243, "grad_norm": 6.59375, "learning_rate": 1.7563710499490318e-05, "loss": 0.1466, "step": 915 }, { "epoch": 1.0504949074738201, "grad_norm": 44.0, "learning_rate": 1.7558613659531092e-05, "loss": 0.3564, "step": 916 }, { "epoch": 1.051642519007316, "grad_norm": 26.75, "learning_rate": 1.755351681957187e-05, "loss": 0.6648, "step": 917 }, { "epoch": 1.0527901305408118, "grad_norm": 53.5, "learning_rate": 1.754841997961264e-05, "loss": 0.5389, "step": 918 }, { "epoch": 1.053937742074308, "grad_norm": 23.875, "learning_rate": 1.7543323139653417e-05, "loss": 0.4424, "step": 919 }, { "epoch": 1.0550853536078038, "grad_norm": 34.75, "learning_rate": 1.753822629969419e-05, "loss": 0.4035, "step": 920 }, { "epoch": 1.0562329651412996, "grad_norm": 25.125, "learning_rate": 1.7533129459734965e-05, "loss": 0.5704, "step": 921 }, { "epoch": 1.0573805766747957, "grad_norm": 25.625, "learning_rate": 1.7528032619775742e-05, "loss": 0.7077, "step": 922 }, { "epoch": 1.0585281882082915, "grad_norm": 11.25, "learning_rate": 1.7522935779816516e-05, "loss": 0.228, "step": 923 }, { "epoch": 1.0596757997417874, "grad_norm": 31.125, "learning_rate": 1.751783893985729e-05, "loss": 0.5716, "step": 924 }, { "epoch": 1.0608234112752832, "grad_norm": 21.25, "learning_rate": 1.7512742099898067e-05, "loss": 0.2658, "step": 925 }, { "epoch": 1.0619710228087793, "grad_norm": 12.875, "learning_rate": 1.7507645259938838e-05, "loss": 0.2415, "step": 926 }, { "epoch": 1.0631186343422752, "grad_norm": 100.5, "learning_rate": 1.7502548419979615e-05, "loss": 1.0011, "step": 927 }, { "epoch": 1.064266245875771, "grad_norm": 67.0, "learning_rate": 1.749745158002039e-05, "loss": 0.6753, "step": 928 }, { "epoch": 1.0654138574092669, "grad_norm": 109.0, "learning_rate": 1.7492354740061163e-05, "loss": 0.8631, "step": 929 }, { "epoch": 1.066561468942763, "grad_norm": 68.5, "learning_rate": 1.7487257900101937e-05, "loss": 1.0799, "step": 930 }, { "epoch": 1.0677090804762588, "grad_norm": 74.0, "learning_rate": 1.7482161060142714e-05, "loss": 0.5419, "step": 931 }, { "epoch": 1.0688566920097546, "grad_norm": 61.25, "learning_rate": 1.7477064220183488e-05, "loss": 0.6041, "step": 932 }, { "epoch": 1.0700043035432507, "grad_norm": 32.25, "learning_rate": 1.7471967380224262e-05, "loss": 0.8215, "step": 933 }, { "epoch": 1.0711519150767466, "grad_norm": 45.25, "learning_rate": 1.746687054026504e-05, "loss": 0.5843, "step": 934 }, { "epoch": 1.0722995266102424, "grad_norm": 104.5, "learning_rate": 1.746177370030581e-05, "loss": 1.3222, "step": 935 }, { "epoch": 1.0734471381437383, "grad_norm": 56.0, "learning_rate": 1.7456676860346587e-05, "loss": 0.5504, "step": 936 }, { "epoch": 1.0745947496772343, "grad_norm": 54.0, "learning_rate": 1.745158002038736e-05, "loss": 0.8466, "step": 937 }, { "epoch": 1.0757423612107302, "grad_norm": 27.375, "learning_rate": 1.7446483180428135e-05, "loss": 0.9508, "step": 938 }, { "epoch": 1.076889972744226, "grad_norm": 14.625, "learning_rate": 1.7441386340468912e-05, "loss": 0.3969, "step": 939 }, { "epoch": 1.078037584277722, "grad_norm": 75.0, "learning_rate": 1.7436289500509686e-05, "loss": 0.9936, "step": 940 }, { "epoch": 1.079185195811218, "grad_norm": 51.75, "learning_rate": 1.743119266055046e-05, "loss": 0.5978, "step": 941 }, { "epoch": 1.0803328073447138, "grad_norm": 57.5, "learning_rate": 1.7426095820591237e-05, "loss": 0.6549, "step": 942 }, { "epoch": 1.0814804188782097, "grad_norm": 19.625, "learning_rate": 1.7420998980632008e-05, "loss": 0.4942, "step": 943 }, { "epoch": 1.0826280304117057, "grad_norm": 83.0, "learning_rate": 1.7415902140672785e-05, "loss": 0.6702, "step": 944 }, { "epoch": 1.0837756419452016, "grad_norm": 42.5, "learning_rate": 1.741080530071356e-05, "loss": 0.6299, "step": 945 }, { "epoch": 1.0849232534786974, "grad_norm": 19.25, "learning_rate": 1.7405708460754333e-05, "loss": 0.5421, "step": 946 }, { "epoch": 1.0860708650121933, "grad_norm": 34.0, "learning_rate": 1.740061162079511e-05, "loss": 0.7019, "step": 947 }, { "epoch": 1.0872184765456894, "grad_norm": 34.0, "learning_rate": 1.7395514780835884e-05, "loss": 0.5919, "step": 948 }, { "epoch": 1.0883660880791852, "grad_norm": 17.875, "learning_rate": 1.7390417940876658e-05, "loss": 0.2788, "step": 949 }, { "epoch": 1.089513699612681, "grad_norm": 16.0, "learning_rate": 1.738532110091743e-05, "loss": 0.7744, "step": 950 }, { "epoch": 1.090661311146177, "grad_norm": 61.5, "learning_rate": 1.7380224260958206e-05, "loss": 0.6198, "step": 951 }, { "epoch": 1.091808922679673, "grad_norm": 17.5, "learning_rate": 1.7375127420998983e-05, "loss": 0.5995, "step": 952 }, { "epoch": 1.0929565342131689, "grad_norm": 15.0, "learning_rate": 1.7370030581039757e-05, "loss": 0.4392, "step": 953 }, { "epoch": 1.0941041457466647, "grad_norm": 54.75, "learning_rate": 1.736493374108053e-05, "loss": 0.4673, "step": 954 }, { "epoch": 1.0952517572801606, "grad_norm": 31.5, "learning_rate": 1.7359836901121305e-05, "loss": 0.5318, "step": 955 }, { "epoch": 1.0963993688136566, "grad_norm": 35.0, "learning_rate": 1.7354740061162082e-05, "loss": 0.5184, "step": 956 }, { "epoch": 1.0975469803471525, "grad_norm": 23.75, "learning_rate": 1.7349643221202856e-05, "loss": 0.5015, "step": 957 }, { "epoch": 1.0986945918806483, "grad_norm": 54.0, "learning_rate": 1.734454638124363e-05, "loss": 0.5254, "step": 958 }, { "epoch": 1.0998422034141444, "grad_norm": 10.375, "learning_rate": 1.7339449541284407e-05, "loss": 0.4739, "step": 959 }, { "epoch": 1.1009898149476403, "grad_norm": 23.5, "learning_rate": 1.7334352701325177e-05, "loss": 0.5565, "step": 960 }, { "epoch": 1.102137426481136, "grad_norm": 11.875, "learning_rate": 1.7329255861365955e-05, "loss": 0.3887, "step": 961 }, { "epoch": 1.103285038014632, "grad_norm": 10.875, "learning_rate": 1.732415902140673e-05, "loss": 0.6166, "step": 962 }, { "epoch": 1.104432649548128, "grad_norm": 43.75, "learning_rate": 1.7319062181447503e-05, "loss": 0.9438, "step": 963 }, { "epoch": 1.1055802610816239, "grad_norm": 17.375, "learning_rate": 1.731396534148828e-05, "loss": 0.6131, "step": 964 }, { "epoch": 1.1067278726151197, "grad_norm": 36.5, "learning_rate": 1.7308868501529054e-05, "loss": 0.5897, "step": 965 }, { "epoch": 1.1078754841486158, "grad_norm": 34.25, "learning_rate": 1.7303771661569828e-05, "loss": 0.473, "step": 966 }, { "epoch": 1.1090230956821117, "grad_norm": 23.75, "learning_rate": 1.7298674821610605e-05, "loss": 0.6736, "step": 967 }, { "epoch": 1.1101707072156075, "grad_norm": 47.25, "learning_rate": 1.7293577981651376e-05, "loss": 0.4113, "step": 968 }, { "epoch": 1.1113183187491034, "grad_norm": 13.6875, "learning_rate": 1.7288481141692153e-05, "loss": 0.2634, "step": 969 }, { "epoch": 1.1124659302825994, "grad_norm": 36.75, "learning_rate": 1.7283384301732927e-05, "loss": 0.5289, "step": 970 }, { "epoch": 1.1136135418160953, "grad_norm": 28.625, "learning_rate": 1.72782874617737e-05, "loss": 0.5775, "step": 971 }, { "epoch": 1.1147611533495911, "grad_norm": 42.25, "learning_rate": 1.7273190621814478e-05, "loss": 0.7163, "step": 972 }, { "epoch": 1.115908764883087, "grad_norm": 57.0, "learning_rate": 1.7268093781855252e-05, "loss": 0.5009, "step": 973 }, { "epoch": 1.117056376416583, "grad_norm": 22.375, "learning_rate": 1.7262996941896026e-05, "loss": 0.4101, "step": 974 }, { "epoch": 1.118203987950079, "grad_norm": 41.25, "learning_rate": 1.72579001019368e-05, "loss": 0.4195, "step": 975 }, { "epoch": 1.1193515994835748, "grad_norm": 17.625, "learning_rate": 1.7252803261977577e-05, "loss": 0.4409, "step": 976 }, { "epoch": 1.1204992110170706, "grad_norm": 18.375, "learning_rate": 1.724770642201835e-05, "loss": 0.4041, "step": 977 }, { "epoch": 1.1216468225505667, "grad_norm": 39.0, "learning_rate": 1.7242609582059125e-05, "loss": 0.6333, "step": 978 }, { "epoch": 1.1227944340840625, "grad_norm": 62.25, "learning_rate": 1.72375127420999e-05, "loss": 0.648, "step": 979 }, { "epoch": 1.1239420456175584, "grad_norm": 57.0, "learning_rate": 1.7232415902140673e-05, "loss": 0.5549, "step": 980 }, { "epoch": 1.1250896571510545, "grad_norm": 17.875, "learning_rate": 1.722731906218145e-05, "loss": 0.3829, "step": 981 }, { "epoch": 1.1262372686845503, "grad_norm": 23.5, "learning_rate": 1.7222222222222224e-05, "loss": 0.3594, "step": 982 }, { "epoch": 1.1273848802180462, "grad_norm": 53.0, "learning_rate": 1.7217125382262998e-05, "loss": 0.6625, "step": 983 }, { "epoch": 1.128532491751542, "grad_norm": 49.75, "learning_rate": 1.7212028542303775e-05, "loss": 0.4887, "step": 984 }, { "epoch": 1.129680103285038, "grad_norm": 15.0, "learning_rate": 1.7206931702344545e-05, "loss": 0.5548, "step": 985 }, { "epoch": 1.130827714818534, "grad_norm": 40.5, "learning_rate": 1.7201834862385323e-05, "loss": 0.7024, "step": 986 }, { "epoch": 1.1319753263520298, "grad_norm": 58.25, "learning_rate": 1.7196738022426097e-05, "loss": 0.4027, "step": 987 }, { "epoch": 1.1331229378855259, "grad_norm": 70.5, "learning_rate": 1.719164118246687e-05, "loss": 0.6295, "step": 988 }, { "epoch": 1.1342705494190217, "grad_norm": 34.25, "learning_rate": 1.7186544342507648e-05, "loss": 0.3274, "step": 989 }, { "epoch": 1.1354181609525176, "grad_norm": 20.0, "learning_rate": 1.7181447502548422e-05, "loss": 0.1818, "step": 990 }, { "epoch": 1.1365657724860134, "grad_norm": 46.5, "learning_rate": 1.7176350662589196e-05, "loss": 0.4344, "step": 991 }, { "epoch": 1.1377133840195093, "grad_norm": 60.0, "learning_rate": 1.7171253822629973e-05, "loss": 0.3682, "step": 992 }, { "epoch": 1.1388609955530054, "grad_norm": 30.25, "learning_rate": 1.7166156982670747e-05, "loss": 0.4771, "step": 993 }, { "epoch": 1.1400086070865012, "grad_norm": 23.375, "learning_rate": 1.716106014271152e-05, "loss": 0.4939, "step": 994 }, { "epoch": 1.141156218619997, "grad_norm": 17.625, "learning_rate": 1.7155963302752295e-05, "loss": 0.6885, "step": 995 }, { "epoch": 1.1423038301534931, "grad_norm": 64.5, "learning_rate": 1.715086646279307e-05, "loss": 0.8163, "step": 996 }, { "epoch": 1.143451441686989, "grad_norm": 39.5, "learning_rate": 1.7145769622833846e-05, "loss": 0.3577, "step": 997 }, { "epoch": 1.1445990532204848, "grad_norm": 8.6875, "learning_rate": 1.714067278287462e-05, "loss": 0.202, "step": 998 }, { "epoch": 1.1457466647539807, "grad_norm": 52.0, "learning_rate": 1.7135575942915394e-05, "loss": 0.5541, "step": 999 }, { "epoch": 1.1468942762874768, "grad_norm": 22.125, "learning_rate": 1.7130479102956168e-05, "loss": 0.2125, "step": 1000 }, { "epoch": 1.1468942762874768, "eval_accuracy": 0.6, "eval_loss": 0.5487725734710693, "eval_runtime": 50.2711, "eval_samples_per_second": 1.989, "eval_steps_per_second": 1.989, "step": 1000 }, { "epoch": 1.1480418878209726, "grad_norm": 21.25, "learning_rate": 1.7125382262996945e-05, "loss": 0.3415, "step": 1001 }, { "epoch": 1.1491894993544685, "grad_norm": 39.5, "learning_rate": 1.712028542303772e-05, "loss": 0.6746, "step": 1002 }, { "epoch": 1.1503371108879645, "grad_norm": 16.875, "learning_rate": 1.7115188583078493e-05, "loss": 0.7315, "step": 1003 }, { "epoch": 1.1514847224214604, "grad_norm": 13.6875, "learning_rate": 1.7110091743119267e-05, "loss": 0.5293, "step": 1004 }, { "epoch": 1.1526323339549562, "grad_norm": 10.4375, "learning_rate": 1.710499490316004e-05, "loss": 0.4509, "step": 1005 }, { "epoch": 1.153779945488452, "grad_norm": 18.375, "learning_rate": 1.7099898063200818e-05, "loss": 0.3469, "step": 1006 }, { "epoch": 1.1549275570219482, "grad_norm": 12.375, "learning_rate": 1.709480122324159e-05, "loss": 0.4868, "step": 1007 }, { "epoch": 1.156075168555444, "grad_norm": 57.5, "learning_rate": 1.7089704383282366e-05, "loss": 0.5211, "step": 1008 }, { "epoch": 1.1572227800889399, "grad_norm": 13.875, "learning_rate": 1.7084607543323143e-05, "loss": 0.3623, "step": 1009 }, { "epoch": 1.1583703916224357, "grad_norm": 69.0, "learning_rate": 1.7079510703363917e-05, "loss": 0.274, "step": 1010 }, { "epoch": 1.1595180031559318, "grad_norm": 18.5, "learning_rate": 1.707441386340469e-05, "loss": 0.2365, "step": 1011 }, { "epoch": 1.1606656146894276, "grad_norm": 40.25, "learning_rate": 1.7069317023445465e-05, "loss": 0.2999, "step": 1012 }, { "epoch": 1.1618132262229235, "grad_norm": 57.5, "learning_rate": 1.706422018348624e-05, "loss": 0.5137, "step": 1013 }, { "epoch": 1.1629608377564193, "grad_norm": 20.875, "learning_rate": 1.7059123343527016e-05, "loss": 0.6691, "step": 1014 }, { "epoch": 1.1641084492899154, "grad_norm": 30.875, "learning_rate": 1.705402650356779e-05, "loss": 0.6642, "step": 1015 }, { "epoch": 1.1652560608234113, "grad_norm": 15.9375, "learning_rate": 1.7048929663608564e-05, "loss": 0.2695, "step": 1016 }, { "epoch": 1.1664036723569071, "grad_norm": 88.5, "learning_rate": 1.7043832823649338e-05, "loss": 0.8211, "step": 1017 }, { "epoch": 1.1675512838904032, "grad_norm": 45.5, "learning_rate": 1.7038735983690115e-05, "loss": 0.7956, "step": 1018 }, { "epoch": 1.168698895423899, "grad_norm": 80.0, "learning_rate": 1.703363914373089e-05, "loss": 0.8805, "step": 1019 }, { "epoch": 1.169846506957395, "grad_norm": 15.1875, "learning_rate": 1.7028542303771663e-05, "loss": 0.5262, "step": 1020 }, { "epoch": 1.1709941184908907, "grad_norm": 60.75, "learning_rate": 1.7023445463812437e-05, "loss": 1.1968, "step": 1021 }, { "epoch": 1.1721417300243868, "grad_norm": 31.375, "learning_rate": 1.701834862385321e-05, "loss": 0.744, "step": 1022 }, { "epoch": 1.1732893415578827, "grad_norm": 20.625, "learning_rate": 1.7013251783893988e-05, "loss": 0.3321, "step": 1023 }, { "epoch": 1.1744369530913785, "grad_norm": 19.875, "learning_rate": 1.700815494393476e-05, "loss": 0.4447, "step": 1024 }, { "epoch": 1.1755845646248746, "grad_norm": 23.625, "learning_rate": 1.7003058103975536e-05, "loss": 0.3697, "step": 1025 }, { "epoch": 1.1767321761583704, "grad_norm": 20.625, "learning_rate": 1.6997961264016313e-05, "loss": 0.2759, "step": 1026 }, { "epoch": 1.1778797876918663, "grad_norm": 58.75, "learning_rate": 1.6992864424057087e-05, "loss": 0.7182, "step": 1027 }, { "epoch": 1.1790273992253621, "grad_norm": 6.96875, "learning_rate": 1.698776758409786e-05, "loss": 0.1403, "step": 1028 }, { "epoch": 1.1801750107588582, "grad_norm": 24.875, "learning_rate": 1.6982670744138638e-05, "loss": 0.513, "step": 1029 }, { "epoch": 1.181322622292354, "grad_norm": 15.8125, "learning_rate": 1.697757390417941e-05, "loss": 0.5238, "step": 1030 }, { "epoch": 1.18247023382585, "grad_norm": 22.0, "learning_rate": 1.6972477064220186e-05, "loss": 0.445, "step": 1031 }, { "epoch": 1.1836178453593458, "grad_norm": 86.5, "learning_rate": 1.696738022426096e-05, "loss": 0.7085, "step": 1032 }, { "epoch": 1.1847654568928419, "grad_norm": 63.75, "learning_rate": 1.6962283384301734e-05, "loss": 1.0473, "step": 1033 }, { "epoch": 1.1859130684263377, "grad_norm": 63.5, "learning_rate": 1.695718654434251e-05, "loss": 0.3947, "step": 1034 }, { "epoch": 1.1870606799598336, "grad_norm": 12.375, "learning_rate": 1.6952089704383285e-05, "loss": 0.3453, "step": 1035 }, { "epoch": 1.1882082914933294, "grad_norm": 9.1875, "learning_rate": 1.694699286442406e-05, "loss": 0.2802, "step": 1036 }, { "epoch": 1.1893559030268255, "grad_norm": 29.75, "learning_rate": 1.6941896024464833e-05, "loss": 0.543, "step": 1037 }, { "epoch": 1.1905035145603213, "grad_norm": 60.75, "learning_rate": 1.6936799184505606e-05, "loss": 0.7737, "step": 1038 }, { "epoch": 1.1916511260938172, "grad_norm": 38.0, "learning_rate": 1.6931702344546384e-05, "loss": 0.7725, "step": 1039 }, { "epoch": 1.1927987376273133, "grad_norm": 9.9375, "learning_rate": 1.6926605504587158e-05, "loss": 0.2875, "step": 1040 }, { "epoch": 1.193946349160809, "grad_norm": 53.5, "learning_rate": 1.692150866462793e-05, "loss": 0.6683, "step": 1041 }, { "epoch": 1.195093960694305, "grad_norm": 58.25, "learning_rate": 1.6916411824668705e-05, "loss": 0.4406, "step": 1042 }, { "epoch": 1.1962415722278008, "grad_norm": 46.0, "learning_rate": 1.6911314984709483e-05, "loss": 0.6739, "step": 1043 }, { "epoch": 1.1973891837612969, "grad_norm": 47.25, "learning_rate": 1.6906218144750257e-05, "loss": 0.278, "step": 1044 }, { "epoch": 1.1985367952947927, "grad_norm": 42.5, "learning_rate": 1.690112130479103e-05, "loss": 0.4348, "step": 1045 }, { "epoch": 1.1996844068282886, "grad_norm": 20.0, "learning_rate": 1.6896024464831804e-05, "loss": 0.1145, "step": 1046 }, { "epoch": 1.2008320183617847, "grad_norm": 36.0, "learning_rate": 1.689092762487258e-05, "loss": 0.7542, "step": 1047 }, { "epoch": 1.2019796298952805, "grad_norm": 7.5625, "learning_rate": 1.6885830784913356e-05, "loss": 0.1475, "step": 1048 }, { "epoch": 1.2031272414287764, "grad_norm": 54.75, "learning_rate": 1.688073394495413e-05, "loss": 0.3082, "step": 1049 }, { "epoch": 1.2042748529622722, "grad_norm": 24.5, "learning_rate": 1.6875637104994903e-05, "loss": 0.8289, "step": 1050 }, { "epoch": 1.205422464495768, "grad_norm": 69.5, "learning_rate": 1.687054026503568e-05, "loss": 0.7198, "step": 1051 }, { "epoch": 1.2065700760292641, "grad_norm": 57.25, "learning_rate": 1.6865443425076455e-05, "loss": 0.3022, "step": 1052 }, { "epoch": 1.20771768756276, "grad_norm": 43.5, "learning_rate": 1.686034658511723e-05, "loss": 0.5085, "step": 1053 }, { "epoch": 1.2088652990962558, "grad_norm": 16.75, "learning_rate": 1.6855249745158006e-05, "loss": 0.4784, "step": 1054 }, { "epoch": 1.210012910629752, "grad_norm": 14.3125, "learning_rate": 1.6850152905198776e-05, "loss": 0.3837, "step": 1055 }, { "epoch": 1.2111605221632478, "grad_norm": 9.625, "learning_rate": 1.6845056065239554e-05, "loss": 0.2057, "step": 1056 }, { "epoch": 1.2123081336967436, "grad_norm": 23.625, "learning_rate": 1.6839959225280328e-05, "loss": 0.9273, "step": 1057 }, { "epoch": 1.2134557452302395, "grad_norm": 27.125, "learning_rate": 1.68348623853211e-05, "loss": 0.5371, "step": 1058 }, { "epoch": 1.2146033567637355, "grad_norm": 55.75, "learning_rate": 1.682976554536188e-05, "loss": 0.5682, "step": 1059 }, { "epoch": 1.2157509682972314, "grad_norm": 55.25, "learning_rate": 1.6824668705402653e-05, "loss": 0.4674, "step": 1060 }, { "epoch": 1.2168985798307272, "grad_norm": 18.625, "learning_rate": 1.6819571865443427e-05, "loss": 0.515, "step": 1061 }, { "epoch": 1.2180461913642233, "grad_norm": 118.5, "learning_rate": 1.68144750254842e-05, "loss": 1.1109, "step": 1062 }, { "epoch": 1.2191938028977192, "grad_norm": 45.5, "learning_rate": 1.6809378185524974e-05, "loss": 0.2941, "step": 1063 }, { "epoch": 1.220341414431215, "grad_norm": 21.0, "learning_rate": 1.6804281345565752e-05, "loss": 0.4562, "step": 1064 }, { "epoch": 1.2214890259647109, "grad_norm": 9.375, "learning_rate": 1.6799184505606526e-05, "loss": 0.1801, "step": 1065 }, { "epoch": 1.222636637498207, "grad_norm": 12.5, "learning_rate": 1.67940876656473e-05, "loss": 0.2672, "step": 1066 }, { "epoch": 1.2237842490317028, "grad_norm": 77.5, "learning_rate": 1.6788990825688073e-05, "loss": 0.6601, "step": 1067 }, { "epoch": 1.2249318605651986, "grad_norm": 12.0625, "learning_rate": 1.678389398572885e-05, "loss": 0.1519, "step": 1068 }, { "epoch": 1.2260794720986945, "grad_norm": 24.75, "learning_rate": 1.6778797145769625e-05, "loss": 0.5777, "step": 1069 }, { "epoch": 1.2272270836321906, "grad_norm": 31.125, "learning_rate": 1.67737003058104e-05, "loss": 0.7135, "step": 1070 }, { "epoch": 1.2283746951656864, "grad_norm": 63.5, "learning_rate": 1.6768603465851176e-05, "loss": 0.8834, "step": 1071 }, { "epoch": 1.2295223066991823, "grad_norm": 64.0, "learning_rate": 1.6763506625891946e-05, "loss": 0.8991, "step": 1072 }, { "epoch": 1.2306699182326781, "grad_norm": 21.375, "learning_rate": 1.6758409785932724e-05, "loss": 0.2697, "step": 1073 }, { "epoch": 1.2318175297661742, "grad_norm": 44.0, "learning_rate": 1.6753312945973498e-05, "loss": 0.4755, "step": 1074 }, { "epoch": 1.23296514129967, "grad_norm": 28.875, "learning_rate": 1.674821610601427e-05, "loss": 0.3531, "step": 1075 }, { "epoch": 1.234112752833166, "grad_norm": 33.0, "learning_rate": 1.674311926605505e-05, "loss": 0.1501, "step": 1076 }, { "epoch": 1.235260364366662, "grad_norm": 23.0, "learning_rate": 1.6738022426095823e-05, "loss": 0.7386, "step": 1077 }, { "epoch": 1.2364079759001578, "grad_norm": 18.75, "learning_rate": 1.6732925586136597e-05, "loss": 0.1371, "step": 1078 }, { "epoch": 1.2375555874336537, "grad_norm": 32.25, "learning_rate": 1.672782874617737e-05, "loss": 0.4283, "step": 1079 }, { "epoch": 1.2387031989671495, "grad_norm": 15.25, "learning_rate": 1.6722731906218144e-05, "loss": 0.221, "step": 1080 }, { "epoch": 1.2398508105006456, "grad_norm": 30.625, "learning_rate": 1.671763506625892e-05, "loss": 0.4018, "step": 1081 }, { "epoch": 1.2409984220341415, "grad_norm": 67.0, "learning_rate": 1.6712538226299696e-05, "loss": 0.9727, "step": 1082 }, { "epoch": 1.2421460335676373, "grad_norm": 31.375, "learning_rate": 1.670744138634047e-05, "loss": 0.4461, "step": 1083 }, { "epoch": 1.2432936451011334, "grad_norm": 15.9375, "learning_rate": 1.6702344546381243e-05, "loss": 0.4182, "step": 1084 }, { "epoch": 1.2444412566346292, "grad_norm": 58.25, "learning_rate": 1.669724770642202e-05, "loss": 0.6867, "step": 1085 }, { "epoch": 1.245588868168125, "grad_norm": 30.75, "learning_rate": 1.6692150866462795e-05, "loss": 0.3318, "step": 1086 }, { "epoch": 1.246736479701621, "grad_norm": 52.5, "learning_rate": 1.668705402650357e-05, "loss": 0.4148, "step": 1087 }, { "epoch": 1.247884091235117, "grad_norm": 22.125, "learning_rate": 1.6681957186544346e-05, "loss": 0.5934, "step": 1088 }, { "epoch": 1.2490317027686129, "grad_norm": 33.5, "learning_rate": 1.6676860346585116e-05, "loss": 0.2049, "step": 1089 }, { "epoch": 1.2501793143021087, "grad_norm": 37.5, "learning_rate": 1.6671763506625894e-05, "loss": 0.3963, "step": 1090 }, { "epoch": 1.2513269258356048, "grad_norm": 18.5, "learning_rate": 1.6666666666666667e-05, "loss": 0.2542, "step": 1091 }, { "epoch": 1.2524745373691006, "grad_norm": 22.75, "learning_rate": 1.666156982670744e-05, "loss": 0.608, "step": 1092 }, { "epoch": 1.2536221489025965, "grad_norm": 29.625, "learning_rate": 1.665647298674822e-05, "loss": 0.6878, "step": 1093 }, { "epoch": 1.2547697604360923, "grad_norm": 30.125, "learning_rate": 1.6651376146788993e-05, "loss": 0.172, "step": 1094 }, { "epoch": 1.2559173719695882, "grad_norm": 32.75, "learning_rate": 1.6646279306829766e-05, "loss": 0.535, "step": 1095 }, { "epoch": 1.2570649835030843, "grad_norm": 53.25, "learning_rate": 1.6641182466870544e-05, "loss": 0.5492, "step": 1096 }, { "epoch": 1.2582125950365801, "grad_norm": 29.0, "learning_rate": 1.6636085626911314e-05, "loss": 0.3528, "step": 1097 }, { "epoch": 1.259360206570076, "grad_norm": 72.5, "learning_rate": 1.663098878695209e-05, "loss": 0.5466, "step": 1098 }, { "epoch": 1.260507818103572, "grad_norm": 33.25, "learning_rate": 1.6625891946992865e-05, "loss": 0.2994, "step": 1099 }, { "epoch": 1.261655429637068, "grad_norm": 44.5, "learning_rate": 1.662079510703364e-05, "loss": 0.5398, "step": 1100 }, { "epoch": 1.261655429637068, "eval_accuracy": 0.56, "eval_loss": 0.5154783129692078, "eval_runtime": 49.6732, "eval_samples_per_second": 2.013, "eval_steps_per_second": 2.013, "step": 1100 }, { "epoch": 1.2628030411705637, "grad_norm": 23.5, "learning_rate": 1.6615698267074417e-05, "loss": 0.6216, "step": 1101 }, { "epoch": 1.2639506527040596, "grad_norm": 42.0, "learning_rate": 1.661060142711519e-05, "loss": 0.2358, "step": 1102 }, { "epoch": 1.2650982642375557, "grad_norm": 32.75, "learning_rate": 1.6605504587155964e-05, "loss": 0.2636, "step": 1103 }, { "epoch": 1.2662458757710515, "grad_norm": 19.875, "learning_rate": 1.660040774719674e-05, "loss": 0.2472, "step": 1104 }, { "epoch": 1.2673934873045474, "grad_norm": 11.8125, "learning_rate": 1.6595310907237516e-05, "loss": 0.1907, "step": 1105 }, { "epoch": 1.2685410988380434, "grad_norm": 74.0, "learning_rate": 1.659021406727829e-05, "loss": 0.8738, "step": 1106 }, { "epoch": 1.2696887103715393, "grad_norm": 39.5, "learning_rate": 1.6585117227319063e-05, "loss": 0.4113, "step": 1107 }, { "epoch": 1.2708363219050351, "grad_norm": 34.25, "learning_rate": 1.6580020387359837e-05, "loss": 0.7458, "step": 1108 }, { "epoch": 1.271983933438531, "grad_norm": 130.0, "learning_rate": 1.657492354740061e-05, "loss": 1.2238, "step": 1109 }, { "epoch": 1.2731315449720269, "grad_norm": 37.5, "learning_rate": 1.656982670744139e-05, "loss": 0.6994, "step": 1110 }, { "epoch": 1.274279156505523, "grad_norm": 83.5, "learning_rate": 1.6564729867482163e-05, "loss": 1.1055, "step": 1111 }, { "epoch": 1.2754267680390188, "grad_norm": 7.15625, "learning_rate": 1.6559633027522936e-05, "loss": 0.1926, "step": 1112 }, { "epoch": 1.2765743795725146, "grad_norm": 151.0, "learning_rate": 1.6554536187563714e-05, "loss": 0.4989, "step": 1113 }, { "epoch": 1.2777219911060107, "grad_norm": 43.75, "learning_rate": 1.6549439347604484e-05, "loss": 0.4593, "step": 1114 }, { "epoch": 1.2788696026395066, "grad_norm": 23.5, "learning_rate": 1.654434250764526e-05, "loss": 0.2898, "step": 1115 }, { "epoch": 1.2800172141730024, "grad_norm": 37.5, "learning_rate": 1.6539245667686035e-05, "loss": 0.3342, "step": 1116 }, { "epoch": 1.2811648257064983, "grad_norm": 35.25, "learning_rate": 1.653414882772681e-05, "loss": 0.4059, "step": 1117 }, { "epoch": 1.2823124372399943, "grad_norm": 17.875, "learning_rate": 1.6529051987767587e-05, "loss": 0.3272, "step": 1118 }, { "epoch": 1.2834600487734902, "grad_norm": 59.25, "learning_rate": 1.652395514780836e-05, "loss": 0.5725, "step": 1119 }, { "epoch": 1.284607660306986, "grad_norm": 66.0, "learning_rate": 1.6518858307849134e-05, "loss": 0.8477, "step": 1120 }, { "epoch": 1.285755271840482, "grad_norm": 67.0, "learning_rate": 1.6513761467889912e-05, "loss": 0.5421, "step": 1121 }, { "epoch": 1.286902883373978, "grad_norm": 23.75, "learning_rate": 1.6508664627930682e-05, "loss": 0.457, "step": 1122 }, { "epoch": 1.2880504949074738, "grad_norm": 22.875, "learning_rate": 1.650356778797146e-05, "loss": 0.5799, "step": 1123 }, { "epoch": 1.2891981064409697, "grad_norm": 48.25, "learning_rate": 1.6498470948012233e-05, "loss": 0.5672, "step": 1124 }, { "epoch": 1.2903457179744655, "grad_norm": 31.625, "learning_rate": 1.6493374108053007e-05, "loss": 0.6196, "step": 1125 }, { "epoch": 1.2914933295079616, "grad_norm": 79.5, "learning_rate": 1.6488277268093785e-05, "loss": 0.6727, "step": 1126 }, { "epoch": 1.2926409410414574, "grad_norm": 55.25, "learning_rate": 1.648318042813456e-05, "loss": 0.6848, "step": 1127 }, { "epoch": 1.2937885525749535, "grad_norm": 49.5, "learning_rate": 1.6478083588175332e-05, "loss": 1.015, "step": 1128 }, { "epoch": 1.2949361641084494, "grad_norm": 30.375, "learning_rate": 1.6472986748216106e-05, "loss": 0.9048, "step": 1129 }, { "epoch": 1.2960837756419452, "grad_norm": 65.5, "learning_rate": 1.6467889908256884e-05, "loss": 0.7712, "step": 1130 }, { "epoch": 1.297231387175441, "grad_norm": 14.8125, "learning_rate": 1.6462793068297658e-05, "loss": 0.1942, "step": 1131 }, { "epoch": 1.298378998708937, "grad_norm": 57.0, "learning_rate": 1.645769622833843e-05, "loss": 0.5278, "step": 1132 }, { "epoch": 1.299526610242433, "grad_norm": 20.125, "learning_rate": 1.6452599388379205e-05, "loss": 0.3787, "step": 1133 }, { "epoch": 1.3006742217759288, "grad_norm": 20.875, "learning_rate": 1.644750254841998e-05, "loss": 0.347, "step": 1134 }, { "epoch": 1.3018218333094247, "grad_norm": 36.25, "learning_rate": 1.6442405708460757e-05, "loss": 0.613, "step": 1135 }, { "epoch": 1.3029694448429208, "grad_norm": 46.75, "learning_rate": 1.643730886850153e-05, "loss": 0.3531, "step": 1136 }, { "epoch": 1.3041170563764166, "grad_norm": 51.5, "learning_rate": 1.6432212028542304e-05, "loss": 0.4654, "step": 1137 }, { "epoch": 1.3052646679099125, "grad_norm": 59.5, "learning_rate": 1.642711518858308e-05, "loss": 0.6825, "step": 1138 }, { "epoch": 1.3064122794434083, "grad_norm": 20.125, "learning_rate": 1.6422018348623852e-05, "loss": 0.5258, "step": 1139 }, { "epoch": 1.3075598909769044, "grad_norm": 21.375, "learning_rate": 1.641692150866463e-05, "loss": 0.2334, "step": 1140 }, { "epoch": 1.3087075025104002, "grad_norm": 57.5, "learning_rate": 1.6411824668705403e-05, "loss": 0.9003, "step": 1141 }, { "epoch": 1.309855114043896, "grad_norm": 61.5, "learning_rate": 1.6406727828746177e-05, "loss": 0.6237, "step": 1142 }, { "epoch": 1.3110027255773922, "grad_norm": 20.875, "learning_rate": 1.6401630988786955e-05, "loss": 0.3164, "step": 1143 }, { "epoch": 1.312150337110888, "grad_norm": 48.5, "learning_rate": 1.639653414882773e-05, "loss": 0.4018, "step": 1144 }, { "epoch": 1.3132979486443839, "grad_norm": 56.0, "learning_rate": 1.6391437308868502e-05, "loss": 0.7092, "step": 1145 }, { "epoch": 1.3144455601778797, "grad_norm": 38.75, "learning_rate": 1.638634046890928e-05, "loss": 0.5181, "step": 1146 }, { "epoch": 1.3155931717113756, "grad_norm": 40.5, "learning_rate": 1.6381243628950054e-05, "loss": 0.3165, "step": 1147 }, { "epoch": 1.3167407832448716, "grad_norm": 32.25, "learning_rate": 1.6376146788990827e-05, "loss": 0.7836, "step": 1148 }, { "epoch": 1.3178883947783675, "grad_norm": 61.5, "learning_rate": 1.63710499490316e-05, "loss": 0.7627, "step": 1149 }, { "epoch": 1.3190360063118634, "grad_norm": 79.5, "learning_rate": 1.6365953109072375e-05, "loss": 0.9356, "step": 1150 }, { "epoch": 1.3201836178453594, "grad_norm": 189.0, "learning_rate": 1.6360856269113153e-05, "loss": 1.0391, "step": 1151 }, { "epoch": 1.3213312293788553, "grad_norm": 20.25, "learning_rate": 1.6355759429153926e-05, "loss": 0.6406, "step": 1152 }, { "epoch": 1.3224788409123511, "grad_norm": 26.375, "learning_rate": 1.63506625891947e-05, "loss": 0.3832, "step": 1153 }, { "epoch": 1.323626452445847, "grad_norm": 33.0, "learning_rate": 1.6345565749235474e-05, "loss": 0.4041, "step": 1154 }, { "epoch": 1.324774063979343, "grad_norm": 23.625, "learning_rate": 1.634046890927625e-05, "loss": 0.3527, "step": 1155 }, { "epoch": 1.325921675512839, "grad_norm": 99.5, "learning_rate": 1.6335372069317022e-05, "loss": 0.9746, "step": 1156 }, { "epoch": 1.3270692870463348, "grad_norm": 45.0, "learning_rate": 1.63302752293578e-05, "loss": 0.2891, "step": 1157 }, { "epoch": 1.3282168985798308, "grad_norm": 56.5, "learning_rate": 1.6325178389398573e-05, "loss": 0.8078, "step": 1158 }, { "epoch": 1.3293645101133267, "grad_norm": 16.125, "learning_rate": 1.6320081549439347e-05, "loss": 0.6181, "step": 1159 }, { "epoch": 1.3305121216468225, "grad_norm": 31.5, "learning_rate": 1.6314984709480125e-05, "loss": 0.3313, "step": 1160 }, { "epoch": 1.3316597331803184, "grad_norm": 11.75, "learning_rate": 1.63098878695209e-05, "loss": 0.2764, "step": 1161 }, { "epoch": 1.3328073447138145, "grad_norm": 32.0, "learning_rate": 1.6304791029561672e-05, "loss": 0.6992, "step": 1162 }, { "epoch": 1.3339549562473103, "grad_norm": 40.25, "learning_rate": 1.629969418960245e-05, "loss": 0.4695, "step": 1163 }, { "epoch": 1.3351025677808062, "grad_norm": 60.75, "learning_rate": 1.6294597349643224e-05, "loss": 0.5952, "step": 1164 }, { "epoch": 1.3362501793143022, "grad_norm": 52.75, "learning_rate": 1.6289500509683997e-05, "loss": 0.4987, "step": 1165 }, { "epoch": 1.337397790847798, "grad_norm": 28.25, "learning_rate": 1.628440366972477e-05, "loss": 0.31, "step": 1166 }, { "epoch": 1.338545402381294, "grad_norm": 21.25, "learning_rate": 1.6279306829765545e-05, "loss": 0.4244, "step": 1167 }, { "epoch": 1.3396930139147898, "grad_norm": 30.75, "learning_rate": 1.6274209989806323e-05, "loss": 0.5522, "step": 1168 }, { "epoch": 1.3408406254482856, "grad_norm": 18.5, "learning_rate": 1.6269113149847096e-05, "loss": 0.3786, "step": 1169 }, { "epoch": 1.3419882369817817, "grad_norm": 14.6875, "learning_rate": 1.626401630988787e-05, "loss": 0.0966, "step": 1170 }, { "epoch": 1.3431358485152776, "grad_norm": 50.0, "learning_rate": 1.6258919469928644e-05, "loss": 0.4607, "step": 1171 }, { "epoch": 1.3442834600487734, "grad_norm": 27.375, "learning_rate": 1.625382262996942e-05, "loss": 0.847, "step": 1172 }, { "epoch": 1.3454310715822695, "grad_norm": 13.0625, "learning_rate": 1.6248725790010195e-05, "loss": 0.4091, "step": 1173 }, { "epoch": 1.3465786831157653, "grad_norm": 16.0, "learning_rate": 1.624362895005097e-05, "loss": 0.2403, "step": 1174 }, { "epoch": 1.3477262946492612, "grad_norm": 37.25, "learning_rate": 1.6238532110091743e-05, "loss": 0.421, "step": 1175 }, { "epoch": 1.348873906182757, "grad_norm": 55.25, "learning_rate": 1.6233435270132517e-05, "loss": 0.662, "step": 1176 }, { "epoch": 1.3500215177162531, "grad_norm": 40.5, "learning_rate": 1.6228338430173294e-05, "loss": 0.4565, "step": 1177 }, { "epoch": 1.351169129249749, "grad_norm": 14.3125, "learning_rate": 1.622324159021407e-05, "loss": 0.4465, "step": 1178 }, { "epoch": 1.3523167407832448, "grad_norm": 28.625, "learning_rate": 1.6218144750254842e-05, "loss": 0.3729, "step": 1179 }, { "epoch": 1.353464352316741, "grad_norm": 55.25, "learning_rate": 1.621304791029562e-05, "loss": 0.3222, "step": 1180 }, { "epoch": 1.3546119638502367, "grad_norm": 22.875, "learning_rate": 1.6207951070336393e-05, "loss": 0.437, "step": 1181 }, { "epoch": 1.3557595753837326, "grad_norm": 38.0, "learning_rate": 1.6202854230377167e-05, "loss": 0.651, "step": 1182 }, { "epoch": 1.3569071869172284, "grad_norm": 21.625, "learning_rate": 1.6197757390417945e-05, "loss": 0.4508, "step": 1183 }, { "epoch": 1.3580547984507243, "grad_norm": 27.875, "learning_rate": 1.6192660550458715e-05, "loss": 0.3005, "step": 1184 }, { "epoch": 1.3592024099842204, "grad_norm": 42.0, "learning_rate": 1.6187563710499492e-05, "loss": 0.2964, "step": 1185 }, { "epoch": 1.3603500215177162, "grad_norm": 26.125, "learning_rate": 1.6182466870540266e-05, "loss": 0.6497, "step": 1186 }, { "epoch": 1.3614976330512123, "grad_norm": 20.375, "learning_rate": 1.617737003058104e-05, "loss": 0.3097, "step": 1187 }, { "epoch": 1.3626452445847081, "grad_norm": 56.5, "learning_rate": 1.6172273190621818e-05, "loss": 0.437, "step": 1188 }, { "epoch": 1.363792856118204, "grad_norm": 69.5, "learning_rate": 1.616717635066259e-05, "loss": 0.4491, "step": 1189 }, { "epoch": 1.3649404676516999, "grad_norm": 58.75, "learning_rate": 1.6162079510703365e-05, "loss": 0.4697, "step": 1190 }, { "epoch": 1.3660880791851957, "grad_norm": 15.0, "learning_rate": 1.615698267074414e-05, "loss": 0.2935, "step": 1191 }, { "epoch": 1.3672356907186918, "grad_norm": 69.0, "learning_rate": 1.6151885830784913e-05, "loss": 0.8532, "step": 1192 }, { "epoch": 1.3683833022521876, "grad_norm": 27.5, "learning_rate": 1.614678899082569e-05, "loss": 0.3305, "step": 1193 }, { "epoch": 1.3695309137856835, "grad_norm": 65.5, "learning_rate": 1.6141692150866464e-05, "loss": 0.6747, "step": 1194 }, { "epoch": 1.3706785253191796, "grad_norm": 39.5, "learning_rate": 1.6136595310907238e-05, "loss": 0.4378, "step": 1195 }, { "epoch": 1.3718261368526754, "grad_norm": 36.25, "learning_rate": 1.6131498470948012e-05, "loss": 0.4756, "step": 1196 }, { "epoch": 1.3729737483861713, "grad_norm": 25.75, "learning_rate": 1.612640163098879e-05, "loss": 0.2116, "step": 1197 }, { "epoch": 1.374121359919667, "grad_norm": 51.5, "learning_rate": 1.6121304791029563e-05, "loss": 0.6976, "step": 1198 }, { "epoch": 1.3752689714531632, "grad_norm": 28.25, "learning_rate": 1.6116207951070337e-05, "loss": 0.3644, "step": 1199 }, { "epoch": 1.376416582986659, "grad_norm": 21.25, "learning_rate": 1.6111111111111115e-05, "loss": 0.4288, "step": 1200 }, { "epoch": 1.376416582986659, "eval_accuracy": 0.61, "eval_loss": 0.5443911552429199, "eval_runtime": 49.3817, "eval_samples_per_second": 2.025, "eval_steps_per_second": 2.025, "step": 1200 }, { "epoch": 1.3775641945201549, "grad_norm": 16.75, "learning_rate": 1.6106014271151885e-05, "loss": 0.5041, "step": 1201 }, { "epoch": 1.378711806053651, "grad_norm": 50.25, "learning_rate": 1.6100917431192662e-05, "loss": 0.5077, "step": 1202 }, { "epoch": 1.3798594175871468, "grad_norm": 15.875, "learning_rate": 1.6095820591233436e-05, "loss": 0.2509, "step": 1203 }, { "epoch": 1.3810070291206427, "grad_norm": 52.5, "learning_rate": 1.609072375127421e-05, "loss": 0.6619, "step": 1204 }, { "epoch": 1.3821546406541385, "grad_norm": 27.0, "learning_rate": 1.6085626911314988e-05, "loss": 0.3906, "step": 1205 }, { "epoch": 1.3833022521876344, "grad_norm": 28.5, "learning_rate": 1.608053007135576e-05, "loss": 0.506, "step": 1206 }, { "epoch": 1.3844498637211304, "grad_norm": 34.25, "learning_rate": 1.6075433231396535e-05, "loss": 0.3932, "step": 1207 }, { "epoch": 1.3855974752546263, "grad_norm": 36.75, "learning_rate": 1.6070336391437313e-05, "loss": 0.5362, "step": 1208 }, { "epoch": 1.3867450867881221, "grad_norm": 52.5, "learning_rate": 1.6065239551478083e-05, "loss": 0.5699, "step": 1209 }, { "epoch": 1.3878926983216182, "grad_norm": 45.5, "learning_rate": 1.606014271151886e-05, "loss": 0.5685, "step": 1210 }, { "epoch": 1.389040309855114, "grad_norm": 60.25, "learning_rate": 1.6055045871559634e-05, "loss": 0.9313, "step": 1211 }, { "epoch": 1.39018792138861, "grad_norm": 38.0, "learning_rate": 1.6049949031600408e-05, "loss": 0.5542, "step": 1212 }, { "epoch": 1.3913355329221058, "grad_norm": 40.75, "learning_rate": 1.6044852191641186e-05, "loss": 0.8328, "step": 1213 }, { "epoch": 1.3924831444556018, "grad_norm": 42.25, "learning_rate": 1.603975535168196e-05, "loss": 0.2783, "step": 1214 }, { "epoch": 1.3936307559890977, "grad_norm": 39.5, "learning_rate": 1.6034658511722733e-05, "loss": 0.5385, "step": 1215 }, { "epoch": 1.3947783675225935, "grad_norm": 42.75, "learning_rate": 1.6029561671763507e-05, "loss": 0.5375, "step": 1216 }, { "epoch": 1.3959259790560896, "grad_norm": 22.25, "learning_rate": 1.602446483180428e-05, "loss": 0.6028, "step": 1217 }, { "epoch": 1.3970735905895855, "grad_norm": 34.0, "learning_rate": 1.601936799184506e-05, "loss": 0.5153, "step": 1218 }, { "epoch": 1.3982212021230813, "grad_norm": 79.5, "learning_rate": 1.6014271151885832e-05, "loss": 0.7959, "step": 1219 }, { "epoch": 1.3993688136565772, "grad_norm": 28.0, "learning_rate": 1.6009174311926606e-05, "loss": 0.271, "step": 1220 }, { "epoch": 1.400516425190073, "grad_norm": 76.5, "learning_rate": 1.600407747196738e-05, "loss": 0.6952, "step": 1221 }, { "epoch": 1.401664036723569, "grad_norm": 21.875, "learning_rate": 1.5998980632008157e-05, "loss": 0.452, "step": 1222 }, { "epoch": 1.402811648257065, "grad_norm": 70.5, "learning_rate": 1.599388379204893e-05, "loss": 0.5592, "step": 1223 }, { "epoch": 1.403959259790561, "grad_norm": 17.875, "learning_rate": 1.5988786952089705e-05, "loss": 0.451, "step": 1224 }, { "epoch": 1.4051068713240569, "grad_norm": 30.125, "learning_rate": 1.5983690112130483e-05, "loss": 0.4143, "step": 1225 }, { "epoch": 1.4062544828575527, "grad_norm": 25.625, "learning_rate": 1.5978593272171253e-05, "loss": 0.454, "step": 1226 }, { "epoch": 1.4074020943910486, "grad_norm": 24.625, "learning_rate": 1.597349643221203e-05, "loss": 0.4827, "step": 1227 }, { "epoch": 1.4085497059245444, "grad_norm": 14.6875, "learning_rate": 1.5968399592252804e-05, "loss": 0.1517, "step": 1228 }, { "epoch": 1.4096973174580405, "grad_norm": 12.3125, "learning_rate": 1.5963302752293578e-05, "loss": 0.4015, "step": 1229 }, { "epoch": 1.4108449289915364, "grad_norm": 59.0, "learning_rate": 1.5958205912334355e-05, "loss": 0.5366, "step": 1230 }, { "epoch": 1.4119925405250322, "grad_norm": 11.1875, "learning_rate": 1.595310907237513e-05, "loss": 0.3743, "step": 1231 }, { "epoch": 1.4131401520585283, "grad_norm": 18.75, "learning_rate": 1.5948012232415903e-05, "loss": 0.4668, "step": 1232 }, { "epoch": 1.4142877635920241, "grad_norm": 50.75, "learning_rate": 1.5942915392456677e-05, "loss": 0.3211, "step": 1233 }, { "epoch": 1.41543537512552, "grad_norm": 41.5, "learning_rate": 1.593781855249745e-05, "loss": 0.5208, "step": 1234 }, { "epoch": 1.4165829866590158, "grad_norm": 16.5, "learning_rate": 1.593272171253823e-05, "loss": 0.2334, "step": 1235 }, { "epoch": 1.417730598192512, "grad_norm": 72.0, "learning_rate": 1.5927624872579002e-05, "loss": 0.4065, "step": 1236 }, { "epoch": 1.4188782097260078, "grad_norm": 21.0, "learning_rate": 1.5922528032619776e-05, "loss": 0.4257, "step": 1237 }, { "epoch": 1.4200258212595036, "grad_norm": 18.75, "learning_rate": 1.591743119266055e-05, "loss": 0.3615, "step": 1238 }, { "epoch": 1.4211734327929997, "grad_norm": 54.5, "learning_rate": 1.5912334352701327e-05, "loss": 0.2902, "step": 1239 }, { "epoch": 1.4223210443264955, "grad_norm": 8.3125, "learning_rate": 1.59072375127421e-05, "loss": 0.1653, "step": 1240 }, { "epoch": 1.4234686558599914, "grad_norm": 18.125, "learning_rate": 1.5902140672782875e-05, "loss": 0.3842, "step": 1241 }, { "epoch": 1.4246162673934872, "grad_norm": 85.0, "learning_rate": 1.5897043832823652e-05, "loss": 0.7718, "step": 1242 }, { "epoch": 1.425763878926983, "grad_norm": 27.125, "learning_rate": 1.5891946992864423e-05, "loss": 0.195, "step": 1243 }, { "epoch": 1.4269114904604792, "grad_norm": 31.125, "learning_rate": 1.58868501529052e-05, "loss": 0.5963, "step": 1244 }, { "epoch": 1.428059101993975, "grad_norm": 67.0, "learning_rate": 1.5881753312945974e-05, "loss": 0.709, "step": 1245 }, { "epoch": 1.429206713527471, "grad_norm": 20.25, "learning_rate": 1.5876656472986748e-05, "loss": 0.3003, "step": 1246 }, { "epoch": 1.430354325060967, "grad_norm": 40.25, "learning_rate": 1.5871559633027525e-05, "loss": 0.7344, "step": 1247 }, { "epoch": 1.4315019365944628, "grad_norm": 26.75, "learning_rate": 1.58664627930683e-05, "loss": 1.0281, "step": 1248 }, { "epoch": 1.4326495481279586, "grad_norm": 49.25, "learning_rate": 1.5861365953109073e-05, "loss": 0.3, "step": 1249 }, { "epoch": 1.4337971596614545, "grad_norm": 27.125, "learning_rate": 1.585626911314985e-05, "loss": 0.5945, "step": 1250 }, { "epoch": 1.4349447711949506, "grad_norm": 41.5, "learning_rate": 1.585117227319062e-05, "loss": 0.6692, "step": 1251 }, { "epoch": 1.4360923827284464, "grad_norm": 14.375, "learning_rate": 1.58460754332314e-05, "loss": 0.3908, "step": 1252 }, { "epoch": 1.4372399942619423, "grad_norm": 77.5, "learning_rate": 1.5840978593272172e-05, "loss": 0.7376, "step": 1253 }, { "epoch": 1.4383876057954383, "grad_norm": 29.75, "learning_rate": 1.5835881753312946e-05, "loss": 0.355, "step": 1254 }, { "epoch": 1.4395352173289342, "grad_norm": 32.0, "learning_rate": 1.5830784913353723e-05, "loss": 0.7525, "step": 1255 }, { "epoch": 1.44068282886243, "grad_norm": 42.75, "learning_rate": 1.5825688073394497e-05, "loss": 0.2832, "step": 1256 }, { "epoch": 1.441830440395926, "grad_norm": 21.125, "learning_rate": 1.582059123343527e-05, "loss": 0.3375, "step": 1257 }, { "epoch": 1.442978051929422, "grad_norm": 33.25, "learning_rate": 1.5815494393476045e-05, "loss": 0.3517, "step": 1258 }, { "epoch": 1.4441256634629178, "grad_norm": 35.0, "learning_rate": 1.5810397553516822e-05, "loss": 0.382, "step": 1259 }, { "epoch": 1.4452732749964137, "grad_norm": 53.75, "learning_rate": 1.5805300713557596e-05, "loss": 0.3113, "step": 1260 }, { "epoch": 1.4464208865299097, "grad_norm": 43.75, "learning_rate": 1.580020387359837e-05, "loss": 0.3177, "step": 1261 }, { "epoch": 1.4475684980634056, "grad_norm": 35.0, "learning_rate": 1.5795107033639144e-05, "loss": 0.3791, "step": 1262 }, { "epoch": 1.4487161095969014, "grad_norm": 45.0, "learning_rate": 1.5790010193679918e-05, "loss": 0.4492, "step": 1263 }, { "epoch": 1.4498637211303973, "grad_norm": 27.875, "learning_rate": 1.5784913353720695e-05, "loss": 0.3343, "step": 1264 }, { "epoch": 1.4510113326638931, "grad_norm": 19.125, "learning_rate": 1.577981651376147e-05, "loss": 0.8559, "step": 1265 }, { "epoch": 1.4521589441973892, "grad_norm": 8.0, "learning_rate": 1.5774719673802243e-05, "loss": 0.1379, "step": 1266 }, { "epoch": 1.453306555730885, "grad_norm": 49.5, "learning_rate": 1.576962283384302e-05, "loss": 0.4941, "step": 1267 }, { "epoch": 1.454454167264381, "grad_norm": 84.5, "learning_rate": 1.576452599388379e-05, "loss": 1.4308, "step": 1268 }, { "epoch": 1.455601778797877, "grad_norm": 92.0, "learning_rate": 1.5759429153924568e-05, "loss": 0.9692, "step": 1269 }, { "epoch": 1.4567493903313729, "grad_norm": 88.0, "learning_rate": 1.5754332313965342e-05, "loss": 0.9589, "step": 1270 }, { "epoch": 1.4578970018648687, "grad_norm": 50.25, "learning_rate": 1.5749235474006116e-05, "loss": 0.5352, "step": 1271 }, { "epoch": 1.4590446133983646, "grad_norm": 42.0, "learning_rate": 1.5744138634046893e-05, "loss": 0.3708, "step": 1272 }, { "epoch": 1.4601922249318606, "grad_norm": 35.0, "learning_rate": 1.5739041794087667e-05, "loss": 0.7022, "step": 1273 }, { "epoch": 1.4613398364653565, "grad_norm": 13.375, "learning_rate": 1.573394495412844e-05, "loss": 0.3201, "step": 1274 }, { "epoch": 1.4624874479988523, "grad_norm": 87.5, "learning_rate": 1.572884811416922e-05, "loss": 0.576, "step": 1275 }, { "epoch": 1.4636350595323484, "grad_norm": 68.5, "learning_rate": 1.5723751274209992e-05, "loss": 0.5697, "step": 1276 }, { "epoch": 1.4647826710658443, "grad_norm": 31.75, "learning_rate": 1.5718654434250766e-05, "loss": 0.4631, "step": 1277 }, { "epoch": 1.46593028259934, "grad_norm": 19.5, "learning_rate": 1.571355759429154e-05, "loss": 0.4516, "step": 1278 }, { "epoch": 1.467077894132836, "grad_norm": 52.0, "learning_rate": 1.5708460754332314e-05, "loss": 0.6808, "step": 1279 }, { "epoch": 1.4682255056663318, "grad_norm": 17.875, "learning_rate": 1.570336391437309e-05, "loss": 0.3936, "step": 1280 }, { "epoch": 1.4693731171998279, "grad_norm": 24.25, "learning_rate": 1.5698267074413865e-05, "loss": 0.4196, "step": 1281 }, { "epoch": 1.4705207287333237, "grad_norm": 111.0, "learning_rate": 1.569317023445464e-05, "loss": 0.8228, "step": 1282 }, { "epoch": 1.4716683402668198, "grad_norm": 36.5, "learning_rate": 1.5688073394495413e-05, "loss": 0.5546, "step": 1283 }, { "epoch": 1.4728159518003157, "grad_norm": 40.5, "learning_rate": 1.568297655453619e-05, "loss": 0.4347, "step": 1284 }, { "epoch": 1.4739635633338115, "grad_norm": 59.75, "learning_rate": 1.5677879714576964e-05, "loss": 0.8506, "step": 1285 }, { "epoch": 1.4751111748673074, "grad_norm": 58.25, "learning_rate": 1.5672782874617738e-05, "loss": 0.4958, "step": 1286 }, { "epoch": 1.4762587864008032, "grad_norm": 41.5, "learning_rate": 1.5667686034658512e-05, "loss": 0.6571, "step": 1287 }, { "epoch": 1.4774063979342993, "grad_norm": 20.75, "learning_rate": 1.5662589194699286e-05, "loss": 0.2749, "step": 1288 }, { "epoch": 1.4785540094677951, "grad_norm": 24.875, "learning_rate": 1.5657492354740063e-05, "loss": 0.552, "step": 1289 }, { "epoch": 1.479701621001291, "grad_norm": 24.625, "learning_rate": 1.5652395514780837e-05, "loss": 0.5655, "step": 1290 }, { "epoch": 1.480849232534787, "grad_norm": 71.0, "learning_rate": 1.564729867482161e-05, "loss": 1.1072, "step": 1291 }, { "epoch": 1.481996844068283, "grad_norm": 56.5, "learning_rate": 1.564220183486239e-05, "loss": 0.9029, "step": 1292 }, { "epoch": 1.4831444556017788, "grad_norm": 75.0, "learning_rate": 1.563710499490316e-05, "loss": 0.8671, "step": 1293 }, { "epoch": 1.4842920671352746, "grad_norm": 68.5, "learning_rate": 1.5632008154943936e-05, "loss": 0.6165, "step": 1294 }, { "epoch": 1.4854396786687707, "grad_norm": 57.5, "learning_rate": 1.5626911314984713e-05, "loss": 0.4413, "step": 1295 }, { "epoch": 1.4865872902022665, "grad_norm": 38.25, "learning_rate": 1.5621814475025484e-05, "loss": 0.4508, "step": 1296 }, { "epoch": 1.4877349017357624, "grad_norm": 22.375, "learning_rate": 1.561671763506626e-05, "loss": 0.4694, "step": 1297 }, { "epoch": 1.4888825132692585, "grad_norm": 19.625, "learning_rate": 1.5611620795107035e-05, "loss": 0.4833, "step": 1298 }, { "epoch": 1.4900301248027543, "grad_norm": 74.0, "learning_rate": 1.560652395514781e-05, "loss": 0.6443, "step": 1299 }, { "epoch": 1.4911777363362502, "grad_norm": 30.25, "learning_rate": 1.5601427115188586e-05, "loss": 0.5003, "step": 1300 }, { "epoch": 1.4911777363362502, "eval_accuracy": 0.64, "eval_loss": 0.5184877514839172, "eval_runtime": 49.6613, "eval_samples_per_second": 2.014, "eval_steps_per_second": 2.014, "step": 1300 }, { "epoch": 1.492325347869746, "grad_norm": 26.5, "learning_rate": 1.559633027522936e-05, "loss": 0.4356, "step": 1301 }, { "epoch": 1.4934729594032419, "grad_norm": 93.0, "learning_rate": 1.5591233435270134e-05, "loss": 0.6945, "step": 1302 }, { "epoch": 1.494620570936738, "grad_norm": 84.0, "learning_rate": 1.5586136595310908e-05, "loss": 0.7059, "step": 1303 }, { "epoch": 1.4957681824702338, "grad_norm": 84.5, "learning_rate": 1.5581039755351682e-05, "loss": 0.8654, "step": 1304 }, { "epoch": 1.4969157940037299, "grad_norm": 79.5, "learning_rate": 1.5575942915392456e-05, "loss": 0.8112, "step": 1305 }, { "epoch": 1.4980634055372257, "grad_norm": 41.75, "learning_rate": 1.5570846075433233e-05, "loss": 1.0995, "step": 1306 }, { "epoch": 1.4992110170707216, "grad_norm": 28.625, "learning_rate": 1.5565749235474007e-05, "loss": 0.8355, "step": 1307 }, { "epoch": 1.5003586286042174, "grad_norm": 67.5, "learning_rate": 1.556065239551478e-05, "loss": 0.7727, "step": 1308 }, { "epoch": 1.5015062401377133, "grad_norm": 17.5, "learning_rate": 1.555555555555556e-05, "loss": 0.4682, "step": 1309 }, { "epoch": 1.5026538516712094, "grad_norm": 18.625, "learning_rate": 1.555045871559633e-05, "loss": 0.2126, "step": 1310 }, { "epoch": 1.5038014632047052, "grad_norm": 15.75, "learning_rate": 1.5545361875637106e-05, "loss": 0.4916, "step": 1311 }, { "epoch": 1.5049490747382013, "grad_norm": 31.625, "learning_rate": 1.554026503567788e-05, "loss": 0.2308, "step": 1312 }, { "epoch": 1.5060966862716971, "grad_norm": 51.75, "learning_rate": 1.5535168195718654e-05, "loss": 1.0898, "step": 1313 }, { "epoch": 1.507244297805193, "grad_norm": 31.75, "learning_rate": 1.553007135575943e-05, "loss": 0.4099, "step": 1314 }, { "epoch": 1.5083919093386888, "grad_norm": 88.0, "learning_rate": 1.5524974515800205e-05, "loss": 0.9649, "step": 1315 }, { "epoch": 1.5095395208721847, "grad_norm": 24.75, "learning_rate": 1.551987767584098e-05, "loss": 1.0352, "step": 1316 }, { "epoch": 1.5106871324056805, "grad_norm": 13.625, "learning_rate": 1.5514780835881756e-05, "loss": 0.3537, "step": 1317 }, { "epoch": 1.5118347439391766, "grad_norm": 94.0, "learning_rate": 1.550968399592253e-05, "loss": 0.9038, "step": 1318 }, { "epoch": 1.5129823554726725, "grad_norm": 26.0, "learning_rate": 1.5504587155963304e-05, "loss": 0.346, "step": 1319 }, { "epoch": 1.5141299670061685, "grad_norm": 44.25, "learning_rate": 1.5499490316004078e-05, "loss": 0.7941, "step": 1320 }, { "epoch": 1.5152775785396644, "grad_norm": 27.75, "learning_rate": 1.5494393476044852e-05, "loss": 0.3747, "step": 1321 }, { "epoch": 1.5164251900731602, "grad_norm": 97.5, "learning_rate": 1.548929663608563e-05, "loss": 0.9651, "step": 1322 }, { "epoch": 1.517572801606656, "grad_norm": 10.6875, "learning_rate": 1.5484199796126403e-05, "loss": 0.2523, "step": 1323 }, { "epoch": 1.518720413140152, "grad_norm": 23.25, "learning_rate": 1.5479102956167177e-05, "loss": 0.5667, "step": 1324 }, { "epoch": 1.519868024673648, "grad_norm": 22.25, "learning_rate": 1.547400611620795e-05, "loss": 0.3108, "step": 1325 }, { "epoch": 1.5210156362071439, "grad_norm": 29.125, "learning_rate": 1.5468909276248728e-05, "loss": 0.5994, "step": 1326 }, { "epoch": 1.52216324774064, "grad_norm": 37.0, "learning_rate": 1.5463812436289502e-05, "loss": 0.6835, "step": 1327 }, { "epoch": 1.5233108592741358, "grad_norm": 35.0, "learning_rate": 1.5458715596330276e-05, "loss": 0.3934, "step": 1328 }, { "epoch": 1.5244584708076316, "grad_norm": 42.5, "learning_rate": 1.545361875637105e-05, "loss": 0.4904, "step": 1329 }, { "epoch": 1.5256060823411275, "grad_norm": 79.5, "learning_rate": 1.5448521916411824e-05, "loss": 0.8999, "step": 1330 }, { "epoch": 1.5267536938746233, "grad_norm": 51.25, "learning_rate": 1.54434250764526e-05, "loss": 0.5231, "step": 1331 }, { "epoch": 1.5279013054081192, "grad_norm": 53.5, "learning_rate": 1.5438328236493375e-05, "loss": 0.6297, "step": 1332 }, { "epoch": 1.5290489169416153, "grad_norm": 65.5, "learning_rate": 1.543323139653415e-05, "loss": 0.5863, "step": 1333 }, { "epoch": 1.5301965284751113, "grad_norm": 44.0, "learning_rate": 1.5428134556574926e-05, "loss": 0.402, "step": 1334 }, { "epoch": 1.5313441400086072, "grad_norm": 54.0, "learning_rate": 1.54230377166157e-05, "loss": 0.5476, "step": 1335 }, { "epoch": 1.532491751542103, "grad_norm": 40.5, "learning_rate": 1.5417940876656474e-05, "loss": 0.4921, "step": 1336 }, { "epoch": 1.533639363075599, "grad_norm": 15.125, "learning_rate": 1.541284403669725e-05, "loss": 0.4748, "step": 1337 }, { "epoch": 1.5347869746090947, "grad_norm": 35.25, "learning_rate": 1.5407747196738022e-05, "loss": 0.5071, "step": 1338 }, { "epoch": 1.5359345861425906, "grad_norm": 26.5, "learning_rate": 1.54026503567788e-05, "loss": 0.2151, "step": 1339 }, { "epoch": 1.5370821976760867, "grad_norm": 32.5, "learning_rate": 1.5397553516819573e-05, "loss": 0.4312, "step": 1340 }, { "epoch": 1.5382298092095825, "grad_norm": 80.5, "learning_rate": 1.5392456676860347e-05, "loss": 0.6625, "step": 1341 }, { "epoch": 1.5393774207430786, "grad_norm": 46.25, "learning_rate": 1.5387359836901124e-05, "loss": 0.3488, "step": 1342 }, { "epoch": 1.5405250322765744, "grad_norm": 41.75, "learning_rate": 1.5382262996941898e-05, "loss": 0.5342, "step": 1343 }, { "epoch": 1.5416726438100703, "grad_norm": 44.0, "learning_rate": 1.5377166156982672e-05, "loss": 0.4736, "step": 1344 }, { "epoch": 1.5428202553435661, "grad_norm": 11.625, "learning_rate": 1.5372069317023446e-05, "loss": 0.3527, "step": 1345 }, { "epoch": 1.543967866877062, "grad_norm": 31.75, "learning_rate": 1.536697247706422e-05, "loss": 0.6221, "step": 1346 }, { "epoch": 1.545115478410558, "grad_norm": 47.0, "learning_rate": 1.5361875637104997e-05, "loss": 0.6081, "step": 1347 }, { "epoch": 1.546263089944054, "grad_norm": 22.5, "learning_rate": 1.535677879714577e-05, "loss": 0.4948, "step": 1348 }, { "epoch": 1.54741070147755, "grad_norm": 82.5, "learning_rate": 1.5351681957186545e-05, "loss": 0.7993, "step": 1349 }, { "epoch": 1.5485583130110459, "grad_norm": 45.5, "learning_rate": 1.534658511722732e-05, "loss": 0.6413, "step": 1350 }, { "epoch": 1.5497059245445417, "grad_norm": 14.875, "learning_rate": 1.5341488277268096e-05, "loss": 0.4485, "step": 1351 }, { "epoch": 1.5508535360780376, "grad_norm": 50.0, "learning_rate": 1.533639143730887e-05, "loss": 1.0687, "step": 1352 }, { "epoch": 1.5520011476115334, "grad_norm": 58.75, "learning_rate": 1.5331294597349644e-05, "loss": 1.0185, "step": 1353 }, { "epoch": 1.5531487591450293, "grad_norm": 43.0, "learning_rate": 1.532619775739042e-05, "loss": 0.5074, "step": 1354 }, { "epoch": 1.5542963706785253, "grad_norm": 47.25, "learning_rate": 1.5321100917431192e-05, "loss": 0.3446, "step": 1355 }, { "epoch": 1.5554439822120212, "grad_norm": 37.25, "learning_rate": 1.531600407747197e-05, "loss": 0.3883, "step": 1356 }, { "epoch": 1.5565915937455173, "grad_norm": 79.0, "learning_rate": 1.5310907237512743e-05, "loss": 0.8577, "step": 1357 }, { "epoch": 1.557739205279013, "grad_norm": 45.0, "learning_rate": 1.5305810397553517e-05, "loss": 0.5937, "step": 1358 }, { "epoch": 1.558886816812509, "grad_norm": 56.25, "learning_rate": 1.5300713557594294e-05, "loss": 0.8568, "step": 1359 }, { "epoch": 1.5600344283460048, "grad_norm": 33.25, "learning_rate": 1.5295616717635068e-05, "loss": 0.4064, "step": 1360 }, { "epoch": 1.5611820398795007, "grad_norm": 59.0, "learning_rate": 1.5290519877675842e-05, "loss": 0.6067, "step": 1361 }, { "epoch": 1.5623296514129967, "grad_norm": 11.75, "learning_rate": 1.528542303771662e-05, "loss": 0.3149, "step": 1362 }, { "epoch": 1.5634772629464926, "grad_norm": 38.0, "learning_rate": 1.528032619775739e-05, "loss": 0.5482, "step": 1363 }, { "epoch": 1.5646248744799887, "grad_norm": 43.0, "learning_rate": 1.5275229357798167e-05, "loss": 0.3758, "step": 1364 }, { "epoch": 1.5657724860134845, "grad_norm": 17.625, "learning_rate": 1.527013251783894e-05, "loss": 0.0865, "step": 1365 }, { "epoch": 1.5669200975469804, "grad_norm": 79.0, "learning_rate": 1.5265035677879715e-05, "loss": 1.07, "step": 1366 }, { "epoch": 1.5680677090804762, "grad_norm": 86.5, "learning_rate": 1.5259938837920492e-05, "loss": 1.2776, "step": 1367 }, { "epoch": 1.569215320613972, "grad_norm": 65.0, "learning_rate": 1.5254841997961264e-05, "loss": 1.0829, "step": 1368 }, { "epoch": 1.570362932147468, "grad_norm": 11.25, "learning_rate": 1.524974515800204e-05, "loss": 0.1616, "step": 1369 }, { "epoch": 1.571510543680964, "grad_norm": 175.0, "learning_rate": 1.5244648318042814e-05, "loss": 0.788, "step": 1370 }, { "epoch": 1.57265815521446, "grad_norm": 52.25, "learning_rate": 1.523955147808359e-05, "loss": 0.6801, "step": 1371 }, { "epoch": 1.573805766747956, "grad_norm": 90.0, "learning_rate": 1.5234454638124365e-05, "loss": 1.1125, "step": 1372 }, { "epoch": 1.5749533782814518, "grad_norm": 69.5, "learning_rate": 1.5229357798165139e-05, "loss": 0.7275, "step": 1373 }, { "epoch": 1.5761009898149476, "grad_norm": 21.625, "learning_rate": 1.5224260958205915e-05, "loss": 0.2809, "step": 1374 }, { "epoch": 1.5772486013484435, "grad_norm": 41.5, "learning_rate": 1.5219164118246687e-05, "loss": 1.0073, "step": 1375 }, { "epoch": 1.5783962128819393, "grad_norm": 65.5, "learning_rate": 1.5214067278287462e-05, "loss": 0.6342, "step": 1376 }, { "epoch": 1.5795438244154354, "grad_norm": 19.5, "learning_rate": 1.5208970438328238e-05, "loss": 0.2352, "step": 1377 }, { "epoch": 1.5806914359489312, "grad_norm": 17.125, "learning_rate": 1.5203873598369012e-05, "loss": 0.5829, "step": 1378 }, { "epoch": 1.5818390474824273, "grad_norm": 40.25, "learning_rate": 1.5198776758409788e-05, "loss": 0.5567, "step": 1379 }, { "epoch": 1.5829866590159232, "grad_norm": 20.0, "learning_rate": 1.5193679918450561e-05, "loss": 0.6663, "step": 1380 }, { "epoch": 1.584134270549419, "grad_norm": 84.0, "learning_rate": 1.5188583078491337e-05, "loss": 0.6399, "step": 1381 }, { "epoch": 1.5852818820829149, "grad_norm": 13.5, "learning_rate": 1.5183486238532111e-05, "loss": 0.3913, "step": 1382 }, { "epoch": 1.5864294936164107, "grad_norm": 39.75, "learning_rate": 1.5178389398572887e-05, "loss": 0.4537, "step": 1383 }, { "epoch": 1.5875771051499068, "grad_norm": 10.3125, "learning_rate": 1.5173292558613662e-05, "loss": 0.5198, "step": 1384 }, { "epoch": 1.5887247166834026, "grad_norm": 26.75, "learning_rate": 1.5168195718654434e-05, "loss": 0.4686, "step": 1385 }, { "epoch": 1.5898723282168987, "grad_norm": 57.25, "learning_rate": 1.516309887869521e-05, "loss": 0.5172, "step": 1386 }, { "epoch": 1.5910199397503946, "grad_norm": 82.0, "learning_rate": 1.5158002038735984e-05, "loss": 0.9411, "step": 1387 }, { "epoch": 1.5921675512838904, "grad_norm": 31.0, "learning_rate": 1.515290519877676e-05, "loss": 0.3182, "step": 1388 }, { "epoch": 1.5933151628173863, "grad_norm": 79.5, "learning_rate": 1.5147808358817535e-05, "loss": 0.7013, "step": 1389 }, { "epoch": 1.5944627743508821, "grad_norm": 17.875, "learning_rate": 1.5142711518858309e-05, "loss": 0.5569, "step": 1390 }, { "epoch": 1.595610385884378, "grad_norm": 23.375, "learning_rate": 1.5137614678899085e-05, "loss": 0.5306, "step": 1391 }, { "epoch": 1.596757997417874, "grad_norm": 11.5, "learning_rate": 1.5132517838939857e-05, "loss": 0.2887, "step": 1392 }, { "epoch": 1.5979056089513701, "grad_norm": 22.5, "learning_rate": 1.5127420998980632e-05, "loss": 0.5286, "step": 1393 }, { "epoch": 1.599053220484866, "grad_norm": 21.0, "learning_rate": 1.5122324159021408e-05, "loss": 0.3716, "step": 1394 }, { "epoch": 1.6002008320183618, "grad_norm": 63.25, "learning_rate": 1.5117227319062182e-05, "loss": 0.7257, "step": 1395 }, { "epoch": 1.6013484435518577, "grad_norm": 7.6875, "learning_rate": 1.5112130479102958e-05, "loss": 0.127, "step": 1396 }, { "epoch": 1.6024960550853535, "grad_norm": 17.0, "learning_rate": 1.5107033639143731e-05, "loss": 0.2272, "step": 1397 }, { "epoch": 1.6036436666188494, "grad_norm": 30.875, "learning_rate": 1.5101936799184507e-05, "loss": 0.4778, "step": 1398 }, { "epoch": 1.6047912781523455, "grad_norm": 19.5, "learning_rate": 1.5096839959225283e-05, "loss": 0.5537, "step": 1399 }, { "epoch": 1.6059388896858413, "grad_norm": 57.5, "learning_rate": 1.5091743119266057e-05, "loss": 0.6817, "step": 1400 }, { "epoch": 1.6059388896858413, "eval_accuracy": 0.63, "eval_loss": 0.49080872535705566, "eval_runtime": 49.7511, "eval_samples_per_second": 2.01, "eval_steps_per_second": 2.01, "step": 1400 }, { "epoch": 1.6070865012193374, "grad_norm": 14.125, "learning_rate": 1.5086646279306832e-05, "loss": 0.6062, "step": 1401 }, { "epoch": 1.6082341127528332, "grad_norm": 9.625, "learning_rate": 1.5081549439347604e-05, "loss": 0.2577, "step": 1402 }, { "epoch": 1.609381724286329, "grad_norm": 19.25, "learning_rate": 1.507645259938838e-05, "loss": 0.4531, "step": 1403 }, { "epoch": 1.610529335819825, "grad_norm": 41.0, "learning_rate": 1.5071355759429156e-05, "loss": 0.6092, "step": 1404 }, { "epoch": 1.6116769473533208, "grad_norm": 34.0, "learning_rate": 1.506625891946993e-05, "loss": 0.4515, "step": 1405 }, { "epoch": 1.6128245588868169, "grad_norm": 23.75, "learning_rate": 1.5061162079510705e-05, "loss": 0.5269, "step": 1406 }, { "epoch": 1.6139721704203127, "grad_norm": 31.75, "learning_rate": 1.5056065239551479e-05, "loss": 0.4641, "step": 1407 }, { "epoch": 1.6151197819538088, "grad_norm": 33.75, "learning_rate": 1.5050968399592255e-05, "loss": 0.3172, "step": 1408 }, { "epoch": 1.6162673934873046, "grad_norm": 9.9375, "learning_rate": 1.504587155963303e-05, "loss": 0.231, "step": 1409 }, { "epoch": 1.6174150050208005, "grad_norm": 10.5, "learning_rate": 1.5040774719673802e-05, "loss": 0.19, "step": 1410 }, { "epoch": 1.6185626165542963, "grad_norm": 26.375, "learning_rate": 1.5035677879714578e-05, "loss": 0.6969, "step": 1411 }, { "epoch": 1.6197102280877922, "grad_norm": 11.5625, "learning_rate": 1.5030581039755352e-05, "loss": 0.3084, "step": 1412 }, { "epoch": 1.620857839621288, "grad_norm": 33.5, "learning_rate": 1.5025484199796127e-05, "loss": 0.9029, "step": 1413 }, { "epoch": 1.6220054511547841, "grad_norm": 26.75, "learning_rate": 1.5020387359836903e-05, "loss": 0.7869, "step": 1414 }, { "epoch": 1.62315306268828, "grad_norm": 18.5, "learning_rate": 1.5015290519877677e-05, "loss": 0.555, "step": 1415 }, { "epoch": 1.624300674221776, "grad_norm": 26.25, "learning_rate": 1.5010193679918453e-05, "loss": 0.8343, "step": 1416 }, { "epoch": 1.625448285755272, "grad_norm": 18.625, "learning_rate": 1.5005096839959225e-05, "loss": 0.4117, "step": 1417 }, { "epoch": 1.6265958972887677, "grad_norm": 30.75, "learning_rate": 1.5000000000000002e-05, "loss": 0.3806, "step": 1418 }, { "epoch": 1.6277435088222636, "grad_norm": 24.375, "learning_rate": 1.4994903160040778e-05, "loss": 0.4463, "step": 1419 }, { "epoch": 1.6288911203557594, "grad_norm": 14.5, "learning_rate": 1.498980632008155e-05, "loss": 0.1973, "step": 1420 }, { "epoch": 1.6300387318892555, "grad_norm": 19.25, "learning_rate": 1.4984709480122325e-05, "loss": 0.689, "step": 1421 }, { "epoch": 1.6311863434227514, "grad_norm": 21.125, "learning_rate": 1.49796126401631e-05, "loss": 0.5135, "step": 1422 }, { "epoch": 1.6323339549562474, "grad_norm": 29.5, "learning_rate": 1.4974515800203875e-05, "loss": 0.3149, "step": 1423 }, { "epoch": 1.6334815664897433, "grad_norm": 13.1875, "learning_rate": 1.496941896024465e-05, "loss": 0.2246, "step": 1424 }, { "epoch": 1.6346291780232391, "grad_norm": 76.0, "learning_rate": 1.4964322120285424e-05, "loss": 0.7469, "step": 1425 }, { "epoch": 1.635776789556735, "grad_norm": 90.5, "learning_rate": 1.49592252803262e-05, "loss": 0.8995, "step": 1426 }, { "epoch": 1.6369244010902309, "grad_norm": 23.25, "learning_rate": 1.4954128440366972e-05, "loss": 0.7444, "step": 1427 }, { "epoch": 1.6380720126237267, "grad_norm": 18.0, "learning_rate": 1.4949031600407748e-05, "loss": 0.4238, "step": 1428 }, { "epoch": 1.6392196241572228, "grad_norm": 31.25, "learning_rate": 1.4943934760448523e-05, "loss": 0.4251, "step": 1429 }, { "epoch": 1.6403672356907189, "grad_norm": 20.0, "learning_rate": 1.4938837920489297e-05, "loss": 0.6363, "step": 1430 }, { "epoch": 1.6415148472242147, "grad_norm": 42.0, "learning_rate": 1.4933741080530073e-05, "loss": 0.6076, "step": 1431 }, { "epoch": 1.6426624587577106, "grad_norm": 62.0, "learning_rate": 1.4928644240570847e-05, "loss": 0.5137, "step": 1432 }, { "epoch": 1.6438100702912064, "grad_norm": 26.125, "learning_rate": 1.4923547400611623e-05, "loss": 0.4482, "step": 1433 }, { "epoch": 1.6449576818247023, "grad_norm": 66.0, "learning_rate": 1.4918450560652398e-05, "loss": 0.9944, "step": 1434 }, { "epoch": 1.646105293358198, "grad_norm": 21.25, "learning_rate": 1.491335372069317e-05, "loss": 0.2451, "step": 1435 }, { "epoch": 1.6472529048916942, "grad_norm": 18.0, "learning_rate": 1.4908256880733946e-05, "loss": 0.699, "step": 1436 }, { "epoch": 1.64840051642519, "grad_norm": 17.125, "learning_rate": 1.490316004077472e-05, "loss": 0.4074, "step": 1437 }, { "epoch": 1.649548127958686, "grad_norm": 59.25, "learning_rate": 1.4898063200815495e-05, "loss": 0.4132, "step": 1438 }, { "epoch": 1.650695739492182, "grad_norm": 20.5, "learning_rate": 1.4892966360856271e-05, "loss": 0.339, "step": 1439 }, { "epoch": 1.6518433510256778, "grad_norm": 27.0, "learning_rate": 1.4887869520897045e-05, "loss": 0.3662, "step": 1440 }, { "epoch": 1.6529909625591737, "grad_norm": 9.5625, "learning_rate": 1.488277268093782e-05, "loss": 0.3819, "step": 1441 }, { "epoch": 1.6541385740926695, "grad_norm": 130.0, "learning_rate": 1.4877675840978594e-05, "loss": 0.4787, "step": 1442 }, { "epoch": 1.6552861856261656, "grad_norm": 29.375, "learning_rate": 1.487257900101937e-05, "loss": 0.4502, "step": 1443 }, { "epoch": 1.6564337971596614, "grad_norm": 28.125, "learning_rate": 1.4867482161060146e-05, "loss": 0.4953, "step": 1444 }, { "epoch": 1.6575814086931575, "grad_norm": 36.0, "learning_rate": 1.4862385321100918e-05, "loss": 0.9421, "step": 1445 }, { "epoch": 1.6587290202266534, "grad_norm": 35.0, "learning_rate": 1.4857288481141693e-05, "loss": 0.3018, "step": 1446 }, { "epoch": 1.6598766317601492, "grad_norm": 18.625, "learning_rate": 1.4852191641182467e-05, "loss": 0.2527, "step": 1447 }, { "epoch": 1.661024243293645, "grad_norm": 12.5, "learning_rate": 1.4847094801223243e-05, "loss": 0.3482, "step": 1448 }, { "epoch": 1.662171854827141, "grad_norm": 18.75, "learning_rate": 1.4841997961264019e-05, "loss": 0.1798, "step": 1449 }, { "epoch": 1.6633194663606368, "grad_norm": 6.34375, "learning_rate": 1.4836901121304792e-05, "loss": 0.1118, "step": 1450 }, { "epoch": 1.6644670778941328, "grad_norm": 21.375, "learning_rate": 1.4831804281345568e-05, "loss": 0.5154, "step": 1451 }, { "epoch": 1.665614689427629, "grad_norm": 57.75, "learning_rate": 1.482670744138634e-05, "loss": 0.845, "step": 1452 }, { "epoch": 1.6667623009611248, "grad_norm": 31.875, "learning_rate": 1.4821610601427116e-05, "loss": 0.6743, "step": 1453 }, { "epoch": 1.6679099124946206, "grad_norm": 30.5, "learning_rate": 1.4816513761467891e-05, "loss": 0.6286, "step": 1454 }, { "epoch": 1.6690575240281165, "grad_norm": 26.25, "learning_rate": 1.4811416921508665e-05, "loss": 0.2807, "step": 1455 }, { "epoch": 1.6702051355616123, "grad_norm": 21.25, "learning_rate": 1.4806320081549441e-05, "loss": 0.5438, "step": 1456 }, { "epoch": 1.6713527470951082, "grad_norm": 15.875, "learning_rate": 1.4801223241590215e-05, "loss": 0.4873, "step": 1457 }, { "epoch": 1.6725003586286042, "grad_norm": 12.3125, "learning_rate": 1.479612640163099e-05, "loss": 0.2455, "step": 1458 }, { "epoch": 1.6736479701621, "grad_norm": 36.25, "learning_rate": 1.4791029561671764e-05, "loss": 0.6741, "step": 1459 }, { "epoch": 1.6747955816955962, "grad_norm": 36.25, "learning_rate": 1.478593272171254e-05, "loss": 0.2113, "step": 1460 }, { "epoch": 1.675943193229092, "grad_norm": 15.0625, "learning_rate": 1.4780835881753316e-05, "loss": 0.3223, "step": 1461 }, { "epoch": 1.6770908047625879, "grad_norm": 70.5, "learning_rate": 1.4775739041794088e-05, "loss": 0.7413, "step": 1462 }, { "epoch": 1.6782384162960837, "grad_norm": 50.25, "learning_rate": 1.4770642201834863e-05, "loss": 0.5802, "step": 1463 }, { "epoch": 1.6793860278295796, "grad_norm": 13.3125, "learning_rate": 1.4765545361875637e-05, "loss": 0.3988, "step": 1464 }, { "epoch": 1.6805336393630756, "grad_norm": 22.125, "learning_rate": 1.4760448521916413e-05, "loss": 0.2763, "step": 1465 }, { "epoch": 1.6816812508965715, "grad_norm": 63.5, "learning_rate": 1.4755351681957188e-05, "loss": 0.5855, "step": 1466 }, { "epoch": 1.6828288624300676, "grad_norm": 41.75, "learning_rate": 1.4750254841997962e-05, "loss": 0.413, "step": 1467 }, { "epoch": 1.6839764739635634, "grad_norm": 75.0, "learning_rate": 1.4745158002038738e-05, "loss": 1.2905, "step": 1468 }, { "epoch": 1.6851240854970593, "grad_norm": 35.0, "learning_rate": 1.474006116207951e-05, "loss": 0.5774, "step": 1469 }, { "epoch": 1.6862716970305551, "grad_norm": 28.25, "learning_rate": 1.4734964322120286e-05, "loss": 0.8901, "step": 1470 }, { "epoch": 1.687419308564051, "grad_norm": 35.0, "learning_rate": 1.4729867482161061e-05, "loss": 0.457, "step": 1471 }, { "epoch": 1.6885669200975468, "grad_norm": 34.0, "learning_rate": 1.4724770642201835e-05, "loss": 0.4638, "step": 1472 }, { "epoch": 1.689714531631043, "grad_norm": 37.5, "learning_rate": 1.4719673802242611e-05, "loss": 0.5084, "step": 1473 }, { "epoch": 1.6908621431645388, "grad_norm": 26.625, "learning_rate": 1.4714576962283385e-05, "loss": 0.2821, "step": 1474 }, { "epoch": 1.6920097546980348, "grad_norm": 34.25, "learning_rate": 1.470948012232416e-05, "loss": 0.3812, "step": 1475 }, { "epoch": 1.6931573662315307, "grad_norm": 59.0, "learning_rate": 1.4704383282364936e-05, "loss": 0.5477, "step": 1476 }, { "epoch": 1.6943049777650265, "grad_norm": 28.25, "learning_rate": 1.469928644240571e-05, "loss": 0.6984, "step": 1477 }, { "epoch": 1.6954525892985224, "grad_norm": 69.5, "learning_rate": 1.4694189602446486e-05, "loss": 0.7855, "step": 1478 }, { "epoch": 1.6966002008320182, "grad_norm": 49.0, "learning_rate": 1.4689092762487258e-05, "loss": 0.984, "step": 1479 }, { "epoch": 1.6977478123655143, "grad_norm": 22.875, "learning_rate": 1.4683995922528033e-05, "loss": 0.6088, "step": 1480 }, { "epoch": 1.6988954238990102, "grad_norm": 17.875, "learning_rate": 1.4678899082568809e-05, "loss": 0.1793, "step": 1481 }, { "epoch": 1.7000430354325062, "grad_norm": 22.375, "learning_rate": 1.4673802242609583e-05, "loss": 0.4399, "step": 1482 }, { "epoch": 1.701190646966002, "grad_norm": 44.75, "learning_rate": 1.4668705402650358e-05, "loss": 0.8196, "step": 1483 }, { "epoch": 1.702338258499498, "grad_norm": 17.75, "learning_rate": 1.4663608562691132e-05, "loss": 0.3481, "step": 1484 }, { "epoch": 1.7034858700329938, "grad_norm": 30.75, "learning_rate": 1.4658511722731908e-05, "loss": 0.5881, "step": 1485 }, { "epoch": 1.7046334815664896, "grad_norm": 54.5, "learning_rate": 1.4653414882772684e-05, "loss": 0.9103, "step": 1486 }, { "epoch": 1.7057810930999855, "grad_norm": 22.0, "learning_rate": 1.4648318042813456e-05, "loss": 0.9757, "step": 1487 }, { "epoch": 1.7069287046334816, "grad_norm": 41.25, "learning_rate": 1.4643221202854231e-05, "loss": 0.2791, "step": 1488 }, { "epoch": 1.7080763161669776, "grad_norm": 72.5, "learning_rate": 1.4638124362895005e-05, "loss": 0.6413, "step": 1489 }, { "epoch": 1.7092239277004735, "grad_norm": 31.25, "learning_rate": 1.463302752293578e-05, "loss": 0.6097, "step": 1490 }, { "epoch": 1.7103715392339693, "grad_norm": 31.625, "learning_rate": 1.4627930682976556e-05, "loss": 0.6532, "step": 1491 }, { "epoch": 1.7115191507674652, "grad_norm": 23.75, "learning_rate": 1.462283384301733e-05, "loss": 0.5511, "step": 1492 }, { "epoch": 1.712666762300961, "grad_norm": 44.25, "learning_rate": 1.4617737003058106e-05, "loss": 0.5933, "step": 1493 }, { "epoch": 1.713814373834457, "grad_norm": 175.0, "learning_rate": 1.461264016309888e-05, "loss": 0.8476, "step": 1494 }, { "epoch": 1.714961985367953, "grad_norm": 12.0625, "learning_rate": 1.4607543323139655e-05, "loss": 0.2916, "step": 1495 }, { "epoch": 1.7161095969014488, "grad_norm": 40.0, "learning_rate": 1.4602446483180431e-05, "loss": 0.4779, "step": 1496 }, { "epoch": 1.717257208434945, "grad_norm": 19.25, "learning_rate": 1.4597349643221203e-05, "loss": 0.3403, "step": 1497 }, { "epoch": 1.7184048199684407, "grad_norm": 20.125, "learning_rate": 1.4592252803261979e-05, "loss": 0.4278, "step": 1498 }, { "epoch": 1.7195524315019366, "grad_norm": 11.125, "learning_rate": 1.4587155963302753e-05, "loss": 0.4435, "step": 1499 }, { "epoch": 1.7207000430354324, "grad_norm": 47.75, "learning_rate": 1.4582059123343528e-05, "loss": 0.6405, "step": 1500 }, { "epoch": 1.7207000430354324, "eval_accuracy": 0.64, "eval_loss": 0.4719592034816742, "eval_runtime": 49.6324, "eval_samples_per_second": 2.015, "eval_steps_per_second": 2.015, "step": 1500 }, { "epoch": 1.7218476545689283, "grad_norm": 11.5, "learning_rate": 1.4576962283384304e-05, "loss": 0.3975, "step": 1501 }, { "epoch": 1.7229952661024244, "grad_norm": 20.875, "learning_rate": 1.4571865443425078e-05, "loss": 0.3939, "step": 1502 }, { "epoch": 1.7241428776359202, "grad_norm": 44.25, "learning_rate": 1.4566768603465853e-05, "loss": 0.7124, "step": 1503 }, { "epoch": 1.7252904891694163, "grad_norm": 33.0, "learning_rate": 1.4561671763506626e-05, "loss": 0.5179, "step": 1504 }, { "epoch": 1.7264381007029121, "grad_norm": 13.9375, "learning_rate": 1.4556574923547401e-05, "loss": 0.6342, "step": 1505 }, { "epoch": 1.727585712236408, "grad_norm": 20.75, "learning_rate": 1.4551478083588177e-05, "loss": 0.397, "step": 1506 }, { "epoch": 1.7287333237699039, "grad_norm": 12.375, "learning_rate": 1.454638124362895e-05, "loss": 0.3495, "step": 1507 }, { "epoch": 1.7298809353033997, "grad_norm": 53.75, "learning_rate": 1.4541284403669726e-05, "loss": 0.5092, "step": 1508 }, { "epoch": 1.7310285468368956, "grad_norm": 14.25, "learning_rate": 1.45361875637105e-05, "loss": 0.1927, "step": 1509 }, { "epoch": 1.7321761583703916, "grad_norm": 20.875, "learning_rate": 1.4531090723751276e-05, "loss": 0.7156, "step": 1510 }, { "epoch": 1.7333237699038875, "grad_norm": 8.6875, "learning_rate": 1.4525993883792051e-05, "loss": 0.3399, "step": 1511 }, { "epoch": 1.7344713814373836, "grad_norm": 16.125, "learning_rate": 1.4520897043832824e-05, "loss": 0.5978, "step": 1512 }, { "epoch": 1.7356189929708794, "grad_norm": 42.0, "learning_rate": 1.45158002038736e-05, "loss": 0.9311, "step": 1513 }, { "epoch": 1.7367666045043753, "grad_norm": 70.5, "learning_rate": 1.4510703363914373e-05, "loss": 0.7334, "step": 1514 }, { "epoch": 1.737914216037871, "grad_norm": 16.625, "learning_rate": 1.4505606523955149e-05, "loss": 0.4106, "step": 1515 }, { "epoch": 1.739061827571367, "grad_norm": 12.0, "learning_rate": 1.4500509683995924e-05, "loss": 0.2984, "step": 1516 }, { "epoch": 1.740209439104863, "grad_norm": 27.125, "learning_rate": 1.4495412844036698e-05, "loss": 0.3245, "step": 1517 }, { "epoch": 1.7413570506383589, "grad_norm": 40.25, "learning_rate": 1.4490316004077474e-05, "loss": 0.5248, "step": 1518 }, { "epoch": 1.742504662171855, "grad_norm": 15.5, "learning_rate": 1.4485219164118248e-05, "loss": 0.3244, "step": 1519 }, { "epoch": 1.7436522737053508, "grad_norm": 70.5, "learning_rate": 1.4480122324159023e-05, "loss": 0.9236, "step": 1520 }, { "epoch": 1.7447998852388467, "grad_norm": 30.625, "learning_rate": 1.4475025484199799e-05, "loss": 0.8874, "step": 1521 }, { "epoch": 1.7459474967723425, "grad_norm": 11.6875, "learning_rate": 1.4469928644240571e-05, "loss": 0.3286, "step": 1522 }, { "epoch": 1.7470951083058384, "grad_norm": 26.875, "learning_rate": 1.4464831804281347e-05, "loss": 0.3404, "step": 1523 }, { "epoch": 1.7482427198393344, "grad_norm": 15.375, "learning_rate": 1.445973496432212e-05, "loss": 0.4482, "step": 1524 }, { "epoch": 1.7493903313728303, "grad_norm": 27.0, "learning_rate": 1.4454638124362896e-05, "loss": 0.476, "step": 1525 }, { "epoch": 1.7505379429063264, "grad_norm": 20.5, "learning_rate": 1.4449541284403672e-05, "loss": 0.3796, "step": 1526 }, { "epoch": 1.7516855544398222, "grad_norm": 47.75, "learning_rate": 1.4444444444444446e-05, "loss": 0.5618, "step": 1527 }, { "epoch": 1.752833165973318, "grad_norm": 29.5, "learning_rate": 1.4439347604485221e-05, "loss": 0.4359, "step": 1528 }, { "epoch": 1.753980777506814, "grad_norm": 52.25, "learning_rate": 1.4434250764525994e-05, "loss": 0.6163, "step": 1529 }, { "epoch": 1.7551283890403098, "grad_norm": 19.125, "learning_rate": 1.442915392456677e-05, "loss": 0.5202, "step": 1530 }, { "epoch": 1.7562760005738056, "grad_norm": 14.0, "learning_rate": 1.4424057084607545e-05, "loss": 0.3921, "step": 1531 }, { "epoch": 1.7574236121073017, "grad_norm": 64.0, "learning_rate": 1.4418960244648319e-05, "loss": 0.7896, "step": 1532 }, { "epoch": 1.7585712236407975, "grad_norm": 23.5, "learning_rate": 1.4413863404689094e-05, "loss": 0.4141, "step": 1533 }, { "epoch": 1.7597188351742936, "grad_norm": 39.75, "learning_rate": 1.4408766564729868e-05, "loss": 0.8279, "step": 1534 }, { "epoch": 1.7608664467077895, "grad_norm": 60.5, "learning_rate": 1.4403669724770644e-05, "loss": 0.6541, "step": 1535 }, { "epoch": 1.7620140582412853, "grad_norm": 22.375, "learning_rate": 1.4398572884811418e-05, "loss": 0.4579, "step": 1536 }, { "epoch": 1.7631616697747812, "grad_norm": 34.5, "learning_rate": 1.4393476044852193e-05, "loss": 0.3177, "step": 1537 }, { "epoch": 1.764309281308277, "grad_norm": 13.9375, "learning_rate": 1.4388379204892969e-05, "loss": 0.405, "step": 1538 }, { "epoch": 1.765456892841773, "grad_norm": 45.25, "learning_rate": 1.4383282364933741e-05, "loss": 0.4536, "step": 1539 }, { "epoch": 1.766604504375269, "grad_norm": 15.0, "learning_rate": 1.4378185524974517e-05, "loss": 0.658, "step": 1540 }, { "epoch": 1.767752115908765, "grad_norm": 23.125, "learning_rate": 1.437308868501529e-05, "loss": 0.5647, "step": 1541 }, { "epoch": 1.7688997274422609, "grad_norm": 49.5, "learning_rate": 1.4367991845056066e-05, "loss": 0.6544, "step": 1542 }, { "epoch": 1.7700473389757567, "grad_norm": 14.625, "learning_rate": 1.4362895005096842e-05, "loss": 0.3288, "step": 1543 }, { "epoch": 1.7711949505092526, "grad_norm": 14.875, "learning_rate": 1.4357798165137616e-05, "loss": 0.5407, "step": 1544 }, { "epoch": 1.7723425620427484, "grad_norm": 69.0, "learning_rate": 1.4352701325178391e-05, "loss": 0.4395, "step": 1545 }, { "epoch": 1.7734901735762443, "grad_norm": 32.5, "learning_rate": 1.4347604485219164e-05, "loss": 0.4165, "step": 1546 }, { "epoch": 1.7746377851097404, "grad_norm": 52.25, "learning_rate": 1.434250764525994e-05, "loss": 0.455, "step": 1547 }, { "epoch": 1.7757853966432364, "grad_norm": 26.875, "learning_rate": 1.4337410805300715e-05, "loss": 0.5133, "step": 1548 }, { "epoch": 1.7769330081767323, "grad_norm": 63.75, "learning_rate": 1.4332313965341489e-05, "loss": 0.8173, "step": 1549 }, { "epoch": 1.7780806197102281, "grad_norm": 69.5, "learning_rate": 1.4327217125382264e-05, "loss": 0.7585, "step": 1550 }, { "epoch": 1.779228231243724, "grad_norm": 12.25, "learning_rate": 1.4322120285423038e-05, "loss": 0.4586, "step": 1551 }, { "epoch": 1.7803758427772198, "grad_norm": 76.0, "learning_rate": 1.4317023445463814e-05, "loss": 0.6924, "step": 1552 }, { "epoch": 1.7815234543107157, "grad_norm": 12.4375, "learning_rate": 1.431192660550459e-05, "loss": 0.3333, "step": 1553 }, { "epoch": 1.7826710658442118, "grad_norm": 23.5, "learning_rate": 1.4306829765545363e-05, "loss": 0.7329, "step": 1554 }, { "epoch": 1.7838186773777076, "grad_norm": 22.875, "learning_rate": 1.4301732925586139e-05, "loss": 0.2949, "step": 1555 }, { "epoch": 1.7849662889112037, "grad_norm": 52.0, "learning_rate": 1.4296636085626911e-05, "loss": 0.6708, "step": 1556 }, { "epoch": 1.7861139004446995, "grad_norm": 75.0, "learning_rate": 1.4291539245667687e-05, "loss": 0.6416, "step": 1557 }, { "epoch": 1.7872615119781954, "grad_norm": 16.0, "learning_rate": 1.4286442405708462e-05, "loss": 0.1615, "step": 1558 }, { "epoch": 1.7884091235116912, "grad_norm": 13.8125, "learning_rate": 1.4281345565749236e-05, "loss": 0.2567, "step": 1559 }, { "epoch": 1.789556735045187, "grad_norm": 27.125, "learning_rate": 1.4276248725790012e-05, "loss": 0.3011, "step": 1560 }, { "epoch": 1.7907043465786832, "grad_norm": 37.5, "learning_rate": 1.4271151885830786e-05, "loss": 0.4136, "step": 1561 }, { "epoch": 1.791851958112179, "grad_norm": 64.0, "learning_rate": 1.4266055045871561e-05, "loss": 0.5132, "step": 1562 }, { "epoch": 1.792999569645675, "grad_norm": 23.5, "learning_rate": 1.4260958205912337e-05, "loss": 0.8581, "step": 1563 }, { "epoch": 1.794147181179171, "grad_norm": 35.75, "learning_rate": 1.4255861365953109e-05, "loss": 0.4336, "step": 1564 }, { "epoch": 1.7952947927126668, "grad_norm": 34.5, "learning_rate": 1.4250764525993885e-05, "loss": 0.7922, "step": 1565 }, { "epoch": 1.7964424042461626, "grad_norm": 12.375, "learning_rate": 1.4245667686034659e-05, "loss": 0.385, "step": 1566 }, { "epoch": 1.7975900157796585, "grad_norm": 22.125, "learning_rate": 1.4240570846075434e-05, "loss": 0.1375, "step": 1567 }, { "epoch": 1.7987376273131543, "grad_norm": 49.25, "learning_rate": 1.423547400611621e-05, "loss": 0.2854, "step": 1568 }, { "epoch": 1.7998852388466504, "grad_norm": 74.5, "learning_rate": 1.4230377166156984e-05, "loss": 0.8727, "step": 1569 }, { "epoch": 1.8010328503801463, "grad_norm": 5.875, "learning_rate": 1.422528032619776e-05, "loss": 0.0896, "step": 1570 }, { "epoch": 1.8021804619136423, "grad_norm": 23.0, "learning_rate": 1.4220183486238533e-05, "loss": 0.709, "step": 1571 }, { "epoch": 1.8033280734471382, "grad_norm": 8.5, "learning_rate": 1.4215086646279309e-05, "loss": 0.1583, "step": 1572 }, { "epoch": 1.804475684980634, "grad_norm": 45.25, "learning_rate": 1.4209989806320084e-05, "loss": 0.3676, "step": 1573 }, { "epoch": 1.80562329651413, "grad_norm": 14.125, "learning_rate": 1.4204892966360857e-05, "loss": 0.2197, "step": 1574 }, { "epoch": 1.8067709080476257, "grad_norm": 8.6875, "learning_rate": 1.4199796126401632e-05, "loss": 0.2591, "step": 1575 }, { "epoch": 1.8079185195811218, "grad_norm": 43.25, "learning_rate": 1.4194699286442406e-05, "loss": 0.4846, "step": 1576 }, { "epoch": 1.8090661311146177, "grad_norm": 32.0, "learning_rate": 1.4189602446483182e-05, "loss": 0.2703, "step": 1577 }, { "epoch": 1.8102137426481137, "grad_norm": 46.25, "learning_rate": 1.4184505606523957e-05, "loss": 0.6256, "step": 1578 }, { "epoch": 1.8113613541816096, "grad_norm": 38.25, "learning_rate": 1.4179408766564731e-05, "loss": 1.0764, "step": 1579 }, { "epoch": 1.8125089657151054, "grad_norm": 21.625, "learning_rate": 1.4174311926605507e-05, "loss": 0.1879, "step": 1580 }, { "epoch": 1.8136565772486013, "grad_norm": 25.375, "learning_rate": 1.4169215086646279e-05, "loss": 0.8602, "step": 1581 }, { "epoch": 1.8148041887820971, "grad_norm": 73.0, "learning_rate": 1.4164118246687055e-05, "loss": 0.6298, "step": 1582 }, { "epoch": 1.8159518003155932, "grad_norm": 33.0, "learning_rate": 1.415902140672783e-05, "loss": 0.3714, "step": 1583 }, { "epoch": 1.817099411849089, "grad_norm": 13.9375, "learning_rate": 1.4153924566768604e-05, "loss": 0.2252, "step": 1584 }, { "epoch": 1.8182470233825851, "grad_norm": 42.5, "learning_rate": 1.414882772680938e-05, "loss": 0.577, "step": 1585 }, { "epoch": 1.819394634916081, "grad_norm": 28.375, "learning_rate": 1.4143730886850154e-05, "loss": 0.5294, "step": 1586 }, { "epoch": 1.8205422464495769, "grad_norm": 29.25, "learning_rate": 1.413863404689093e-05, "loss": 0.4661, "step": 1587 }, { "epoch": 1.8216898579830727, "grad_norm": 15.6875, "learning_rate": 1.4133537206931705e-05, "loss": 0.358, "step": 1588 }, { "epoch": 1.8228374695165686, "grad_norm": 42.0, "learning_rate": 1.4128440366972477e-05, "loss": 0.5276, "step": 1589 }, { "epoch": 1.8239850810500644, "grad_norm": 98.5, "learning_rate": 1.4123343527013254e-05, "loss": 0.6566, "step": 1590 }, { "epoch": 1.8251326925835605, "grad_norm": 37.0, "learning_rate": 1.4118246687054027e-05, "loss": 0.2234, "step": 1591 }, { "epoch": 1.8262803041170563, "grad_norm": 49.5, "learning_rate": 1.4113149847094802e-05, "loss": 0.5727, "step": 1592 }, { "epoch": 1.8274279156505524, "grad_norm": 31.75, "learning_rate": 1.4108053007135578e-05, "loss": 0.7391, "step": 1593 }, { "epoch": 1.8285755271840483, "grad_norm": 81.0, "learning_rate": 1.4102956167176352e-05, "loss": 0.762, "step": 1594 }, { "epoch": 1.829723138717544, "grad_norm": 56.0, "learning_rate": 1.4097859327217127e-05, "loss": 0.371, "step": 1595 }, { "epoch": 1.83087075025104, "grad_norm": 33.75, "learning_rate": 1.4092762487257901e-05, "loss": 0.5857, "step": 1596 }, { "epoch": 1.8320183617845358, "grad_norm": 15.0625, "learning_rate": 1.4087665647298677e-05, "loss": 0.2163, "step": 1597 }, { "epoch": 1.8331659733180319, "grad_norm": 21.25, "learning_rate": 1.4082568807339452e-05, "loss": 0.4766, "step": 1598 }, { "epoch": 1.8343135848515277, "grad_norm": 49.75, "learning_rate": 1.4077471967380225e-05, "loss": 0.3923, "step": 1599 }, { "epoch": 1.8354611963850238, "grad_norm": 38.25, "learning_rate": 1.4072375127421e-05, "loss": 0.445, "step": 1600 }, { "epoch": 1.8354611963850238, "eval_accuracy": 0.69, "eval_loss": 0.5018435120582581, "eval_runtime": 49.4827, "eval_samples_per_second": 2.021, "eval_steps_per_second": 2.021, "step": 1600 }, { "epoch": 1.8366088079185197, "grad_norm": 36.25, "learning_rate": 1.4067278287461774e-05, "loss": 0.7721, "step": 1601 }, { "epoch": 1.8377564194520155, "grad_norm": 26.875, "learning_rate": 1.406218144750255e-05, "loss": 0.9496, "step": 1602 }, { "epoch": 1.8389040309855114, "grad_norm": 47.5, "learning_rate": 1.4057084607543325e-05, "loss": 0.5079, "step": 1603 }, { "epoch": 1.8400516425190072, "grad_norm": 14.0625, "learning_rate": 1.40519877675841e-05, "loss": 0.2523, "step": 1604 }, { "epoch": 1.841199254052503, "grad_norm": 36.5, "learning_rate": 1.4046890927624875e-05, "loss": 0.6013, "step": 1605 }, { "epoch": 1.8423468655859991, "grad_norm": 33.25, "learning_rate": 1.4041794087665647e-05, "loss": 0.4822, "step": 1606 }, { "epoch": 1.8434944771194952, "grad_norm": 12.6875, "learning_rate": 1.4036697247706423e-05, "loss": 0.4923, "step": 1607 }, { "epoch": 1.844642088652991, "grad_norm": 15.0625, "learning_rate": 1.4031600407747196e-05, "loss": 0.2454, "step": 1608 }, { "epoch": 1.845789700186487, "grad_norm": 20.5, "learning_rate": 1.4026503567787972e-05, "loss": 0.2125, "step": 1609 }, { "epoch": 1.8469373117199828, "grad_norm": 18.875, "learning_rate": 1.4021406727828748e-05, "loss": 0.6114, "step": 1610 }, { "epoch": 1.8480849232534786, "grad_norm": 37.25, "learning_rate": 1.4016309887869522e-05, "loss": 0.4515, "step": 1611 }, { "epoch": 1.8492325347869745, "grad_norm": 10.5, "learning_rate": 1.4011213047910297e-05, "loss": 0.2101, "step": 1612 }, { "epoch": 1.8503801463204705, "grad_norm": 33.5, "learning_rate": 1.4006116207951071e-05, "loss": 0.5889, "step": 1613 }, { "epoch": 1.8515277578539664, "grad_norm": 19.875, "learning_rate": 1.4001019367991847e-05, "loss": 0.4676, "step": 1614 }, { "epoch": 1.8526753693874625, "grad_norm": 51.0, "learning_rate": 1.3995922528032622e-05, "loss": 0.5021, "step": 1615 }, { "epoch": 1.8538229809209583, "grad_norm": 38.0, "learning_rate": 1.3990825688073395e-05, "loss": 0.6099, "step": 1616 }, { "epoch": 1.8549705924544542, "grad_norm": 49.75, "learning_rate": 1.398572884811417e-05, "loss": 0.6493, "step": 1617 }, { "epoch": 1.85611820398795, "grad_norm": 14.5625, "learning_rate": 1.3980632008154944e-05, "loss": 0.1851, "step": 1618 }, { "epoch": 1.8572658155214459, "grad_norm": 74.5, "learning_rate": 1.397553516819572e-05, "loss": 0.625, "step": 1619 }, { "epoch": 1.858413427054942, "grad_norm": 49.25, "learning_rate": 1.3970438328236495e-05, "loss": 0.4501, "step": 1620 }, { "epoch": 1.8595610385884378, "grad_norm": 36.0, "learning_rate": 1.3965341488277269e-05, "loss": 0.4769, "step": 1621 }, { "epoch": 1.8607086501219339, "grad_norm": 72.5, "learning_rate": 1.3960244648318045e-05, "loss": 0.6018, "step": 1622 }, { "epoch": 1.8618562616554297, "grad_norm": 28.75, "learning_rate": 1.3955147808358817e-05, "loss": 0.4446, "step": 1623 }, { "epoch": 1.8630038731889256, "grad_norm": 58.5, "learning_rate": 1.3950050968399593e-05, "loss": 0.7133, "step": 1624 }, { "epoch": 1.8641514847224214, "grad_norm": 14.6875, "learning_rate": 1.3944954128440368e-05, "loss": 0.2074, "step": 1625 }, { "epoch": 1.8652990962559173, "grad_norm": 54.25, "learning_rate": 1.3939857288481142e-05, "loss": 0.3376, "step": 1626 }, { "epoch": 1.8664467077894131, "grad_norm": 23.0, "learning_rate": 1.3934760448521918e-05, "loss": 0.5169, "step": 1627 }, { "epoch": 1.8675943193229092, "grad_norm": 55.5, "learning_rate": 1.3929663608562692e-05, "loss": 0.3699, "step": 1628 }, { "epoch": 1.868741930856405, "grad_norm": 18.125, "learning_rate": 1.3924566768603467e-05, "loss": 0.207, "step": 1629 }, { "epoch": 1.8698895423899011, "grad_norm": 26.5, "learning_rate": 1.3919469928644243e-05, "loss": 0.4332, "step": 1630 }, { "epoch": 1.871037153923397, "grad_norm": 22.75, "learning_rate": 1.3914373088685017e-05, "loss": 0.4837, "step": 1631 }, { "epoch": 1.8721847654568928, "grad_norm": 69.5, "learning_rate": 1.3909276248725792e-05, "loss": 0.5754, "step": 1632 }, { "epoch": 1.8733323769903887, "grad_norm": 16.25, "learning_rate": 1.3904179408766564e-05, "loss": 0.2141, "step": 1633 }, { "epoch": 1.8744799885238845, "grad_norm": 35.75, "learning_rate": 1.389908256880734e-05, "loss": 0.3208, "step": 1634 }, { "epoch": 1.8756276000573806, "grad_norm": 29.75, "learning_rate": 1.3893985728848116e-05, "loss": 0.6767, "step": 1635 }, { "epoch": 1.8767752115908765, "grad_norm": 19.375, "learning_rate": 1.388888888888889e-05, "loss": 0.1118, "step": 1636 }, { "epoch": 1.8779228231243725, "grad_norm": 15.8125, "learning_rate": 1.3883792048929665e-05, "loss": 0.1238, "step": 1637 }, { "epoch": 1.8790704346578684, "grad_norm": 31.875, "learning_rate": 1.3878695208970439e-05, "loss": 0.5031, "step": 1638 }, { "epoch": 1.8802180461913642, "grad_norm": 40.25, "learning_rate": 1.3873598369011215e-05, "loss": 0.8107, "step": 1639 }, { "epoch": 1.88136565772486, "grad_norm": 25.0, "learning_rate": 1.386850152905199e-05, "loss": 0.3873, "step": 1640 }, { "epoch": 1.882513269258356, "grad_norm": 78.0, "learning_rate": 1.3863404689092762e-05, "loss": 1.1926, "step": 1641 }, { "epoch": 1.883660880791852, "grad_norm": 34.25, "learning_rate": 1.3858307849133538e-05, "loss": 0.5274, "step": 1642 }, { "epoch": 1.8848084923253479, "grad_norm": 15.125, "learning_rate": 1.3853211009174312e-05, "loss": 0.4215, "step": 1643 }, { "epoch": 1.885956103858844, "grad_norm": 28.0, "learning_rate": 1.3848114169215088e-05, "loss": 0.2697, "step": 1644 }, { "epoch": 1.8871037153923398, "grad_norm": 34.25, "learning_rate": 1.3843017329255863e-05, "loss": 0.2025, "step": 1645 }, { "epoch": 1.8882513269258356, "grad_norm": 91.5, "learning_rate": 1.3837920489296637e-05, "loss": 0.7438, "step": 1646 }, { "epoch": 1.8893989384593315, "grad_norm": 88.0, "learning_rate": 1.3832823649337413e-05, "loss": 0.9659, "step": 1647 }, { "epoch": 1.8905465499928273, "grad_norm": 26.875, "learning_rate": 1.3827726809378187e-05, "loss": 0.2307, "step": 1648 }, { "epoch": 1.8916941615263232, "grad_norm": 13.375, "learning_rate": 1.3822629969418962e-05, "loss": 0.3359, "step": 1649 }, { "epoch": 1.8928417730598193, "grad_norm": 72.0, "learning_rate": 1.3817533129459738e-05, "loss": 0.5043, "step": 1650 }, { "epoch": 1.8939893845933151, "grad_norm": 46.75, "learning_rate": 1.381243628950051e-05, "loss": 0.4365, "step": 1651 }, { "epoch": 1.8951369961268112, "grad_norm": 27.0, "learning_rate": 1.3807339449541286e-05, "loss": 0.4578, "step": 1652 }, { "epoch": 1.896284607660307, "grad_norm": 49.75, "learning_rate": 1.380224260958206e-05, "loss": 0.645, "step": 1653 }, { "epoch": 1.897432219193803, "grad_norm": 58.25, "learning_rate": 1.3797145769622835e-05, "loss": 0.7014, "step": 1654 }, { "epoch": 1.8985798307272987, "grad_norm": 41.75, "learning_rate": 1.379204892966361e-05, "loss": 0.6419, "step": 1655 }, { "epoch": 1.8997274422607946, "grad_norm": 49.5, "learning_rate": 1.3786952089704385e-05, "loss": 0.6695, "step": 1656 }, { "epoch": 1.9008750537942907, "grad_norm": 34.5, "learning_rate": 1.378185524974516e-05, "loss": 0.324, "step": 1657 }, { "epoch": 1.9020226653277865, "grad_norm": 14.25, "learning_rate": 1.3776758409785932e-05, "loss": 0.289, "step": 1658 }, { "epoch": 1.9031702768612826, "grad_norm": 20.875, "learning_rate": 1.3771661569826708e-05, "loss": 0.2563, "step": 1659 }, { "epoch": 1.9043178883947784, "grad_norm": 26.0, "learning_rate": 1.3766564729867484e-05, "loss": 0.7482, "step": 1660 }, { "epoch": 1.9054654999282743, "grad_norm": 14.4375, "learning_rate": 1.3761467889908258e-05, "loss": 0.4048, "step": 1661 }, { "epoch": 1.9066131114617701, "grad_norm": 51.25, "learning_rate": 1.3756371049949033e-05, "loss": 0.6209, "step": 1662 }, { "epoch": 1.907760722995266, "grad_norm": 30.875, "learning_rate": 1.3751274209989807e-05, "loss": 0.7158, "step": 1663 }, { "epoch": 1.9089083345287619, "grad_norm": 9.4375, "learning_rate": 1.3746177370030583e-05, "loss": 0.1511, "step": 1664 }, { "epoch": 1.910055946062258, "grad_norm": 19.125, "learning_rate": 1.3741080530071358e-05, "loss": 0.3676, "step": 1665 }, { "epoch": 1.911203557595754, "grad_norm": 19.875, "learning_rate": 1.3735983690112132e-05, "loss": 0.3149, "step": 1666 }, { "epoch": 1.9123511691292499, "grad_norm": 22.25, "learning_rate": 1.3730886850152908e-05, "loss": 0.2507, "step": 1667 }, { "epoch": 1.9134987806627457, "grad_norm": 13.0, "learning_rate": 1.372579001019368e-05, "loss": 0.5281, "step": 1668 }, { "epoch": 1.9146463921962416, "grad_norm": 22.625, "learning_rate": 1.3720693170234456e-05, "loss": 0.3352, "step": 1669 }, { "epoch": 1.9157940037297374, "grad_norm": 25.625, "learning_rate": 1.3715596330275231e-05, "loss": 0.3003, "step": 1670 }, { "epoch": 1.9169416152632333, "grad_norm": 37.5, "learning_rate": 1.3710499490316005e-05, "loss": 0.2462, "step": 1671 }, { "epoch": 1.9180892267967293, "grad_norm": 20.75, "learning_rate": 1.370540265035678e-05, "loss": 0.6685, "step": 1672 }, { "epoch": 1.9192368383302252, "grad_norm": 30.0, "learning_rate": 1.3700305810397555e-05, "loss": 0.5793, "step": 1673 }, { "epoch": 1.9203844498637213, "grad_norm": 67.5, "learning_rate": 1.369520897043833e-05, "loss": 0.5628, "step": 1674 }, { "epoch": 1.921532061397217, "grad_norm": 68.0, "learning_rate": 1.3690112130479106e-05, "loss": 0.3445, "step": 1675 }, { "epoch": 1.922679672930713, "grad_norm": 18.375, "learning_rate": 1.3685015290519878e-05, "loss": 0.3626, "step": 1676 }, { "epoch": 1.9238272844642088, "grad_norm": 26.875, "learning_rate": 1.3679918450560654e-05, "loss": 0.8984, "step": 1677 }, { "epoch": 1.9249748959977047, "grad_norm": 27.125, "learning_rate": 1.3674821610601427e-05, "loss": 0.4586, "step": 1678 }, { "epoch": 1.9261225075312007, "grad_norm": 62.0, "learning_rate": 1.3669724770642203e-05, "loss": 0.7513, "step": 1679 }, { "epoch": 1.9272701190646966, "grad_norm": 38.25, "learning_rate": 1.3664627930682979e-05, "loss": 0.4712, "step": 1680 }, { "epoch": 1.9284177305981927, "grad_norm": 19.125, "learning_rate": 1.3659531090723753e-05, "loss": 0.2701, "step": 1681 }, { "epoch": 1.9295653421316885, "grad_norm": 16.625, "learning_rate": 1.3654434250764528e-05, "loss": 0.3847, "step": 1682 }, { "epoch": 1.9307129536651844, "grad_norm": 52.0, "learning_rate": 1.36493374108053e-05, "loss": 0.4352, "step": 1683 }, { "epoch": 1.9318605651986802, "grad_norm": 100.0, "learning_rate": 1.3644240570846076e-05, "loss": 1.0839, "step": 1684 }, { "epoch": 1.933008176732176, "grad_norm": 53.25, "learning_rate": 1.363914373088685e-05, "loss": 0.5791, "step": 1685 }, { "epoch": 1.934155788265672, "grad_norm": 55.5, "learning_rate": 1.3634046890927625e-05, "loss": 0.7248, "step": 1686 }, { "epoch": 1.935303399799168, "grad_norm": 16.5, "learning_rate": 1.3628950050968401e-05, "loss": 0.3914, "step": 1687 }, { "epoch": 1.9364510113326638, "grad_norm": 43.5, "learning_rate": 1.3623853211009175e-05, "loss": 0.348, "step": 1688 }, { "epoch": 1.93759862286616, "grad_norm": 34.25, "learning_rate": 1.361875637104995e-05, "loss": 0.4504, "step": 1689 }, { "epoch": 1.9387462343996558, "grad_norm": 32.5, "learning_rate": 1.3613659531090724e-05, "loss": 0.4256, "step": 1690 }, { "epoch": 1.9398938459331516, "grad_norm": 17.125, "learning_rate": 1.36085626911315e-05, "loss": 0.2441, "step": 1691 }, { "epoch": 1.9410414574666475, "grad_norm": 31.625, "learning_rate": 1.3603465851172276e-05, "loss": 0.5579, "step": 1692 }, { "epoch": 1.9421890690001433, "grad_norm": 29.75, "learning_rate": 1.3598369011213048e-05, "loss": 0.8088, "step": 1693 }, { "epoch": 1.9433366805336394, "grad_norm": 51.5, "learning_rate": 1.3593272171253823e-05, "loss": 0.6118, "step": 1694 }, { "epoch": 1.9444842920671352, "grad_norm": 27.875, "learning_rate": 1.3588175331294597e-05, "loss": 0.2742, "step": 1695 }, { "epoch": 1.9456319036006313, "grad_norm": 11.8125, "learning_rate": 1.3583078491335373e-05, "loss": 0.2417, "step": 1696 }, { "epoch": 1.9467795151341272, "grad_norm": 30.75, "learning_rate": 1.3577981651376149e-05, "loss": 0.236, "step": 1697 }, { "epoch": 1.947927126667623, "grad_norm": 28.5, "learning_rate": 1.3572884811416922e-05, "loss": 0.299, "step": 1698 }, { "epoch": 1.9490747382011189, "grad_norm": 15.125, "learning_rate": 1.3567787971457698e-05, "loss": 0.1027, "step": 1699 }, { "epoch": 1.9502223497346147, "grad_norm": 35.25, "learning_rate": 1.356269113149847e-05, "loss": 0.2869, "step": 1700 }, { "epoch": 1.9502223497346147, "eval_accuracy": 0.72, "eval_loss": 0.4552258551120758, "eval_runtime": 49.3148, "eval_samples_per_second": 2.028, "eval_steps_per_second": 2.028, "step": 1700 }, { "epoch": 1.9513699612681108, "grad_norm": 27.875, "learning_rate": 1.3557594291539246e-05, "loss": 0.9343, "step": 1701 }, { "epoch": 1.9525175728016066, "grad_norm": 31.25, "learning_rate": 1.3552497451580021e-05, "loss": 0.5365, "step": 1702 }, { "epoch": 1.9536651843351027, "grad_norm": 56.25, "learning_rate": 1.3547400611620795e-05, "loss": 0.6064, "step": 1703 }, { "epoch": 1.9548127958685986, "grad_norm": 8.875, "learning_rate": 1.3542303771661571e-05, "loss": 0.2503, "step": 1704 }, { "epoch": 1.9559604074020944, "grad_norm": 23.25, "learning_rate": 1.3537206931702345e-05, "loss": 0.6551, "step": 1705 }, { "epoch": 1.9571080189355903, "grad_norm": 26.75, "learning_rate": 1.353211009174312e-05, "loss": 0.4402, "step": 1706 }, { "epoch": 1.9582556304690861, "grad_norm": 19.75, "learning_rate": 1.3527013251783896e-05, "loss": 0.5219, "step": 1707 }, { "epoch": 1.959403242002582, "grad_norm": 109.0, "learning_rate": 1.352191641182467e-05, "loss": 0.698, "step": 1708 }, { "epoch": 1.960550853536078, "grad_norm": 81.5, "learning_rate": 1.3516819571865446e-05, "loss": 0.5249, "step": 1709 }, { "epoch": 1.961698465069574, "grad_norm": 29.125, "learning_rate": 1.3511722731906218e-05, "loss": 0.5226, "step": 1710 }, { "epoch": 1.96284607660307, "grad_norm": 59.75, "learning_rate": 1.3506625891946993e-05, "loss": 0.6405, "step": 1711 }, { "epoch": 1.9639936881365658, "grad_norm": 21.5, "learning_rate": 1.3501529051987769e-05, "loss": 0.4448, "step": 1712 }, { "epoch": 1.9651412996700617, "grad_norm": 34.75, "learning_rate": 1.3496432212028543e-05, "loss": 0.667, "step": 1713 }, { "epoch": 1.9662889112035575, "grad_norm": 15.125, "learning_rate": 1.3491335372069319e-05, "loss": 0.4765, "step": 1714 }, { "epoch": 1.9674365227370534, "grad_norm": 31.0, "learning_rate": 1.3486238532110092e-05, "loss": 0.2273, "step": 1715 }, { "epoch": 1.9685841342705495, "grad_norm": 20.75, "learning_rate": 1.3481141692150868e-05, "loss": 0.3604, "step": 1716 }, { "epoch": 1.9697317458040453, "grad_norm": 39.5, "learning_rate": 1.3476044852191644e-05, "loss": 0.4167, "step": 1717 }, { "epoch": 1.9708793573375414, "grad_norm": 28.5, "learning_rate": 1.3470948012232416e-05, "loss": 0.4476, "step": 1718 }, { "epoch": 1.9720269688710372, "grad_norm": 19.25, "learning_rate": 1.3465851172273191e-05, "loss": 0.5297, "step": 1719 }, { "epoch": 1.973174580404533, "grad_norm": 65.0, "learning_rate": 1.3460754332313965e-05, "loss": 0.8327, "step": 1720 }, { "epoch": 1.974322191938029, "grad_norm": 23.875, "learning_rate": 1.3455657492354741e-05, "loss": 0.1996, "step": 1721 }, { "epoch": 1.9754698034715248, "grad_norm": 23.0, "learning_rate": 1.3450560652395517e-05, "loss": 0.4416, "step": 1722 }, { "epoch": 1.9766174150050206, "grad_norm": 11.4375, "learning_rate": 1.344546381243629e-05, "loss": 0.2721, "step": 1723 }, { "epoch": 1.9777650265385167, "grad_norm": 35.0, "learning_rate": 1.3440366972477066e-05, "loss": 0.5629, "step": 1724 }, { "epoch": 1.9789126380720128, "grad_norm": 67.5, "learning_rate": 1.343527013251784e-05, "loss": 0.6305, "step": 1725 }, { "epoch": 1.9800602496055086, "grad_norm": 32.75, "learning_rate": 1.3430173292558616e-05, "loss": 0.2927, "step": 1726 }, { "epoch": 1.9812078611390045, "grad_norm": 35.25, "learning_rate": 1.3425076452599391e-05, "loss": 0.238, "step": 1727 }, { "epoch": 1.9823554726725003, "grad_norm": 20.875, "learning_rate": 1.3419979612640163e-05, "loss": 0.4392, "step": 1728 }, { "epoch": 1.9835030842059962, "grad_norm": 44.0, "learning_rate": 1.3414882772680939e-05, "loss": 0.398, "step": 1729 }, { "epoch": 1.984650695739492, "grad_norm": 26.0, "learning_rate": 1.3409785932721713e-05, "loss": 0.7501, "step": 1730 }, { "epoch": 1.9857983072729881, "grad_norm": 20.75, "learning_rate": 1.3404689092762488e-05, "loss": 0.3494, "step": 1731 }, { "epoch": 1.986945918806484, "grad_norm": 30.875, "learning_rate": 1.3399592252803264e-05, "loss": 1.1064, "step": 1732 }, { "epoch": 1.98809353033998, "grad_norm": 30.0, "learning_rate": 1.3394495412844038e-05, "loss": 0.6117, "step": 1733 }, { "epoch": 1.989241141873476, "grad_norm": 16.875, "learning_rate": 1.3389398572884814e-05, "loss": 0.3173, "step": 1734 }, { "epoch": 1.9903887534069717, "grad_norm": 15.75, "learning_rate": 1.3384301732925586e-05, "loss": 0.4467, "step": 1735 }, { "epoch": 1.9915363649404676, "grad_norm": 49.0, "learning_rate": 1.3379204892966361e-05, "loss": 0.7462, "step": 1736 }, { "epoch": 1.9926839764739634, "grad_norm": 22.25, "learning_rate": 1.3374108053007137e-05, "loss": 0.4648, "step": 1737 }, { "epoch": 1.9938315880074595, "grad_norm": 41.25, "learning_rate": 1.3369011213047911e-05, "loss": 0.2781, "step": 1738 }, { "epoch": 1.9949791995409554, "grad_norm": 22.625, "learning_rate": 1.3363914373088686e-05, "loss": 0.6798, "step": 1739 }, { "epoch": 1.9961268110744514, "grad_norm": 61.5, "learning_rate": 1.335881753312946e-05, "loss": 0.4519, "step": 1740 }, { "epoch": 1.9972744226079473, "grad_norm": 22.375, "learning_rate": 1.3353720693170236e-05, "loss": 0.7196, "step": 1741 }, { "epoch": 1.9984220341414431, "grad_norm": 9.9375, "learning_rate": 1.3348623853211012e-05, "loss": 0.2518, "step": 1742 }, { "epoch": 1.999569645674939, "grad_norm": 24.0, "learning_rate": 1.3343527013251785e-05, "loss": 0.5652, "step": 1743 }, { "epoch": 2.0, "grad_norm": 43.0, "learning_rate": 1.3338430173292561e-05, "loss": 0.2228, "step": 1744 }, { "epoch": 2.001147611533496, "grad_norm": 12.75, "learning_rate": 1.3333333333333333e-05, "loss": 0.2022, "step": 1745 }, { "epoch": 2.0022952230669917, "grad_norm": 38.5, "learning_rate": 1.3328236493374109e-05, "loss": 0.5685, "step": 1746 }, { "epoch": 2.0034428346004876, "grad_norm": 17.875, "learning_rate": 1.3323139653414884e-05, "loss": 0.4231, "step": 1747 }, { "epoch": 2.004590446133984, "grad_norm": 30.125, "learning_rate": 1.3318042813455658e-05, "loss": 0.4095, "step": 1748 }, { "epoch": 2.0057380576674797, "grad_norm": 9.125, "learning_rate": 1.3312945973496434e-05, "loss": 0.2246, "step": 1749 }, { "epoch": 2.0068856692009756, "grad_norm": 13.3125, "learning_rate": 1.3307849133537208e-05, "loss": 0.3579, "step": 1750 }, { "epoch": 2.0080332807344714, "grad_norm": 43.25, "learning_rate": 1.3302752293577984e-05, "loss": 0.2611, "step": 1751 }, { "epoch": 2.0091808922679673, "grad_norm": 29.125, "learning_rate": 1.3297655453618759e-05, "loss": 0.1953, "step": 1752 }, { "epoch": 2.010328503801463, "grad_norm": 25.625, "learning_rate": 1.3292558613659531e-05, "loss": 0.3513, "step": 1753 }, { "epoch": 2.011476115334959, "grad_norm": 16.5, "learning_rate": 1.3287461773700307e-05, "loss": 0.2294, "step": 1754 }, { "epoch": 2.012623726868455, "grad_norm": 7.34375, "learning_rate": 1.328236493374108e-05, "loss": 0.1434, "step": 1755 }, { "epoch": 2.013771338401951, "grad_norm": 18.625, "learning_rate": 1.3277268093781856e-05, "loss": 0.5704, "step": 1756 }, { "epoch": 2.014918949935447, "grad_norm": 47.25, "learning_rate": 1.3272171253822632e-05, "loss": 0.4283, "step": 1757 }, { "epoch": 2.016066561468943, "grad_norm": 9.6875, "learning_rate": 1.3267074413863406e-05, "loss": 0.2391, "step": 1758 }, { "epoch": 2.0172141730024387, "grad_norm": 34.5, "learning_rate": 1.3261977573904182e-05, "loss": 0.248, "step": 1759 }, { "epoch": 2.0183617845359345, "grad_norm": 11.9375, "learning_rate": 1.3256880733944954e-05, "loss": 0.2023, "step": 1760 }, { "epoch": 2.0195093960694304, "grad_norm": 27.375, "learning_rate": 1.325178389398573e-05, "loss": 0.261, "step": 1761 }, { "epoch": 2.020657007602926, "grad_norm": 10.875, "learning_rate": 1.3246687054026503e-05, "loss": 0.224, "step": 1762 }, { "epoch": 2.0218046191364225, "grad_norm": 24.375, "learning_rate": 1.3241590214067279e-05, "loss": 0.2463, "step": 1763 }, { "epoch": 2.0229522306699184, "grad_norm": 18.5, "learning_rate": 1.3236493374108054e-05, "loss": 0.2944, "step": 1764 }, { "epoch": 2.024099842203414, "grad_norm": 24.0, "learning_rate": 1.3231396534148828e-05, "loss": 0.315, "step": 1765 }, { "epoch": 2.02524745373691, "grad_norm": 54.0, "learning_rate": 1.3226299694189604e-05, "loss": 0.3818, "step": 1766 }, { "epoch": 2.026395065270406, "grad_norm": 18.5, "learning_rate": 1.3221202854230378e-05, "loss": 0.3524, "step": 1767 }, { "epoch": 2.0275426768039018, "grad_norm": 19.75, "learning_rate": 1.3216106014271153e-05, "loss": 0.3522, "step": 1768 }, { "epoch": 2.0286902883373976, "grad_norm": 35.0, "learning_rate": 1.3211009174311929e-05, "loss": 0.3279, "step": 1769 }, { "epoch": 2.029837899870894, "grad_norm": 56.75, "learning_rate": 1.3205912334352701e-05, "loss": 1.3613, "step": 1770 }, { "epoch": 2.0309855114043898, "grad_norm": 28.0, "learning_rate": 1.3200815494393477e-05, "loss": 0.2122, "step": 1771 }, { "epoch": 2.0321331229378856, "grad_norm": 37.0, "learning_rate": 1.319571865443425e-05, "loss": 0.1997, "step": 1772 }, { "epoch": 2.0332807344713815, "grad_norm": 6.0, "learning_rate": 1.3190621814475026e-05, "loss": 0.0679, "step": 1773 }, { "epoch": 2.0344283460048773, "grad_norm": 16.75, "learning_rate": 1.3185524974515802e-05, "loss": 0.3065, "step": 1774 }, { "epoch": 2.035575957538373, "grad_norm": 33.5, "learning_rate": 1.3180428134556576e-05, "loss": 0.2069, "step": 1775 }, { "epoch": 2.036723569071869, "grad_norm": 27.25, "learning_rate": 1.3175331294597351e-05, "loss": 0.2496, "step": 1776 }, { "epoch": 2.037871180605365, "grad_norm": 29.25, "learning_rate": 1.3170234454638124e-05, "loss": 0.3496, "step": 1777 }, { "epoch": 2.039018792138861, "grad_norm": 22.375, "learning_rate": 1.31651376146789e-05, "loss": 0.302, "step": 1778 }, { "epoch": 2.040166403672357, "grad_norm": 30.375, "learning_rate": 1.3160040774719675e-05, "loss": 0.5632, "step": 1779 }, { "epoch": 2.041314015205853, "grad_norm": 61.0, "learning_rate": 1.3154943934760449e-05, "loss": 0.3705, "step": 1780 }, { "epoch": 2.0424616267393487, "grad_norm": 9.1875, "learning_rate": 1.3149847094801224e-05, "loss": 0.1027, "step": 1781 }, { "epoch": 2.0436092382728446, "grad_norm": 13.75, "learning_rate": 1.3144750254841998e-05, "loss": 0.0892, "step": 1782 }, { "epoch": 2.0447568498063404, "grad_norm": 25.0, "learning_rate": 1.3139653414882774e-05, "loss": 0.6006, "step": 1783 }, { "epoch": 2.0459044613398363, "grad_norm": 11.0625, "learning_rate": 1.313455657492355e-05, "loss": 0.1804, "step": 1784 }, { "epoch": 2.0470520728733326, "grad_norm": 32.75, "learning_rate": 1.3129459734964323e-05, "loss": 0.3527, "step": 1785 }, { "epoch": 2.0481996844068284, "grad_norm": 98.5, "learning_rate": 1.3124362895005099e-05, "loss": 0.8699, "step": 1786 }, { "epoch": 2.0493472959403243, "grad_norm": 40.5, "learning_rate": 1.3119266055045871e-05, "loss": 0.193, "step": 1787 }, { "epoch": 2.05049490747382, "grad_norm": 27.75, "learning_rate": 1.3114169215086647e-05, "loss": 0.3553, "step": 1788 }, { "epoch": 2.051642519007316, "grad_norm": 14.4375, "learning_rate": 1.3109072375127422e-05, "loss": 0.1813, "step": 1789 }, { "epoch": 2.052790130540812, "grad_norm": 17.75, "learning_rate": 1.3103975535168196e-05, "loss": 0.0898, "step": 1790 }, { "epoch": 2.0539377420743077, "grad_norm": 9.25, "learning_rate": 1.3098878695208972e-05, "loss": 0.1149, "step": 1791 }, { "epoch": 2.0550853536078035, "grad_norm": 46.0, "learning_rate": 1.3093781855249746e-05, "loss": 0.2937, "step": 1792 }, { "epoch": 2.0562329651413, "grad_norm": 17.125, "learning_rate": 1.3088685015290521e-05, "loss": 0.3886, "step": 1793 }, { "epoch": 2.0573805766747957, "grad_norm": 25.875, "learning_rate": 1.3083588175331297e-05, "loss": 0.2858, "step": 1794 }, { "epoch": 2.0585281882082915, "grad_norm": 42.5, "learning_rate": 1.307849133537207e-05, "loss": 0.6463, "step": 1795 }, { "epoch": 2.0596757997417874, "grad_norm": 97.5, "learning_rate": 1.3073394495412845e-05, "loss": 0.9309, "step": 1796 }, { "epoch": 2.0608234112752832, "grad_norm": 27.25, "learning_rate": 1.3068297655453619e-05, "loss": 0.3763, "step": 1797 }, { "epoch": 2.061971022808779, "grad_norm": 137.0, "learning_rate": 1.3063200815494394e-05, "loss": 1.1044, "step": 1798 }, { "epoch": 2.063118634342275, "grad_norm": 12.1875, "learning_rate": 1.305810397553517e-05, "loss": 0.1574, "step": 1799 }, { "epoch": 2.0642662458757712, "grad_norm": 22.875, "learning_rate": 1.3053007135575944e-05, "loss": 0.1174, "step": 1800 }, { "epoch": 2.0642662458757712, "eval_accuracy": 0.74, "eval_loss": 0.4835154712200165, "eval_runtime": 49.2987, "eval_samples_per_second": 2.028, "eval_steps_per_second": 2.028, "step": 1800 } ], "logging_steps": 1, "max_steps": 4360, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.358825065150048e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }