{ "best_metric": 0.07552551478147507, "best_model_checkpoint": "ctc-modernbert-base-autotrain-filtered/checkpoint-9016", "epoch": 1.0, "eval_steps": 500, "global_step": 9016, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00033274179236912155, "grad_norm": 116.45602416992188, "learning_rate": 3.325942350332594e-08, "loss": 2.7382, "step": 3 }, { "epoch": 0.0006654835847382431, "grad_norm": 121.71471405029297, "learning_rate": 6.651884700665188e-08, "loss": 2.7504, "step": 6 }, { "epoch": 0.0009982253771073646, "grad_norm": 113.23915100097656, "learning_rate": 9.977827050997783e-08, "loss": 2.6337, "step": 9 }, { "epoch": 0.0013309671694764862, "grad_norm": 117.65518951416016, "learning_rate": 1.3303769401330377e-07, "loss": 2.6018, "step": 12 }, { "epoch": 0.0016637089618456078, "grad_norm": 115.69046783447266, "learning_rate": 1.6629711751662972e-07, "loss": 2.5365, "step": 15 }, { "epoch": 0.001996450754214729, "grad_norm": 106.99871826171875, "learning_rate": 1.9955654101995567e-07, "loss": 2.4244, "step": 18 }, { "epoch": 0.002329192546583851, "grad_norm": 103.35103607177734, "learning_rate": 2.3281596452328162e-07, "loss": 2.2755, "step": 21 }, { "epoch": 0.0026619343389529724, "grad_norm": 95.66140747070312, "learning_rate": 2.6607538802660754e-07, "loss": 2.1386, "step": 24 }, { "epoch": 0.002994676131322094, "grad_norm": 86.7462158203125, "learning_rate": 2.993348115299335e-07, "loss": 1.9544, "step": 27 }, { "epoch": 0.0033274179236912156, "grad_norm": 82.22989654541016, "learning_rate": 3.3259423503325944e-07, "loss": 1.7853, "step": 30 }, { "epoch": 0.003660159716060337, "grad_norm": 77.33919525146484, "learning_rate": 3.6585365853658536e-07, "loss": 1.5973, "step": 33 }, { "epoch": 0.003992901508429458, "grad_norm": 66.63919067382812, "learning_rate": 3.9911308203991133e-07, "loss": 1.4044, "step": 36 }, { "epoch": 0.00432564330079858, "grad_norm": 61.586795806884766, "learning_rate": 4.3237250554323726e-07, "loss": 1.2546, "step": 39 }, { "epoch": 0.004658385093167702, "grad_norm": 53.07887268066406, "learning_rate": 4.6563192904656323e-07, "loss": 1.1361, "step": 42 }, { "epoch": 0.004991126885536824, "grad_norm": 46.03369140625, "learning_rate": 4.988913525498892e-07, "loss": 0.9384, "step": 45 }, { "epoch": 0.005323868677905945, "grad_norm": 35.72394561767578, "learning_rate": 5.321507760532151e-07, "loss": 0.7908, "step": 48 }, { "epoch": 0.005656610470275067, "grad_norm": 31.20680046081543, "learning_rate": 5.65410199556541e-07, "loss": 0.647, "step": 51 }, { "epoch": 0.005989352262644188, "grad_norm": 23.265235900878906, "learning_rate": 5.98669623059867e-07, "loss": 0.6121, "step": 54 }, { "epoch": 0.006322094055013309, "grad_norm": 18.328041076660156, "learning_rate": 6.319290465631929e-07, "loss": 0.488, "step": 57 }, { "epoch": 0.006654835847382431, "grad_norm": 11.46716594696045, "learning_rate": 6.651884700665189e-07, "loss": 0.4149, "step": 60 }, { "epoch": 0.006987577639751553, "grad_norm": 6.271081924438477, "learning_rate": 6.984478935698448e-07, "loss": 0.3356, "step": 63 }, { "epoch": 0.007320319432120674, "grad_norm": 5.837164878845215, "learning_rate": 7.317073170731707e-07, "loss": 0.3602, "step": 66 }, { "epoch": 0.007653061224489796, "grad_norm": 5.139904975891113, "learning_rate": 7.649667405764967e-07, "loss": 0.3182, "step": 69 }, { "epoch": 0.007985803016858917, "grad_norm": 1.933678150177002, "learning_rate": 7.982261640798227e-07, "loss": 0.2973, "step": 72 }, { "epoch": 0.00831854480922804, "grad_norm": 1.275964379310608, "learning_rate": 8.314855875831486e-07, "loss": 0.2432, "step": 75 }, { "epoch": 0.00865128660159716, "grad_norm": 3.207477331161499, "learning_rate": 8.647450110864745e-07, "loss": 0.3515, "step": 78 }, { "epoch": 0.008984028393966283, "grad_norm": 2.2875447273254395, "learning_rate": 8.980044345898005e-07, "loss": 0.2443, "step": 81 }, { "epoch": 0.009316770186335404, "grad_norm": 4.695093154907227, "learning_rate": 9.312638580931265e-07, "loss": 0.252, "step": 84 }, { "epoch": 0.009649511978704525, "grad_norm": 1.3692725896835327, "learning_rate": 9.645232815964523e-07, "loss": 0.1746, "step": 87 }, { "epoch": 0.009982253771073648, "grad_norm": 1.3868299722671509, "learning_rate": 9.977827050997784e-07, "loss": 0.254, "step": 90 }, { "epoch": 0.010314995563442769, "grad_norm": 1.7074756622314453, "learning_rate": 1.0310421286031043e-06, "loss": 0.3812, "step": 93 }, { "epoch": 0.01064773735581189, "grad_norm": 1.0337737798690796, "learning_rate": 1.0643015521064302e-06, "loss": 0.2123, "step": 96 }, { "epoch": 0.010980479148181012, "grad_norm": 0.9756077527999878, "learning_rate": 1.0975609756097562e-06, "loss": 0.2246, "step": 99 }, { "epoch": 0.011313220940550133, "grad_norm": 3.3130385875701904, "learning_rate": 1.130820399113082e-06, "loss": 0.3149, "step": 102 }, { "epoch": 0.011645962732919254, "grad_norm": 2.909376382827759, "learning_rate": 1.1640798226164082e-06, "loss": 0.2407, "step": 105 }, { "epoch": 0.011978704525288377, "grad_norm": 0.8024623990058899, "learning_rate": 1.197339246119734e-06, "loss": 0.1555, "step": 108 }, { "epoch": 0.012311446317657498, "grad_norm": 1.051998257637024, "learning_rate": 1.23059866962306e-06, "loss": 0.2871, "step": 111 }, { "epoch": 0.012644188110026619, "grad_norm": 1.4658302068710327, "learning_rate": 1.2638580931263858e-06, "loss": 0.2305, "step": 114 }, { "epoch": 0.012976929902395741, "grad_norm": 1.2319782972335815, "learning_rate": 1.2971175166297117e-06, "loss": 0.221, "step": 117 }, { "epoch": 0.013309671694764862, "grad_norm": 1.6682897806167603, "learning_rate": 1.3303769401330377e-06, "loss": 0.2429, "step": 120 }, { "epoch": 0.013642413487133983, "grad_norm": 1.4028093814849854, "learning_rate": 1.3636363636363636e-06, "loss": 0.2215, "step": 123 }, { "epoch": 0.013975155279503106, "grad_norm": 4.261254787445068, "learning_rate": 1.3968957871396897e-06, "loss": 0.3644, "step": 126 }, { "epoch": 0.014307897071872227, "grad_norm": 1.8142163753509521, "learning_rate": 1.4301552106430156e-06, "loss": 0.2207, "step": 129 }, { "epoch": 0.014640638864241348, "grad_norm": 1.3985562324523926, "learning_rate": 1.4634146341463414e-06, "loss": 0.2066, "step": 132 }, { "epoch": 0.01497338065661047, "grad_norm": 1.300295352935791, "learning_rate": 1.4966740576496675e-06, "loss": 0.2565, "step": 135 }, { "epoch": 0.015306122448979591, "grad_norm": 2.3093273639678955, "learning_rate": 1.5299334811529934e-06, "loss": 0.251, "step": 138 }, { "epoch": 0.015638864241348714, "grad_norm": 1.0073604583740234, "learning_rate": 1.5631929046563193e-06, "loss": 0.2618, "step": 141 }, { "epoch": 0.015971606033717833, "grad_norm": 4.181806564331055, "learning_rate": 1.5964523281596453e-06, "loss": 0.2447, "step": 144 }, { "epoch": 0.016304347826086956, "grad_norm": 1.8999375104904175, "learning_rate": 1.6297117516629712e-06, "loss": 0.2712, "step": 147 }, { "epoch": 0.01663708961845608, "grad_norm": 3.3627398014068604, "learning_rate": 1.6629711751662973e-06, "loss": 0.27, "step": 150 }, { "epoch": 0.016969831410825198, "grad_norm": 2.289334535598755, "learning_rate": 1.6962305986696232e-06, "loss": 0.2442, "step": 153 }, { "epoch": 0.01730257320319432, "grad_norm": 1.4350817203521729, "learning_rate": 1.729490022172949e-06, "loss": 0.283, "step": 156 }, { "epoch": 0.017635314995563443, "grad_norm": 2.3187780380249023, "learning_rate": 1.762749445676275e-06, "loss": 0.2021, "step": 159 }, { "epoch": 0.017968056787932566, "grad_norm": 2.6814444065093994, "learning_rate": 1.796008869179601e-06, "loss": 0.2377, "step": 162 }, { "epoch": 0.018300798580301685, "grad_norm": 1.4875733852386475, "learning_rate": 1.8292682926829268e-06, "loss": 0.1578, "step": 165 }, { "epoch": 0.018633540372670808, "grad_norm": 1.5893361568450928, "learning_rate": 1.862527716186253e-06, "loss": 0.1904, "step": 168 }, { "epoch": 0.01896628216503993, "grad_norm": 2.5844883918762207, "learning_rate": 1.8957871396895788e-06, "loss": 0.2066, "step": 171 }, { "epoch": 0.01929902395740905, "grad_norm": 1.480236291885376, "learning_rate": 1.9290465631929047e-06, "loss": 0.2373, "step": 174 }, { "epoch": 0.019631765749778173, "grad_norm": 1.844759464263916, "learning_rate": 1.9623059866962307e-06, "loss": 0.2235, "step": 177 }, { "epoch": 0.019964507542147295, "grad_norm": 4.348615646362305, "learning_rate": 1.995565410199557e-06, "loss": 0.2003, "step": 180 }, { "epoch": 0.020297249334516414, "grad_norm": 9.635419845581055, "learning_rate": 2.0288248337028825e-06, "loss": 0.2646, "step": 183 }, { "epoch": 0.020629991126885537, "grad_norm": 1.521409273147583, "learning_rate": 2.0620842572062086e-06, "loss": 0.1599, "step": 186 }, { "epoch": 0.02096273291925466, "grad_norm": 3.996305227279663, "learning_rate": 2.0953436807095346e-06, "loss": 0.261, "step": 189 }, { "epoch": 0.02129547471162378, "grad_norm": 3.0902867317199707, "learning_rate": 2.1286031042128603e-06, "loss": 0.2138, "step": 192 }, { "epoch": 0.0216282165039929, "grad_norm": 1.6262084245681763, "learning_rate": 2.1618625277161864e-06, "loss": 0.2428, "step": 195 }, { "epoch": 0.021960958296362024, "grad_norm": 5.350574016571045, "learning_rate": 2.1951219512195125e-06, "loss": 0.2151, "step": 198 }, { "epoch": 0.022293700088731144, "grad_norm": 3.7961254119873047, "learning_rate": 2.228381374722838e-06, "loss": 0.2299, "step": 201 }, { "epoch": 0.022626441881100266, "grad_norm": 2.829364776611328, "learning_rate": 2.261640798226164e-06, "loss": 0.2001, "step": 204 }, { "epoch": 0.02295918367346939, "grad_norm": 1.8136563301086426, "learning_rate": 2.2949002217294903e-06, "loss": 0.1891, "step": 207 }, { "epoch": 0.023291925465838508, "grad_norm": 2.169090986251831, "learning_rate": 2.3281596452328164e-06, "loss": 0.1644, "step": 210 }, { "epoch": 0.02362466725820763, "grad_norm": 1.4115501642227173, "learning_rate": 2.361419068736142e-06, "loss": 0.1733, "step": 213 }, { "epoch": 0.023957409050576754, "grad_norm": 2.6943325996398926, "learning_rate": 2.394678492239468e-06, "loss": 0.2417, "step": 216 }, { "epoch": 0.024290150842945873, "grad_norm": 1.9311012029647827, "learning_rate": 2.427937915742794e-06, "loss": 0.13, "step": 219 }, { "epoch": 0.024622892635314995, "grad_norm": 1.9004130363464355, "learning_rate": 2.46119733924612e-06, "loss": 0.2017, "step": 222 }, { "epoch": 0.024955634427684118, "grad_norm": 1.9925228357315063, "learning_rate": 2.494456762749446e-06, "loss": 0.1299, "step": 225 }, { "epoch": 0.025288376220053237, "grad_norm": 6.313392639160156, "learning_rate": 2.5277161862527716e-06, "loss": 0.1386, "step": 228 }, { "epoch": 0.02562111801242236, "grad_norm": 5.422986030578613, "learning_rate": 2.5609756097560977e-06, "loss": 0.1577, "step": 231 }, { "epoch": 0.025953859804791483, "grad_norm": 3.6749863624572754, "learning_rate": 2.5942350332594233e-06, "loss": 0.1454, "step": 234 }, { "epoch": 0.026286601597160602, "grad_norm": 5.444757461547852, "learning_rate": 2.62749445676275e-06, "loss": 0.2229, "step": 237 }, { "epoch": 0.026619343389529725, "grad_norm": 10.121847152709961, "learning_rate": 2.6607538802660755e-06, "loss": 0.1786, "step": 240 }, { "epoch": 0.026952085181898847, "grad_norm": 3.036048173904419, "learning_rate": 2.6940133037694016e-06, "loss": 0.1981, "step": 243 }, { "epoch": 0.027284826974267967, "grad_norm": 5.769317626953125, "learning_rate": 2.7272727272727272e-06, "loss": 0.2239, "step": 246 }, { "epoch": 0.02761756876663709, "grad_norm": 3.947753667831421, "learning_rate": 2.7605321507760537e-06, "loss": 0.1723, "step": 249 }, { "epoch": 0.027950310559006212, "grad_norm": 3.1042640209198, "learning_rate": 2.7937915742793794e-06, "loss": 0.1552, "step": 252 }, { "epoch": 0.02828305235137533, "grad_norm": 3.8892836570739746, "learning_rate": 2.8270509977827055e-06, "loss": 0.1743, "step": 255 }, { "epoch": 0.028615794143744454, "grad_norm": 2.5682852268218994, "learning_rate": 2.860310421286031e-06, "loss": 0.1655, "step": 258 }, { "epoch": 0.028948535936113576, "grad_norm": 4.817167282104492, "learning_rate": 2.893569844789357e-06, "loss": 0.1164, "step": 261 }, { "epoch": 0.029281277728482696, "grad_norm": 7.413038730621338, "learning_rate": 2.926829268292683e-06, "loss": 0.1365, "step": 264 }, { "epoch": 0.02961401952085182, "grad_norm": 2.501265525817871, "learning_rate": 2.9600886917960094e-06, "loss": 0.2016, "step": 267 }, { "epoch": 0.02994676131322094, "grad_norm": 2.106494665145874, "learning_rate": 2.993348115299335e-06, "loss": 0.1178, "step": 270 }, { "epoch": 0.030279503105590064, "grad_norm": 2.704066753387451, "learning_rate": 3.026607538802661e-06, "loss": 0.1288, "step": 273 }, { "epoch": 0.030612244897959183, "grad_norm": 3.6587283611297607, "learning_rate": 3.0598669623059868e-06, "loss": 0.1946, "step": 276 }, { "epoch": 0.030944986690328306, "grad_norm": 7.615991592407227, "learning_rate": 3.093126385809313e-06, "loss": 0.1599, "step": 279 }, { "epoch": 0.03127772848269743, "grad_norm": 2.9951610565185547, "learning_rate": 3.1263858093126385e-06, "loss": 0.1746, "step": 282 }, { "epoch": 0.03161047027506655, "grad_norm": 5.550237655639648, "learning_rate": 3.159645232815965e-06, "loss": 0.1516, "step": 285 }, { "epoch": 0.03194321206743567, "grad_norm": 2.486086845397949, "learning_rate": 3.1929046563192907e-06, "loss": 0.2137, "step": 288 }, { "epoch": 0.03227595385980479, "grad_norm": 5.265333652496338, "learning_rate": 3.2261640798226168e-06, "loss": 0.2395, "step": 291 }, { "epoch": 0.03260869565217391, "grad_norm": 7.613430976867676, "learning_rate": 3.2594235033259424e-06, "loss": 0.1974, "step": 294 }, { "epoch": 0.03294143744454303, "grad_norm": 4.081305503845215, "learning_rate": 3.292682926829269e-06, "loss": 0.1561, "step": 297 }, { "epoch": 0.03327417923691216, "grad_norm": 2.143343687057495, "learning_rate": 3.3259423503325946e-06, "loss": 0.1528, "step": 300 }, { "epoch": 0.03360692102928128, "grad_norm": 2.6399543285369873, "learning_rate": 3.3592017738359207e-06, "loss": 0.1742, "step": 303 }, { "epoch": 0.033939662821650396, "grad_norm": 6.797262668609619, "learning_rate": 3.3924611973392463e-06, "loss": 0.1487, "step": 306 }, { "epoch": 0.03427240461401952, "grad_norm": 3.6419219970703125, "learning_rate": 3.4257206208425724e-06, "loss": 0.109, "step": 309 }, { "epoch": 0.03460514640638864, "grad_norm": 1.678947925567627, "learning_rate": 3.458980044345898e-06, "loss": 0.1372, "step": 312 }, { "epoch": 0.03493788819875776, "grad_norm": 6.261575222015381, "learning_rate": 3.4922394678492246e-06, "loss": 0.1712, "step": 315 }, { "epoch": 0.03527062999112689, "grad_norm": 4.248866558074951, "learning_rate": 3.52549889135255e-06, "loss": 0.1951, "step": 318 }, { "epoch": 0.035603371783496006, "grad_norm": 6.463629722595215, "learning_rate": 3.5587583148558763e-06, "loss": 0.1636, "step": 321 }, { "epoch": 0.03593611357586513, "grad_norm": 3.7620456218719482, "learning_rate": 3.592017738359202e-06, "loss": 0.125, "step": 324 }, { "epoch": 0.03626885536823425, "grad_norm": 7.120460033416748, "learning_rate": 3.625277161862528e-06, "loss": 0.1772, "step": 327 }, { "epoch": 0.03660159716060337, "grad_norm": 6.376923084259033, "learning_rate": 3.6585365853658537e-06, "loss": 0.1454, "step": 330 }, { "epoch": 0.0369343389529725, "grad_norm": 3.572136878967285, "learning_rate": 3.69179600886918e-06, "loss": 0.1487, "step": 333 }, { "epoch": 0.037267080745341616, "grad_norm": 10.40429973602295, "learning_rate": 3.725055432372506e-06, "loss": 0.1974, "step": 336 }, { "epoch": 0.037599822537710735, "grad_norm": 4.0785369873046875, "learning_rate": 3.758314855875832e-06, "loss": 0.1645, "step": 339 }, { "epoch": 0.03793256433007986, "grad_norm": 4.677783012390137, "learning_rate": 3.7915742793791576e-06, "loss": 0.1843, "step": 342 }, { "epoch": 0.03826530612244898, "grad_norm": 6.009620189666748, "learning_rate": 3.824833702882484e-06, "loss": 0.1457, "step": 345 }, { "epoch": 0.0385980479148181, "grad_norm": 3.66740083694458, "learning_rate": 3.858093126385809e-06, "loss": 0.1585, "step": 348 }, { "epoch": 0.038930789707187226, "grad_norm": 3.3587896823883057, "learning_rate": 3.891352549889136e-06, "loss": 0.1473, "step": 351 }, { "epoch": 0.039263531499556345, "grad_norm": 3.565919876098633, "learning_rate": 3.9246119733924615e-06, "loss": 0.1288, "step": 354 }, { "epoch": 0.039596273291925464, "grad_norm": 2.8366236686706543, "learning_rate": 3.957871396895788e-06, "loss": 0.1417, "step": 357 }, { "epoch": 0.03992901508429459, "grad_norm": 3.336660385131836, "learning_rate": 3.991130820399114e-06, "loss": 0.1448, "step": 360 }, { "epoch": 0.04026175687666371, "grad_norm": 10.178718566894531, "learning_rate": 4.024390243902439e-06, "loss": 0.1805, "step": 363 }, { "epoch": 0.04059449866903283, "grad_norm": 6.065635681152344, "learning_rate": 4.057649667405765e-06, "loss": 0.1239, "step": 366 }, { "epoch": 0.040927240461401955, "grad_norm": 1.9610888957977295, "learning_rate": 4.0909090909090915e-06, "loss": 0.1518, "step": 369 }, { "epoch": 0.041259982253771074, "grad_norm": 2.773746967315674, "learning_rate": 4.124168514412417e-06, "loss": 0.1735, "step": 372 }, { "epoch": 0.04159272404614019, "grad_norm": 4.9068074226379395, "learning_rate": 4.157427937915744e-06, "loss": 0.1587, "step": 375 }, { "epoch": 0.04192546583850932, "grad_norm": 3.2619822025299072, "learning_rate": 4.190687361419069e-06, "loss": 0.1564, "step": 378 }, { "epoch": 0.04225820763087844, "grad_norm": 2.9787867069244385, "learning_rate": 4.223946784922395e-06, "loss": 0.0813, "step": 381 }, { "epoch": 0.04259094942324756, "grad_norm": 6.57169771194458, "learning_rate": 4.257206208425721e-06, "loss": 0.1614, "step": 384 }, { "epoch": 0.042923691215616684, "grad_norm": 4.957389831542969, "learning_rate": 4.290465631929046e-06, "loss": 0.1114, "step": 387 }, { "epoch": 0.0432564330079858, "grad_norm": 2.741069793701172, "learning_rate": 4.323725055432373e-06, "loss": 0.1734, "step": 390 }, { "epoch": 0.04358917480035492, "grad_norm": 17.72960090637207, "learning_rate": 4.3569844789356984e-06, "loss": 0.1952, "step": 393 }, { "epoch": 0.04392191659272405, "grad_norm": 3.9304096698760986, "learning_rate": 4.390243902439025e-06, "loss": 0.1728, "step": 396 }, { "epoch": 0.04425465838509317, "grad_norm": 8.182540893554688, "learning_rate": 4.423503325942351e-06, "loss": 0.1359, "step": 399 }, { "epoch": 0.04458740017746229, "grad_norm": 3.8890185356140137, "learning_rate": 4.456762749445676e-06, "loss": 0.1472, "step": 402 }, { "epoch": 0.04492014196983141, "grad_norm": 5.049743175506592, "learning_rate": 4.490022172949003e-06, "loss": 0.1435, "step": 405 }, { "epoch": 0.04525288376220053, "grad_norm": 9.127073287963867, "learning_rate": 4.523281596452328e-06, "loss": 0.1558, "step": 408 }, { "epoch": 0.04558562555456965, "grad_norm": 9.523186683654785, "learning_rate": 4.556541019955654e-06, "loss": 0.1564, "step": 411 }, { "epoch": 0.04591836734693878, "grad_norm": 2.829712390899658, "learning_rate": 4.5898004434589806e-06, "loss": 0.1012, "step": 414 }, { "epoch": 0.0462511091393079, "grad_norm": 9.163731575012207, "learning_rate": 4.623059866962306e-06, "loss": 0.1796, "step": 417 }, { "epoch": 0.046583850931677016, "grad_norm": 11.374866485595703, "learning_rate": 4.656319290465633e-06, "loss": 0.1892, "step": 420 }, { "epoch": 0.04691659272404614, "grad_norm": 2.592428207397461, "learning_rate": 4.689578713968958e-06, "loss": 0.0865, "step": 423 }, { "epoch": 0.04724933451641526, "grad_norm": 6.728632926940918, "learning_rate": 4.722838137472284e-06, "loss": 0.1913, "step": 426 }, { "epoch": 0.04758207630878438, "grad_norm": 2.2662742137908936, "learning_rate": 4.75609756097561e-06, "loss": 0.1546, "step": 429 }, { "epoch": 0.04791481810115351, "grad_norm": 1.3336840867996216, "learning_rate": 4.789356984478936e-06, "loss": 0.1148, "step": 432 }, { "epoch": 0.048247559893522626, "grad_norm": 7.176014423370361, "learning_rate": 4.822616407982262e-06, "loss": 0.0976, "step": 435 }, { "epoch": 0.048580301685891746, "grad_norm": 2.6139843463897705, "learning_rate": 4.855875831485588e-06, "loss": 0.0934, "step": 438 }, { "epoch": 0.04891304347826087, "grad_norm": 4.2290825843811035, "learning_rate": 4.889135254988914e-06, "loss": 0.1469, "step": 441 }, { "epoch": 0.04924578527062999, "grad_norm": 5.063584804534912, "learning_rate": 4.92239467849224e-06, "loss": 0.122, "step": 444 }, { "epoch": 0.04957852706299911, "grad_norm": 2.5562191009521484, "learning_rate": 4.955654101995565e-06, "loss": 0.0855, "step": 447 }, { "epoch": 0.049911268855368236, "grad_norm": 3.33693528175354, "learning_rate": 4.988913525498892e-06, "loss": 0.1326, "step": 450 }, { "epoch": 0.050244010647737355, "grad_norm": 2.5598249435424805, "learning_rate": 5.0221729490022175e-06, "loss": 0.1261, "step": 453 }, { "epoch": 0.050576752440106475, "grad_norm": 5.065152645111084, "learning_rate": 5.055432372505543e-06, "loss": 0.1319, "step": 456 }, { "epoch": 0.0509094942324756, "grad_norm": 4.050305366516113, "learning_rate": 5.08869179600887e-06, "loss": 0.1381, "step": 459 }, { "epoch": 0.05124223602484472, "grad_norm": 3.2287888526916504, "learning_rate": 5.121951219512195e-06, "loss": 0.1208, "step": 462 }, { "epoch": 0.05157497781721384, "grad_norm": 6.50496244430542, "learning_rate": 5.155210643015521e-06, "loss": 0.1251, "step": 465 }, { "epoch": 0.051907719609582965, "grad_norm": 5.384757041931152, "learning_rate": 5.188470066518847e-06, "loss": 0.119, "step": 468 }, { "epoch": 0.052240461401952085, "grad_norm": 2.058398723602295, "learning_rate": 5.221729490022174e-06, "loss": 0.0751, "step": 471 }, { "epoch": 0.052573203194321204, "grad_norm": 1.4365679025650024, "learning_rate": 5.2549889135255e-06, "loss": 0.1215, "step": 474 }, { "epoch": 0.05290594498669033, "grad_norm": 2.306805372238159, "learning_rate": 5.288248337028825e-06, "loss": 0.125, "step": 477 }, { "epoch": 0.05323868677905945, "grad_norm": 9.586624145507812, "learning_rate": 5.321507760532151e-06, "loss": 0.1193, "step": 480 }, { "epoch": 0.05357142857142857, "grad_norm": 5.203243732452393, "learning_rate": 5.3547671840354775e-06, "loss": 0.1315, "step": 483 }, { "epoch": 0.053904170363797695, "grad_norm": 2.1737310886383057, "learning_rate": 5.388026607538803e-06, "loss": 0.1162, "step": 486 }, { "epoch": 0.054236912156166814, "grad_norm": 4.899698734283447, "learning_rate": 5.421286031042129e-06, "loss": 0.1377, "step": 489 }, { "epoch": 0.05456965394853593, "grad_norm": 5.738438606262207, "learning_rate": 5.4545454545454545e-06, "loss": 0.0804, "step": 492 }, { "epoch": 0.05490239574090506, "grad_norm": 2.1686787605285645, "learning_rate": 5.487804878048781e-06, "loss": 0.1548, "step": 495 }, { "epoch": 0.05523513753327418, "grad_norm": 4.465102672576904, "learning_rate": 5.5210643015521075e-06, "loss": 0.1174, "step": 498 }, { "epoch": 0.0555678793256433, "grad_norm": 5.7427144050598145, "learning_rate": 5.554323725055433e-06, "loss": 0.1745, "step": 501 }, { "epoch": 0.055900621118012424, "grad_norm": 2.415872573852539, "learning_rate": 5.587583148558759e-06, "loss": 0.1091, "step": 504 }, { "epoch": 0.05623336291038154, "grad_norm": 3.09688138961792, "learning_rate": 5.620842572062085e-06, "loss": 0.1198, "step": 507 }, { "epoch": 0.05656610470275066, "grad_norm": 2.555908679962158, "learning_rate": 5.654101995565411e-06, "loss": 0.0896, "step": 510 }, { "epoch": 0.05689884649511979, "grad_norm": 3.776787757873535, "learning_rate": 5.687361419068737e-06, "loss": 0.0987, "step": 513 }, { "epoch": 0.05723158828748891, "grad_norm": 21.37792205810547, "learning_rate": 5.720620842572062e-06, "loss": 0.1081, "step": 516 }, { "epoch": 0.05756433007985803, "grad_norm": 2.8083465099334717, "learning_rate": 5.753880266075389e-06, "loss": 0.1356, "step": 519 }, { "epoch": 0.05789707187222715, "grad_norm": 2.7282285690307617, "learning_rate": 5.787139689578714e-06, "loss": 0.1192, "step": 522 }, { "epoch": 0.05822981366459627, "grad_norm": 1.350984811782837, "learning_rate": 5.82039911308204e-06, "loss": 0.0941, "step": 525 }, { "epoch": 0.05856255545696539, "grad_norm": 3.55253529548645, "learning_rate": 5.853658536585366e-06, "loss": 0.1521, "step": 528 }, { "epoch": 0.05889529724933452, "grad_norm": 2.3493754863739014, "learning_rate": 5.886917960088693e-06, "loss": 0.085, "step": 531 }, { "epoch": 0.05922803904170364, "grad_norm": 1.8938840627670288, "learning_rate": 5.920177383592019e-06, "loss": 0.097, "step": 534 }, { "epoch": 0.05956078083407276, "grad_norm": 4.116845607757568, "learning_rate": 5.953436807095344e-06, "loss": 0.0801, "step": 537 }, { "epoch": 0.05989352262644188, "grad_norm": 3.6378135681152344, "learning_rate": 5.98669623059867e-06, "loss": 0.0803, "step": 540 }, { "epoch": 0.060226264418811, "grad_norm": 2.460942029953003, "learning_rate": 6.0199556541019966e-06, "loss": 0.0841, "step": 543 }, { "epoch": 0.06055900621118013, "grad_norm": 4.973423480987549, "learning_rate": 6.053215077605322e-06, "loss": 0.1105, "step": 546 }, { "epoch": 0.06089174800354925, "grad_norm": 2.1246228218078613, "learning_rate": 6.086474501108648e-06, "loss": 0.0714, "step": 549 }, { "epoch": 0.061224489795918366, "grad_norm": 4.982196807861328, "learning_rate": 6.1197339246119735e-06, "loss": 0.097, "step": 552 }, { "epoch": 0.06155723158828749, "grad_norm": 2.658125638961792, "learning_rate": 6.1529933481153e-06, "loss": 0.0476, "step": 555 }, { "epoch": 0.06188997338065661, "grad_norm": 5.696036338806152, "learning_rate": 6.186252771618626e-06, "loss": 0.1186, "step": 558 }, { "epoch": 0.06222271517302573, "grad_norm": 1.4269165992736816, "learning_rate": 6.219512195121951e-06, "loss": 0.0822, "step": 561 }, { "epoch": 0.06255545696539486, "grad_norm": 4.207926273345947, "learning_rate": 6.252771618625277e-06, "loss": 0.1305, "step": 564 }, { "epoch": 0.06288819875776397, "grad_norm": 5.113284111022949, "learning_rate": 6.286031042128604e-06, "loss": 0.146, "step": 567 }, { "epoch": 0.0632209405501331, "grad_norm": 3.1380107402801514, "learning_rate": 6.31929046563193e-06, "loss": 0.1212, "step": 570 }, { "epoch": 0.06355368234250222, "grad_norm": 3.6776905059814453, "learning_rate": 6.352549889135256e-06, "loss": 0.084, "step": 573 }, { "epoch": 0.06388642413487133, "grad_norm": 6.583403587341309, "learning_rate": 6.385809312638581e-06, "loss": 0.1396, "step": 576 }, { "epoch": 0.06421916592724046, "grad_norm": 4.759281158447266, "learning_rate": 6.419068736141908e-06, "loss": 0.1067, "step": 579 }, { "epoch": 0.06455190771960959, "grad_norm": 3.0004425048828125, "learning_rate": 6.4523281596452335e-06, "loss": 0.1336, "step": 582 }, { "epoch": 0.0648846495119787, "grad_norm": 2.129690408706665, "learning_rate": 6.485587583148559e-06, "loss": 0.1084, "step": 585 }, { "epoch": 0.06521739130434782, "grad_norm": 6.722754001617432, "learning_rate": 6.518847006651885e-06, "loss": 0.1129, "step": 588 }, { "epoch": 0.06555013309671695, "grad_norm": 3.5492303371429443, "learning_rate": 6.5521064301552105e-06, "loss": 0.0874, "step": 591 }, { "epoch": 0.06588287488908606, "grad_norm": 2.1250741481781006, "learning_rate": 6.585365853658538e-06, "loss": 0.0869, "step": 594 }, { "epoch": 0.06621561668145519, "grad_norm": 1.6546465158462524, "learning_rate": 6.6186252771618635e-06, "loss": 0.0457, "step": 597 }, { "epoch": 0.06654835847382432, "grad_norm": 3.9031965732574463, "learning_rate": 6.651884700665189e-06, "loss": 0.0801, "step": 600 }, { "epoch": 0.06688110026619343, "grad_norm": 3.683596611022949, "learning_rate": 6.685144124168515e-06, "loss": 0.1013, "step": 603 }, { "epoch": 0.06721384205856255, "grad_norm": 2.3814892768859863, "learning_rate": 6.718403547671841e-06, "loss": 0.08, "step": 606 }, { "epoch": 0.06754658385093168, "grad_norm": 1.4008212089538574, "learning_rate": 6.751662971175167e-06, "loss": 0.0855, "step": 609 }, { "epoch": 0.06787932564330079, "grad_norm": 1.6673588752746582, "learning_rate": 6.784922394678493e-06, "loss": 0.0768, "step": 612 }, { "epoch": 0.06821206743566992, "grad_norm": 2.4234447479248047, "learning_rate": 6.818181818181818e-06, "loss": 0.1391, "step": 615 }, { "epoch": 0.06854480922803904, "grad_norm": 3.969289779663086, "learning_rate": 6.851441241685145e-06, "loss": 0.1369, "step": 618 }, { "epoch": 0.06887755102040816, "grad_norm": 2.3645565509796143, "learning_rate": 6.8847006651884704e-06, "loss": 0.1299, "step": 621 }, { "epoch": 0.06921029281277728, "grad_norm": 3.8376338481903076, "learning_rate": 6.917960088691796e-06, "loss": 0.1371, "step": 624 }, { "epoch": 0.06954303460514641, "grad_norm": 2.8958778381347656, "learning_rate": 6.951219512195122e-06, "loss": 0.0995, "step": 627 }, { "epoch": 0.06987577639751552, "grad_norm": 1.2747795581817627, "learning_rate": 6.984478935698449e-06, "loss": 0.086, "step": 630 }, { "epoch": 0.07020851818988465, "grad_norm": 5.938511371612549, "learning_rate": 7.017738359201775e-06, "loss": 0.0894, "step": 633 }, { "epoch": 0.07054125998225377, "grad_norm": 1.856704831123352, "learning_rate": 7.0509977827051e-06, "loss": 0.0776, "step": 636 }, { "epoch": 0.0708740017746229, "grad_norm": 2.646843194961548, "learning_rate": 7.084257206208426e-06, "loss": 0.1126, "step": 639 }, { "epoch": 0.07120674356699201, "grad_norm": 1.8464751243591309, "learning_rate": 7.117516629711753e-06, "loss": 0.1175, "step": 642 }, { "epoch": 0.07153948535936114, "grad_norm": 2.5712878704071045, "learning_rate": 7.150776053215078e-06, "loss": 0.0778, "step": 645 }, { "epoch": 0.07187222715173026, "grad_norm": 6.000158786773682, "learning_rate": 7.184035476718404e-06, "loss": 0.1029, "step": 648 }, { "epoch": 0.07220496894409938, "grad_norm": 3.8735408782958984, "learning_rate": 7.2172949002217296e-06, "loss": 0.1012, "step": 651 }, { "epoch": 0.0725377107364685, "grad_norm": 3.5447640419006348, "learning_rate": 7.250554323725056e-06, "loss": 0.1358, "step": 654 }, { "epoch": 0.07287045252883763, "grad_norm": 4.614232540130615, "learning_rate": 7.283813747228382e-06, "loss": 0.0987, "step": 657 }, { "epoch": 0.07320319432120674, "grad_norm": 12.307831764221191, "learning_rate": 7.317073170731707e-06, "loss": 0.0896, "step": 660 }, { "epoch": 0.07353593611357587, "grad_norm": 3.838798761367798, "learning_rate": 7.350332594235033e-06, "loss": 0.1276, "step": 663 }, { "epoch": 0.073868677905945, "grad_norm": 6.117704391479492, "learning_rate": 7.38359201773836e-06, "loss": 0.1098, "step": 666 }, { "epoch": 0.0742014196983141, "grad_norm": 2.702186346054077, "learning_rate": 7.416851441241686e-06, "loss": 0.0861, "step": 669 }, { "epoch": 0.07453416149068323, "grad_norm": 2.2032158374786377, "learning_rate": 7.450110864745012e-06, "loss": 0.0892, "step": 672 }, { "epoch": 0.07486690328305236, "grad_norm": 2.3347060680389404, "learning_rate": 7.483370288248337e-06, "loss": 0.0641, "step": 675 }, { "epoch": 0.07519964507542147, "grad_norm": 1.5459059476852417, "learning_rate": 7.516629711751664e-06, "loss": 0.0942, "step": 678 }, { "epoch": 0.0755323868677906, "grad_norm": 1.662636160850525, "learning_rate": 7.5498891352549895e-06, "loss": 0.077, "step": 681 }, { "epoch": 0.07586512866015972, "grad_norm": 1.9959062337875366, "learning_rate": 7.583148558758315e-06, "loss": 0.0787, "step": 684 }, { "epoch": 0.07619787045252883, "grad_norm": 1.1479439735412598, "learning_rate": 7.616407982261641e-06, "loss": 0.0767, "step": 687 }, { "epoch": 0.07653061224489796, "grad_norm": 4.428623676300049, "learning_rate": 7.649667405764967e-06, "loss": 0.1073, "step": 690 }, { "epoch": 0.07686335403726709, "grad_norm": 3.6230320930480957, "learning_rate": 7.682926829268293e-06, "loss": 0.0858, "step": 693 }, { "epoch": 0.0771960958296362, "grad_norm": 5.269100666046143, "learning_rate": 7.716186252771619e-06, "loss": 0.1843, "step": 696 }, { "epoch": 0.07752883762200533, "grad_norm": 1.6854679584503174, "learning_rate": 7.749445676274944e-06, "loss": 0.091, "step": 699 }, { "epoch": 0.07786157941437445, "grad_norm": 6.573445796966553, "learning_rate": 7.782705099778272e-06, "loss": 0.084, "step": 702 }, { "epoch": 0.07819432120674356, "grad_norm": 4.061784267425537, "learning_rate": 7.815964523281597e-06, "loss": 0.1619, "step": 705 }, { "epoch": 0.07852706299911269, "grad_norm": 1.7626991271972656, "learning_rate": 7.849223946784923e-06, "loss": 0.094, "step": 708 }, { "epoch": 0.07885980479148182, "grad_norm": 3.9269795417785645, "learning_rate": 7.882483370288249e-06, "loss": 0.1343, "step": 711 }, { "epoch": 0.07919254658385093, "grad_norm": 1.755347490310669, "learning_rate": 7.915742793791576e-06, "loss": 0.1214, "step": 714 }, { "epoch": 0.07952528837622005, "grad_norm": 4.515873432159424, "learning_rate": 7.949002217294902e-06, "loss": 0.0926, "step": 717 }, { "epoch": 0.07985803016858918, "grad_norm": 2.192376136779785, "learning_rate": 7.982261640798227e-06, "loss": 0.0935, "step": 720 }, { "epoch": 0.0801907719609583, "grad_norm": 2.0695343017578125, "learning_rate": 8.015521064301553e-06, "loss": 0.0561, "step": 723 }, { "epoch": 0.08052351375332742, "grad_norm": 3.037898302078247, "learning_rate": 8.048780487804879e-06, "loss": 0.0744, "step": 726 }, { "epoch": 0.08085625554569655, "grad_norm": 4.291987419128418, "learning_rate": 8.082039911308204e-06, "loss": 0.1349, "step": 729 }, { "epoch": 0.08118899733806566, "grad_norm": 2.45395565032959, "learning_rate": 8.11529933481153e-06, "loss": 0.0849, "step": 732 }, { "epoch": 0.08152173913043478, "grad_norm": 2.638307571411133, "learning_rate": 8.148558758314856e-06, "loss": 0.1077, "step": 735 }, { "epoch": 0.08185448092280391, "grad_norm": 1.9898786544799805, "learning_rate": 8.181818181818183e-06, "loss": 0.1102, "step": 738 }, { "epoch": 0.08218722271517302, "grad_norm": 6.549917697906494, "learning_rate": 8.215077605321509e-06, "loss": 0.095, "step": 741 }, { "epoch": 0.08251996450754215, "grad_norm": 4.143815040588379, "learning_rate": 8.248337028824834e-06, "loss": 0.0837, "step": 744 }, { "epoch": 0.08285270629991127, "grad_norm": 4.698508262634277, "learning_rate": 8.28159645232816e-06, "loss": 0.1101, "step": 747 }, { "epoch": 0.08318544809228039, "grad_norm": 7.131821632385254, "learning_rate": 8.314855875831487e-06, "loss": 0.0822, "step": 750 }, { "epoch": 0.08351818988464951, "grad_norm": 4.01303243637085, "learning_rate": 8.348115299334813e-06, "loss": 0.1148, "step": 753 }, { "epoch": 0.08385093167701864, "grad_norm": 1.2312612533569336, "learning_rate": 8.381374722838139e-06, "loss": 0.1042, "step": 756 }, { "epoch": 0.08418367346938775, "grad_norm": 2.0417134761810303, "learning_rate": 8.414634146341464e-06, "loss": 0.0563, "step": 759 }, { "epoch": 0.08451641526175688, "grad_norm": 3.743445873260498, "learning_rate": 8.44789356984479e-06, "loss": 0.1435, "step": 762 }, { "epoch": 0.084849157054126, "grad_norm": 2.9309263229370117, "learning_rate": 8.481152993348116e-06, "loss": 0.1048, "step": 765 }, { "epoch": 0.08518189884649512, "grad_norm": 5.630063056945801, "learning_rate": 8.514412416851441e-06, "loss": 0.1025, "step": 768 }, { "epoch": 0.08551464063886424, "grad_norm": 5.362646102905273, "learning_rate": 8.547671840354767e-06, "loss": 0.1145, "step": 771 }, { "epoch": 0.08584738243123337, "grad_norm": 2.512972354888916, "learning_rate": 8.580931263858093e-06, "loss": 0.0918, "step": 774 }, { "epoch": 0.08618012422360248, "grad_norm": 2.9974989891052246, "learning_rate": 8.61419068736142e-06, "loss": 0.0975, "step": 777 }, { "epoch": 0.0865128660159716, "grad_norm": 2.3501694202423096, "learning_rate": 8.647450110864746e-06, "loss": 0.1047, "step": 780 }, { "epoch": 0.08684560780834073, "grad_norm": 1.074440360069275, "learning_rate": 8.680709534368071e-06, "loss": 0.0525, "step": 783 }, { "epoch": 0.08717834960070985, "grad_norm": 2.354565382003784, "learning_rate": 8.713968957871397e-06, "loss": 0.1204, "step": 786 }, { "epoch": 0.08751109139307897, "grad_norm": 1.7127368450164795, "learning_rate": 8.747228381374724e-06, "loss": 0.0983, "step": 789 }, { "epoch": 0.0878438331854481, "grad_norm": 2.8018808364868164, "learning_rate": 8.78048780487805e-06, "loss": 0.0983, "step": 792 }, { "epoch": 0.08817657497781721, "grad_norm": 2.6411852836608887, "learning_rate": 8.813747228381376e-06, "loss": 0.1175, "step": 795 }, { "epoch": 0.08850931677018634, "grad_norm": 1.3837209939956665, "learning_rate": 8.847006651884701e-06, "loss": 0.0821, "step": 798 }, { "epoch": 0.08884205856255546, "grad_norm": 2.039264440536499, "learning_rate": 8.880266075388027e-06, "loss": 0.0846, "step": 801 }, { "epoch": 0.08917480035492457, "grad_norm": 1.0869460105895996, "learning_rate": 8.913525498891353e-06, "loss": 0.1143, "step": 804 }, { "epoch": 0.0895075421472937, "grad_norm": 4.3048553466796875, "learning_rate": 8.946784922394678e-06, "loss": 0.176, "step": 807 }, { "epoch": 0.08984028393966283, "grad_norm": 1.4080750942230225, "learning_rate": 8.980044345898006e-06, "loss": 0.092, "step": 810 }, { "epoch": 0.09017302573203194, "grad_norm": 1.9575210809707642, "learning_rate": 9.013303769401331e-06, "loss": 0.08, "step": 813 }, { "epoch": 0.09050576752440107, "grad_norm": 1.1327650547027588, "learning_rate": 9.046563192904657e-06, "loss": 0.0531, "step": 816 }, { "epoch": 0.09083850931677019, "grad_norm": 2.430676221847534, "learning_rate": 9.079822616407982e-06, "loss": 0.0774, "step": 819 }, { "epoch": 0.0911712511091393, "grad_norm": 2.9505889415740967, "learning_rate": 9.113082039911308e-06, "loss": 0.0594, "step": 822 }, { "epoch": 0.09150399290150843, "grad_norm": 2.2100236415863037, "learning_rate": 9.146341463414635e-06, "loss": 0.0533, "step": 825 }, { "epoch": 0.09183673469387756, "grad_norm": 1.5018399953842163, "learning_rate": 9.179600886917961e-06, "loss": 0.0411, "step": 828 }, { "epoch": 0.09216947648624667, "grad_norm": 0.896511435508728, "learning_rate": 9.212860310421287e-06, "loss": 0.0389, "step": 831 }, { "epoch": 0.0925022182786158, "grad_norm": 2.447483777999878, "learning_rate": 9.246119733924612e-06, "loss": 0.0593, "step": 834 }, { "epoch": 0.09283496007098492, "grad_norm": 2.9552793502807617, "learning_rate": 9.27937915742794e-06, "loss": 0.0861, "step": 837 }, { "epoch": 0.09316770186335403, "grad_norm": 3.3599274158477783, "learning_rate": 9.312638580931265e-06, "loss": 0.1425, "step": 840 }, { "epoch": 0.09350044365572316, "grad_norm": 2.4016082286834717, "learning_rate": 9.345898004434591e-06, "loss": 0.0874, "step": 843 }, { "epoch": 0.09383318544809229, "grad_norm": 1.2596389055252075, "learning_rate": 9.379157427937917e-06, "loss": 0.0743, "step": 846 }, { "epoch": 0.0941659272404614, "grad_norm": 2.106100082397461, "learning_rate": 9.412416851441242e-06, "loss": 0.11, "step": 849 }, { "epoch": 0.09449866903283052, "grad_norm": 3.938939094543457, "learning_rate": 9.445676274944568e-06, "loss": 0.0737, "step": 852 }, { "epoch": 0.09483141082519965, "grad_norm": 1.3310282230377197, "learning_rate": 9.478935698447894e-06, "loss": 0.0815, "step": 855 }, { "epoch": 0.09516415261756876, "grad_norm": 2.442780017852783, "learning_rate": 9.51219512195122e-06, "loss": 0.0802, "step": 858 }, { "epoch": 0.09549689440993789, "grad_norm": 1.2009247541427612, "learning_rate": 9.545454545454547e-06, "loss": 0.1089, "step": 861 }, { "epoch": 0.09582963620230701, "grad_norm": 2.4786102771759033, "learning_rate": 9.578713968957872e-06, "loss": 0.0651, "step": 864 }, { "epoch": 0.09616237799467613, "grad_norm": 1.5461684465408325, "learning_rate": 9.611973392461198e-06, "loss": 0.0741, "step": 867 }, { "epoch": 0.09649511978704525, "grad_norm": 4.174925327301025, "learning_rate": 9.645232815964524e-06, "loss": 0.0843, "step": 870 }, { "epoch": 0.09682786157941438, "grad_norm": 2.366631507873535, "learning_rate": 9.678492239467851e-06, "loss": 0.0898, "step": 873 }, { "epoch": 0.09716060337178349, "grad_norm": 3.260111093521118, "learning_rate": 9.711751662971177e-06, "loss": 0.0778, "step": 876 }, { "epoch": 0.09749334516415262, "grad_norm": 2.4484000205993652, "learning_rate": 9.745011086474502e-06, "loss": 0.0786, "step": 879 }, { "epoch": 0.09782608695652174, "grad_norm": 1.547356128692627, "learning_rate": 9.778270509977828e-06, "loss": 0.08, "step": 882 }, { "epoch": 0.09815882874889086, "grad_norm": 1.792119026184082, "learning_rate": 9.811529933481154e-06, "loss": 0.0764, "step": 885 }, { "epoch": 0.09849157054125998, "grad_norm": 2.724433422088623, "learning_rate": 9.84478935698448e-06, "loss": 0.1165, "step": 888 }, { "epoch": 0.09882431233362911, "grad_norm": 1.7895309925079346, "learning_rate": 9.878048780487805e-06, "loss": 0.0694, "step": 891 }, { "epoch": 0.09915705412599822, "grad_norm": 1.9482133388519287, "learning_rate": 9.91130820399113e-06, "loss": 0.0908, "step": 894 }, { "epoch": 0.09948979591836735, "grad_norm": 2.0316059589385986, "learning_rate": 9.944567627494458e-06, "loss": 0.1026, "step": 897 }, { "epoch": 0.09982253771073647, "grad_norm": 1.9569859504699707, "learning_rate": 9.977827050997784e-06, "loss": 0.0637, "step": 900 }, { "epoch": 0.10015527950310558, "grad_norm": 1.181976318359375, "learning_rate": 1.0011086474501111e-05, "loss": 0.0521, "step": 903 }, { "epoch": 0.10048802129547471, "grad_norm": 4.954092025756836, "learning_rate": 1.0044345898004435e-05, "loss": 0.1118, "step": 906 }, { "epoch": 0.10082076308784384, "grad_norm": 3.5407698154449463, "learning_rate": 1.0077605321507762e-05, "loss": 0.142, "step": 909 }, { "epoch": 0.10115350488021295, "grad_norm": 3.300516128540039, "learning_rate": 1.0110864745011086e-05, "loss": 0.114, "step": 912 }, { "epoch": 0.10148624667258208, "grad_norm": 3.8588969707489014, "learning_rate": 1.0144124168514414e-05, "loss": 0.1279, "step": 915 }, { "epoch": 0.1018189884649512, "grad_norm": 1.1140732765197754, "learning_rate": 1.017738359201774e-05, "loss": 0.0726, "step": 918 }, { "epoch": 0.10215173025732031, "grad_norm": 2.5975515842437744, "learning_rate": 1.0210643015521065e-05, "loss": 0.0989, "step": 921 }, { "epoch": 0.10248447204968944, "grad_norm": 1.820598840713501, "learning_rate": 1.024390243902439e-05, "loss": 0.0797, "step": 924 }, { "epoch": 0.10281721384205857, "grad_norm": 2.6333160400390625, "learning_rate": 1.0277161862527718e-05, "loss": 0.082, "step": 927 }, { "epoch": 0.10314995563442768, "grad_norm": 2.148286819458008, "learning_rate": 1.0310421286031042e-05, "loss": 0.1158, "step": 930 }, { "epoch": 0.1034826974267968, "grad_norm": 2.047224760055542, "learning_rate": 1.034368070953437e-05, "loss": 0.0456, "step": 933 }, { "epoch": 0.10381543921916593, "grad_norm": 1.948140025138855, "learning_rate": 1.0376940133037693e-05, "loss": 0.1035, "step": 936 }, { "epoch": 0.10414818101153504, "grad_norm": 1.307896375656128, "learning_rate": 1.041019955654102e-05, "loss": 0.1038, "step": 939 }, { "epoch": 0.10448092280390417, "grad_norm": 1.479960560798645, "learning_rate": 1.0443458980044348e-05, "loss": 0.0425, "step": 942 }, { "epoch": 0.1048136645962733, "grad_norm": 1.023329734802246, "learning_rate": 1.0476718403547672e-05, "loss": 0.088, "step": 945 }, { "epoch": 0.10514640638864241, "grad_norm": 1.8618545532226562, "learning_rate": 1.0509977827051e-05, "loss": 0.0895, "step": 948 }, { "epoch": 0.10547914818101153, "grad_norm": 1.9717864990234375, "learning_rate": 1.0543237250554325e-05, "loss": 0.095, "step": 951 }, { "epoch": 0.10581188997338066, "grad_norm": 0.7104807496070862, "learning_rate": 1.057649667405765e-05, "loss": 0.0937, "step": 954 }, { "epoch": 0.10614463176574977, "grad_norm": 3.6240522861480713, "learning_rate": 1.0609756097560976e-05, "loss": 0.1039, "step": 957 }, { "epoch": 0.1064773735581189, "grad_norm": 2.7003328800201416, "learning_rate": 1.0643015521064302e-05, "loss": 0.1329, "step": 960 }, { "epoch": 0.10681011535048802, "grad_norm": 2.9740419387817383, "learning_rate": 1.0676274944567628e-05, "loss": 0.1441, "step": 963 }, { "epoch": 0.10714285714285714, "grad_norm": 3.0425968170166016, "learning_rate": 1.0709534368070955e-05, "loss": 0.0994, "step": 966 }, { "epoch": 0.10747559893522626, "grad_norm": 2.083329677581787, "learning_rate": 1.0742793791574279e-05, "loss": 0.0617, "step": 969 }, { "epoch": 0.10780834072759539, "grad_norm": 2.6448848247528076, "learning_rate": 1.0776053215077606e-05, "loss": 0.1541, "step": 972 }, { "epoch": 0.1081410825199645, "grad_norm": 1.4153597354888916, "learning_rate": 1.080931263858093e-05, "loss": 0.066, "step": 975 }, { "epoch": 0.10847382431233363, "grad_norm": 2.517460346221924, "learning_rate": 1.0842572062084258e-05, "loss": 0.12, "step": 978 }, { "epoch": 0.10880656610470275, "grad_norm": 2.0003175735473633, "learning_rate": 1.0875831485587585e-05, "loss": 0.0822, "step": 981 }, { "epoch": 0.10913930789707187, "grad_norm": 0.9331740140914917, "learning_rate": 1.0909090909090909e-05, "loss": 0.0944, "step": 984 }, { "epoch": 0.10947204968944099, "grad_norm": 2.203148365020752, "learning_rate": 1.0942350332594236e-05, "loss": 0.0985, "step": 987 }, { "epoch": 0.10980479148181012, "grad_norm": 1.6199864149093628, "learning_rate": 1.0975609756097562e-05, "loss": 0.1204, "step": 990 }, { "epoch": 0.11013753327417923, "grad_norm": 0.9034414887428284, "learning_rate": 1.1008869179600888e-05, "loss": 0.0877, "step": 993 }, { "epoch": 0.11047027506654836, "grad_norm": 1.9199018478393555, "learning_rate": 1.1042128603104215e-05, "loss": 0.0668, "step": 996 }, { "epoch": 0.11080301685891748, "grad_norm": 2.494936466217041, "learning_rate": 1.1075388026607539e-05, "loss": 0.1008, "step": 999 }, { "epoch": 0.1111357586512866, "grad_norm": 1.1572829484939575, "learning_rate": 1.1108647450110866e-05, "loss": 0.1153, "step": 1002 }, { "epoch": 0.11146850044365572, "grad_norm": 1.7787326574325562, "learning_rate": 1.1141906873614192e-05, "loss": 0.0817, "step": 1005 }, { "epoch": 0.11180124223602485, "grad_norm": 2.1385276317596436, "learning_rate": 1.1175166297117518e-05, "loss": 0.0675, "step": 1008 }, { "epoch": 0.11213398402839396, "grad_norm": 2.1192257404327393, "learning_rate": 1.1208425720620843e-05, "loss": 0.1183, "step": 1011 }, { "epoch": 0.11246672582076309, "grad_norm": 0.8357610106468201, "learning_rate": 1.124168514412417e-05, "loss": 0.0845, "step": 1014 }, { "epoch": 0.11279946761313221, "grad_norm": 1.3743176460266113, "learning_rate": 1.1274944567627495e-05, "loss": 0.0689, "step": 1017 }, { "epoch": 0.11313220940550132, "grad_norm": 2.223341703414917, "learning_rate": 1.1308203991130822e-05, "loss": 0.099, "step": 1020 }, { "epoch": 0.11346495119787045, "grad_norm": 0.8482286930084229, "learning_rate": 1.1341463414634146e-05, "loss": 0.0898, "step": 1023 }, { "epoch": 0.11379769299023958, "grad_norm": 0.7001283168792725, "learning_rate": 1.1374722838137473e-05, "loss": 0.0903, "step": 1026 }, { "epoch": 0.11413043478260869, "grad_norm": 1.2719107866287231, "learning_rate": 1.14079822616408e-05, "loss": 0.0785, "step": 1029 }, { "epoch": 0.11446317657497782, "grad_norm": 1.8513505458831787, "learning_rate": 1.1441241685144125e-05, "loss": 0.1028, "step": 1032 }, { "epoch": 0.11479591836734694, "grad_norm": 1.236826777458191, "learning_rate": 1.1474501108647452e-05, "loss": 0.0687, "step": 1035 }, { "epoch": 0.11512866015971605, "grad_norm": 3.7046868801116943, "learning_rate": 1.1507760532150778e-05, "loss": 0.0952, "step": 1038 }, { "epoch": 0.11546140195208518, "grad_norm": 1.4244076013565063, "learning_rate": 1.1541019955654103e-05, "loss": 0.0696, "step": 1041 }, { "epoch": 0.1157941437444543, "grad_norm": 0.7604947686195374, "learning_rate": 1.1574279379157429e-05, "loss": 0.071, "step": 1044 }, { "epoch": 0.11612688553682342, "grad_norm": 3.203418493270874, "learning_rate": 1.1607538802660754e-05, "loss": 0.1237, "step": 1047 }, { "epoch": 0.11645962732919254, "grad_norm": 1.999302864074707, "learning_rate": 1.164079822616408e-05, "loss": 0.1222, "step": 1050 }, { "epoch": 0.11679236912156167, "grad_norm": 1.4482122659683228, "learning_rate": 1.1674057649667408e-05, "loss": 0.0955, "step": 1053 }, { "epoch": 0.11712511091393078, "grad_norm": 2.992788314819336, "learning_rate": 1.1707317073170731e-05, "loss": 0.0817, "step": 1056 }, { "epoch": 0.11745785270629991, "grad_norm": 2.3243660926818848, "learning_rate": 1.1740576496674059e-05, "loss": 0.0869, "step": 1059 }, { "epoch": 0.11779059449866904, "grad_norm": 1.3292882442474365, "learning_rate": 1.1773835920177386e-05, "loss": 0.1149, "step": 1062 }, { "epoch": 0.11812333629103816, "grad_norm": 1.1202009916305542, "learning_rate": 1.180709534368071e-05, "loss": 0.0704, "step": 1065 }, { "epoch": 0.11845607808340727, "grad_norm": 1.1369428634643555, "learning_rate": 1.1840354767184037e-05, "loss": 0.0578, "step": 1068 }, { "epoch": 0.1187888198757764, "grad_norm": 0.7651126980781555, "learning_rate": 1.1873614190687361e-05, "loss": 0.0703, "step": 1071 }, { "epoch": 0.11912156166814553, "grad_norm": 2.069145679473877, "learning_rate": 1.1906873614190689e-05, "loss": 0.1073, "step": 1074 }, { "epoch": 0.11945430346051464, "grad_norm": 1.1232635974884033, "learning_rate": 1.1940133037694014e-05, "loss": 0.0553, "step": 1077 }, { "epoch": 0.11978704525288376, "grad_norm": 1.1090096235275269, "learning_rate": 1.197339246119734e-05, "loss": 0.0474, "step": 1080 }, { "epoch": 0.12011978704525289, "grad_norm": 1.455905795097351, "learning_rate": 1.2006651884700666e-05, "loss": 0.0558, "step": 1083 }, { "epoch": 0.120452528837622, "grad_norm": 1.1879817247390747, "learning_rate": 1.2039911308203993e-05, "loss": 0.0824, "step": 1086 }, { "epoch": 0.12078527062999113, "grad_norm": 0.5910300612449646, "learning_rate": 1.2073170731707317e-05, "loss": 0.0763, "step": 1089 }, { "epoch": 0.12111801242236025, "grad_norm": 1.6487340927124023, "learning_rate": 1.2106430155210644e-05, "loss": 0.1015, "step": 1092 }, { "epoch": 0.12145075421472937, "grad_norm": 0.9820619225502014, "learning_rate": 1.2139689578713968e-05, "loss": 0.0731, "step": 1095 }, { "epoch": 0.1217834960070985, "grad_norm": 1.3096563816070557, "learning_rate": 1.2172949002217296e-05, "loss": 0.0558, "step": 1098 }, { "epoch": 0.12211623779946762, "grad_norm": 1.0159225463867188, "learning_rate": 1.2206208425720623e-05, "loss": 0.0713, "step": 1101 }, { "epoch": 0.12244897959183673, "grad_norm": 1.4698221683502197, "learning_rate": 1.2239467849223947e-05, "loss": 0.072, "step": 1104 }, { "epoch": 0.12278172138420586, "grad_norm": 1.3137110471725464, "learning_rate": 1.2272727272727274e-05, "loss": 0.0878, "step": 1107 }, { "epoch": 0.12311446317657498, "grad_norm": 3.4029998779296875, "learning_rate": 1.23059866962306e-05, "loss": 0.1421, "step": 1110 }, { "epoch": 0.1234472049689441, "grad_norm": 1.2445021867752075, "learning_rate": 1.2339246119733926e-05, "loss": 0.0577, "step": 1113 }, { "epoch": 0.12377994676131322, "grad_norm": 1.1267825365066528, "learning_rate": 1.2372505543237251e-05, "loss": 0.0462, "step": 1116 }, { "epoch": 0.12411268855368235, "grad_norm": 2.012746810913086, "learning_rate": 1.2405764966740577e-05, "loss": 0.0548, "step": 1119 }, { "epoch": 0.12444543034605146, "grad_norm": 2.3715696334838867, "learning_rate": 1.2439024390243903e-05, "loss": 0.0911, "step": 1122 }, { "epoch": 0.12477817213842059, "grad_norm": 2.5073225498199463, "learning_rate": 1.247228381374723e-05, "loss": 0.1207, "step": 1125 }, { "epoch": 0.1251109139307897, "grad_norm": 1.119004726409912, "learning_rate": 1.2505543237250554e-05, "loss": 0.0619, "step": 1128 }, { "epoch": 0.12544365572315883, "grad_norm": 1.7201495170593262, "learning_rate": 1.2538802660753881e-05, "loss": 0.0628, "step": 1131 }, { "epoch": 0.12577639751552794, "grad_norm": 1.2569254636764526, "learning_rate": 1.2572062084257209e-05, "loss": 0.0748, "step": 1134 }, { "epoch": 0.12610913930789708, "grad_norm": 1.680189847946167, "learning_rate": 1.2605321507760533e-05, "loss": 0.0971, "step": 1137 }, { "epoch": 0.1264418811002662, "grad_norm": 2.9206900596618652, "learning_rate": 1.263858093126386e-05, "loss": 0.1017, "step": 1140 }, { "epoch": 0.1267746228926353, "grad_norm": 1.3998111486434937, "learning_rate": 1.2671840354767184e-05, "loss": 0.0834, "step": 1143 }, { "epoch": 0.12710736468500444, "grad_norm": 1.6062500476837158, "learning_rate": 1.2705099778270511e-05, "loss": 0.0836, "step": 1146 }, { "epoch": 0.12744010647737355, "grad_norm": 2.6162166595458984, "learning_rate": 1.2738359201773837e-05, "loss": 0.092, "step": 1149 }, { "epoch": 0.12777284826974267, "grad_norm": 0.8380796313285828, "learning_rate": 1.2771618625277163e-05, "loss": 0.0796, "step": 1152 }, { "epoch": 0.1281055900621118, "grad_norm": 2.0702953338623047, "learning_rate": 1.2804878048780488e-05, "loss": 0.0879, "step": 1155 }, { "epoch": 0.12843833185448092, "grad_norm": 1.3157343864440918, "learning_rate": 1.2838137472283816e-05, "loss": 0.0608, "step": 1158 }, { "epoch": 0.12877107364685003, "grad_norm": 1.341902256011963, "learning_rate": 1.287139689578714e-05, "loss": 0.1094, "step": 1161 }, { "epoch": 0.12910381543921917, "grad_norm": 0.824084460735321, "learning_rate": 1.2904656319290467e-05, "loss": 0.0781, "step": 1164 }, { "epoch": 0.12943655723158828, "grad_norm": 1.1117424964904785, "learning_rate": 1.2937915742793791e-05, "loss": 0.1018, "step": 1167 }, { "epoch": 0.1297692990239574, "grad_norm": 1.7318905591964722, "learning_rate": 1.2971175166297118e-05, "loss": 0.0878, "step": 1170 }, { "epoch": 0.13010204081632654, "grad_norm": 1.409029245376587, "learning_rate": 1.3004434589800446e-05, "loss": 0.0929, "step": 1173 }, { "epoch": 0.13043478260869565, "grad_norm": 0.7251594066619873, "learning_rate": 1.303769401330377e-05, "loss": 0.0682, "step": 1176 }, { "epoch": 0.13076752440106476, "grad_norm": 1.8068056106567383, "learning_rate": 1.3070953436807097e-05, "loss": 0.0605, "step": 1179 }, { "epoch": 0.1311002661934339, "grad_norm": 1.1403309106826782, "learning_rate": 1.3104212860310421e-05, "loss": 0.0509, "step": 1182 }, { "epoch": 0.131433007985803, "grad_norm": 0.762871265411377, "learning_rate": 1.3137472283813748e-05, "loss": 0.0482, "step": 1185 }, { "epoch": 0.13176574977817213, "grad_norm": 1.763113021850586, "learning_rate": 1.3170731707317076e-05, "loss": 0.1115, "step": 1188 }, { "epoch": 0.13209849157054127, "grad_norm": 1.5917179584503174, "learning_rate": 1.32039911308204e-05, "loss": 0.057, "step": 1191 }, { "epoch": 0.13243123336291038, "grad_norm": 0.7929653525352478, "learning_rate": 1.3237250554323727e-05, "loss": 0.0823, "step": 1194 }, { "epoch": 0.1327639751552795, "grad_norm": 1.1743680238723755, "learning_rate": 1.3270509977827053e-05, "loss": 0.0599, "step": 1197 }, { "epoch": 0.13309671694764863, "grad_norm": 0.9967820048332214, "learning_rate": 1.3303769401330378e-05, "loss": 0.071, "step": 1200 }, { "epoch": 0.13342945874001774, "grad_norm": 0.8471358418464661, "learning_rate": 1.3337028824833704e-05, "loss": 0.1133, "step": 1203 }, { "epoch": 0.13376220053238685, "grad_norm": 0.8135191798210144, "learning_rate": 1.337028824833703e-05, "loss": 0.0531, "step": 1206 }, { "epoch": 0.134094942324756, "grad_norm": 0.9596357941627502, "learning_rate": 1.3403547671840355e-05, "loss": 0.0593, "step": 1209 }, { "epoch": 0.1344276841171251, "grad_norm": 1.2310141324996948, "learning_rate": 1.3436807095343683e-05, "loss": 0.0736, "step": 1212 }, { "epoch": 0.13476042590949422, "grad_norm": 0.7923886775970459, "learning_rate": 1.3470066518847007e-05, "loss": 0.0378, "step": 1215 }, { "epoch": 0.13509316770186336, "grad_norm": 1.8099709749221802, "learning_rate": 1.3503325942350334e-05, "loss": 0.0842, "step": 1218 }, { "epoch": 0.13542590949423247, "grad_norm": 1.16802179813385, "learning_rate": 1.3536585365853661e-05, "loss": 0.0948, "step": 1221 }, { "epoch": 0.13575865128660158, "grad_norm": 0.9041295647621155, "learning_rate": 1.3569844789356985e-05, "loss": 0.0599, "step": 1224 }, { "epoch": 0.13609139307897072, "grad_norm": 1.2614595890045166, "learning_rate": 1.3603104212860313e-05, "loss": 0.1014, "step": 1227 }, { "epoch": 0.13642413487133984, "grad_norm": 0.920734167098999, "learning_rate": 1.3636363636363637e-05, "loss": 0.0868, "step": 1230 }, { "epoch": 0.13675687666370895, "grad_norm": 1.2486292123794556, "learning_rate": 1.3669623059866964e-05, "loss": 0.0542, "step": 1233 }, { "epoch": 0.1370896184560781, "grad_norm": 0.8448286056518555, "learning_rate": 1.370288248337029e-05, "loss": 0.0774, "step": 1236 }, { "epoch": 0.1374223602484472, "grad_norm": 0.9328780174255371, "learning_rate": 1.3736141906873615e-05, "loss": 0.0715, "step": 1239 }, { "epoch": 0.1377551020408163, "grad_norm": 2.634031295776367, "learning_rate": 1.3769401330376941e-05, "loss": 0.0885, "step": 1242 }, { "epoch": 0.13808784383318545, "grad_norm": 2.154402494430542, "learning_rate": 1.3802660753880268e-05, "loss": 0.0868, "step": 1245 }, { "epoch": 0.13842058562555457, "grad_norm": 1.1136975288391113, "learning_rate": 1.3835920177383592e-05, "loss": 0.0365, "step": 1248 }, { "epoch": 0.13875332741792368, "grad_norm": 2.3923723697662354, "learning_rate": 1.386917960088692e-05, "loss": 0.0803, "step": 1251 }, { "epoch": 0.13908606921029282, "grad_norm": 0.622138500213623, "learning_rate": 1.3902439024390244e-05, "loss": 0.0424, "step": 1254 }, { "epoch": 0.13941881100266193, "grad_norm": 1.8852219581604004, "learning_rate": 1.3935698447893571e-05, "loss": 0.0748, "step": 1257 }, { "epoch": 0.13975155279503104, "grad_norm": 2.8278660774230957, "learning_rate": 1.3968957871396898e-05, "loss": 0.1544, "step": 1260 }, { "epoch": 0.14008429458740018, "grad_norm": 1.8192918300628662, "learning_rate": 1.4002217294900222e-05, "loss": 0.0861, "step": 1263 }, { "epoch": 0.1404170363797693, "grad_norm": 1.0294805765151978, "learning_rate": 1.403547671840355e-05, "loss": 0.0836, "step": 1266 }, { "epoch": 0.14074977817213843, "grad_norm": 2.797724962234497, "learning_rate": 1.4068736141906875e-05, "loss": 0.0828, "step": 1269 }, { "epoch": 0.14108251996450755, "grad_norm": 4.029115676879883, "learning_rate": 1.41019955654102e-05, "loss": 0.1656, "step": 1272 }, { "epoch": 0.14141526175687666, "grad_norm": 1.827744722366333, "learning_rate": 1.4135254988913527e-05, "loss": 0.0839, "step": 1275 }, { "epoch": 0.1417480035492458, "grad_norm": 1.3064441680908203, "learning_rate": 1.4168514412416852e-05, "loss": 0.0645, "step": 1278 }, { "epoch": 0.1420807453416149, "grad_norm": 1.062459945678711, "learning_rate": 1.4201773835920178e-05, "loss": 0.0701, "step": 1281 }, { "epoch": 0.14241348713398402, "grad_norm": 2.5912277698516846, "learning_rate": 1.4235033259423505e-05, "loss": 0.0709, "step": 1284 }, { "epoch": 0.14274622892635316, "grad_norm": 2.9151272773742676, "learning_rate": 1.4268292682926829e-05, "loss": 0.071, "step": 1287 }, { "epoch": 0.14307897071872228, "grad_norm": 1.558341145515442, "learning_rate": 1.4301552106430156e-05, "loss": 0.0492, "step": 1290 }, { "epoch": 0.1434117125110914, "grad_norm": 1.0456184148788452, "learning_rate": 1.4334811529933484e-05, "loss": 0.0638, "step": 1293 }, { "epoch": 0.14374445430346053, "grad_norm": 2.345285654067993, "learning_rate": 1.4368070953436808e-05, "loss": 0.1137, "step": 1296 }, { "epoch": 0.14407719609582964, "grad_norm": 0.7095193862915039, "learning_rate": 1.4401330376940135e-05, "loss": 0.0696, "step": 1299 }, { "epoch": 0.14440993788819875, "grad_norm": 3.114135265350342, "learning_rate": 1.4434589800443459e-05, "loss": 0.0814, "step": 1302 }, { "epoch": 0.1447426796805679, "grad_norm": 0.9389676451683044, "learning_rate": 1.4467849223946786e-05, "loss": 0.0753, "step": 1305 }, { "epoch": 0.145075421472937, "grad_norm": 0.8419822454452515, "learning_rate": 1.4501108647450112e-05, "loss": 0.0844, "step": 1308 }, { "epoch": 0.14540816326530612, "grad_norm": 0.8320896029472351, "learning_rate": 1.4534368070953438e-05, "loss": 0.0753, "step": 1311 }, { "epoch": 0.14574090505767526, "grad_norm": 0.8611564040184021, "learning_rate": 1.4567627494456763e-05, "loss": 0.096, "step": 1314 }, { "epoch": 0.14607364685004437, "grad_norm": 1.4661246538162231, "learning_rate": 1.460088691796009e-05, "loss": 0.1041, "step": 1317 }, { "epoch": 0.14640638864241348, "grad_norm": 1.8236814737319946, "learning_rate": 1.4634146341463415e-05, "loss": 0.0713, "step": 1320 }, { "epoch": 0.14673913043478262, "grad_norm": 0.9663561582565308, "learning_rate": 1.4667405764966742e-05, "loss": 0.0731, "step": 1323 }, { "epoch": 0.14707187222715173, "grad_norm": 0.6863815784454346, "learning_rate": 1.4700665188470066e-05, "loss": 0.051, "step": 1326 }, { "epoch": 0.14740461401952085, "grad_norm": 1.6920056343078613, "learning_rate": 1.4733924611973393e-05, "loss": 0.0945, "step": 1329 }, { "epoch": 0.14773735581189, "grad_norm": 1.8799203634262085, "learning_rate": 1.476718403547672e-05, "loss": 0.096, "step": 1332 }, { "epoch": 0.1480700976042591, "grad_norm": 1.00322425365448, "learning_rate": 1.4800443458980045e-05, "loss": 0.0616, "step": 1335 }, { "epoch": 0.1484028393966282, "grad_norm": 1.161470890045166, "learning_rate": 1.4833702882483372e-05, "loss": 0.0453, "step": 1338 }, { "epoch": 0.14873558118899735, "grad_norm": 0.7702887654304504, "learning_rate": 1.4866962305986698e-05, "loss": 0.0547, "step": 1341 }, { "epoch": 0.14906832298136646, "grad_norm": 1.6348271369934082, "learning_rate": 1.4900221729490023e-05, "loss": 0.102, "step": 1344 }, { "epoch": 0.14940106477373558, "grad_norm": 0.4332348704338074, "learning_rate": 1.4933481152993349e-05, "loss": 0.049, "step": 1347 }, { "epoch": 0.14973380656610472, "grad_norm": 1.4695441722869873, "learning_rate": 1.4966740576496675e-05, "loss": 0.0876, "step": 1350 }, { "epoch": 0.15006654835847383, "grad_norm": 2.274678945541382, "learning_rate": 1.5000000000000002e-05, "loss": 0.1042, "step": 1353 }, { "epoch": 0.15039929015084294, "grad_norm": 0.7162629961967468, "learning_rate": 1.5033259423503328e-05, "loss": 0.0602, "step": 1356 }, { "epoch": 0.15073203194321208, "grad_norm": 0.4464901089668274, "learning_rate": 1.5066518847006653e-05, "loss": 0.0684, "step": 1359 }, { "epoch": 0.1510647737355812, "grad_norm": 0.4327393174171448, "learning_rate": 1.5099778270509979e-05, "loss": 0.0544, "step": 1362 }, { "epoch": 0.1513975155279503, "grad_norm": 1.8435128927230835, "learning_rate": 1.5133037694013305e-05, "loss": 0.0417, "step": 1365 }, { "epoch": 0.15173025732031944, "grad_norm": 0.38839346170425415, "learning_rate": 1.516629711751663e-05, "loss": 0.0393, "step": 1368 }, { "epoch": 0.15206299911268856, "grad_norm": 1.142568588256836, "learning_rate": 1.5199556541019958e-05, "loss": 0.0927, "step": 1371 }, { "epoch": 0.15239574090505767, "grad_norm": 3.3241283893585205, "learning_rate": 1.5232815964523282e-05, "loss": 0.1182, "step": 1374 }, { "epoch": 0.1527284826974268, "grad_norm": 4.283749103546143, "learning_rate": 1.5266075388026607e-05, "loss": 0.0854, "step": 1377 }, { "epoch": 0.15306122448979592, "grad_norm": 1.1255176067352295, "learning_rate": 1.5299334811529935e-05, "loss": 0.0732, "step": 1380 }, { "epoch": 0.15339396628216503, "grad_norm": 0.7234096527099609, "learning_rate": 1.533259423503326e-05, "loss": 0.0985, "step": 1383 }, { "epoch": 0.15372670807453417, "grad_norm": 2.3150339126586914, "learning_rate": 1.5365853658536586e-05, "loss": 0.0778, "step": 1386 }, { "epoch": 0.1540594498669033, "grad_norm": 1.1159558296203613, "learning_rate": 1.539911308203991e-05, "loss": 0.0856, "step": 1389 }, { "epoch": 0.1543921916592724, "grad_norm": 1.5602436065673828, "learning_rate": 1.5432372505543237e-05, "loss": 0.0718, "step": 1392 }, { "epoch": 0.15472493345164154, "grad_norm": 1.5615876913070679, "learning_rate": 1.5465631929046565e-05, "loss": 0.0837, "step": 1395 }, { "epoch": 0.15505767524401065, "grad_norm": 1.638400912284851, "learning_rate": 1.549889135254989e-05, "loss": 0.0821, "step": 1398 }, { "epoch": 0.15539041703637976, "grad_norm": 1.362937092781067, "learning_rate": 1.5532150776053216e-05, "loss": 0.0681, "step": 1401 }, { "epoch": 0.1557231588287489, "grad_norm": 0.9878178834915161, "learning_rate": 1.5565410199556543e-05, "loss": 0.0391, "step": 1404 }, { "epoch": 0.15605590062111802, "grad_norm": 1.7634403705596924, "learning_rate": 1.5598669623059867e-05, "loss": 0.0639, "step": 1407 }, { "epoch": 0.15638864241348713, "grad_norm": 1.1708582639694214, "learning_rate": 1.5631929046563195e-05, "loss": 0.0628, "step": 1410 }, { "epoch": 0.15672138420585627, "grad_norm": 2.7513132095336914, "learning_rate": 1.566518847006652e-05, "loss": 0.1164, "step": 1413 }, { "epoch": 0.15705412599822538, "grad_norm": 1.1276545524597168, "learning_rate": 1.5698447893569846e-05, "loss": 0.0661, "step": 1416 }, { "epoch": 0.1573868677905945, "grad_norm": 1.1963578462600708, "learning_rate": 1.5731707317073173e-05, "loss": 0.0607, "step": 1419 }, { "epoch": 0.15771960958296363, "grad_norm": 0.8448947668075562, "learning_rate": 1.5764966740576497e-05, "loss": 0.084, "step": 1422 }, { "epoch": 0.15805235137533274, "grad_norm": 0.3935372829437256, "learning_rate": 1.5798226164079825e-05, "loss": 0.0739, "step": 1425 }, { "epoch": 0.15838509316770186, "grad_norm": 1.3059343099594116, "learning_rate": 1.5831485587583152e-05, "loss": 0.0491, "step": 1428 }, { "epoch": 0.158717834960071, "grad_norm": 0.7886009216308594, "learning_rate": 1.5864745011086476e-05, "loss": 0.0866, "step": 1431 }, { "epoch": 0.1590505767524401, "grad_norm": 1.6436578035354614, "learning_rate": 1.5898004434589803e-05, "loss": 0.08, "step": 1434 }, { "epoch": 0.15938331854480922, "grad_norm": 2.8411166667938232, "learning_rate": 1.5931263858093127e-05, "loss": 0.0915, "step": 1437 }, { "epoch": 0.15971606033717836, "grad_norm": 2.2763893604278564, "learning_rate": 1.5964523281596455e-05, "loss": 0.0835, "step": 1440 }, { "epoch": 0.16004880212954747, "grad_norm": 1.2924195528030396, "learning_rate": 1.599778270509978e-05, "loss": 0.0891, "step": 1443 }, { "epoch": 0.1603815439219166, "grad_norm": 1.3674718141555786, "learning_rate": 1.6031042128603106e-05, "loss": 0.0566, "step": 1446 }, { "epoch": 0.16071428571428573, "grad_norm": 1.0053977966308594, "learning_rate": 1.6064301552106433e-05, "loss": 0.0694, "step": 1449 }, { "epoch": 0.16104702750665484, "grad_norm": 2.6498258113861084, "learning_rate": 1.6097560975609757e-05, "loss": 0.1074, "step": 1452 }, { "epoch": 0.16137976929902395, "grad_norm": 1.2702534198760986, "learning_rate": 1.6130820399113085e-05, "loss": 0.0937, "step": 1455 }, { "epoch": 0.1617125110913931, "grad_norm": 0.890235185623169, "learning_rate": 1.616407982261641e-05, "loss": 0.059, "step": 1458 }, { "epoch": 0.1620452528837622, "grad_norm": 0.566232442855835, "learning_rate": 1.6197339246119736e-05, "loss": 0.0671, "step": 1461 }, { "epoch": 0.16237799467613132, "grad_norm": 2.0057742595672607, "learning_rate": 1.623059866962306e-05, "loss": 0.0909, "step": 1464 }, { "epoch": 0.16271073646850046, "grad_norm": 0.42235231399536133, "learning_rate": 1.6263858093126387e-05, "loss": 0.0419, "step": 1467 }, { "epoch": 0.16304347826086957, "grad_norm": 0.7255648970603943, "learning_rate": 1.629711751662971e-05, "loss": 0.0719, "step": 1470 }, { "epoch": 0.16337622005323868, "grad_norm": 0.6950805187225342, "learning_rate": 1.633037694013304e-05, "loss": 0.0845, "step": 1473 }, { "epoch": 0.16370896184560782, "grad_norm": 1.140456199645996, "learning_rate": 1.6363636363636366e-05, "loss": 0.069, "step": 1476 }, { "epoch": 0.16404170363797693, "grad_norm": 0.648154616355896, "learning_rate": 1.639689578713969e-05, "loss": 0.048, "step": 1479 }, { "epoch": 0.16437444543034604, "grad_norm": 0.783742368221283, "learning_rate": 1.6430155210643017e-05, "loss": 0.1239, "step": 1482 }, { "epoch": 0.16470718722271518, "grad_norm": 2.055687427520752, "learning_rate": 1.646341463414634e-05, "loss": 0.0548, "step": 1485 }, { "epoch": 0.1650399290150843, "grad_norm": 1.3763246536254883, "learning_rate": 1.649667405764967e-05, "loss": 0.071, "step": 1488 }, { "epoch": 0.1653726708074534, "grad_norm": 1.0269008874893188, "learning_rate": 1.6529933481152996e-05, "loss": 0.0749, "step": 1491 }, { "epoch": 0.16570541259982255, "grad_norm": 1.4048177003860474, "learning_rate": 1.656319290465632e-05, "loss": 0.117, "step": 1494 }, { "epoch": 0.16603815439219166, "grad_norm": 0.9395716786384583, "learning_rate": 1.6596452328159647e-05, "loss": 0.09, "step": 1497 }, { "epoch": 0.16637089618456077, "grad_norm": 0.6683193445205688, "learning_rate": 1.6629711751662975e-05, "loss": 0.0512, "step": 1500 }, { "epoch": 0.1667036379769299, "grad_norm": 1.3794102668762207, "learning_rate": 1.66629711751663e-05, "loss": 0.1061, "step": 1503 }, { "epoch": 0.16703637976929903, "grad_norm": 1.6345226764678955, "learning_rate": 1.6696230598669626e-05, "loss": 0.074, "step": 1506 }, { "epoch": 0.16736912156166814, "grad_norm": 0.9517784118652344, "learning_rate": 1.672949002217295e-05, "loss": 0.0905, "step": 1509 }, { "epoch": 0.16770186335403728, "grad_norm": 1.4260486364364624, "learning_rate": 1.6762749445676277e-05, "loss": 0.079, "step": 1512 }, { "epoch": 0.1680346051464064, "grad_norm": 0.947848379611969, "learning_rate": 1.6796008869179605e-05, "loss": 0.0754, "step": 1515 }, { "epoch": 0.1683673469387755, "grad_norm": 0.8699008226394653, "learning_rate": 1.682926829268293e-05, "loss": 0.0551, "step": 1518 }, { "epoch": 0.16870008873114464, "grad_norm": 1.405056118965149, "learning_rate": 1.6862527716186256e-05, "loss": 0.1045, "step": 1521 }, { "epoch": 0.16903283052351376, "grad_norm": 1.6286131143569946, "learning_rate": 1.689578713968958e-05, "loss": 0.0944, "step": 1524 }, { "epoch": 0.16936557231588287, "grad_norm": 1.314643144607544, "learning_rate": 1.6929046563192907e-05, "loss": 0.0779, "step": 1527 }, { "epoch": 0.169698314108252, "grad_norm": 2.5065033435821533, "learning_rate": 1.696230598669623e-05, "loss": 0.0984, "step": 1530 }, { "epoch": 0.17003105590062112, "grad_norm": 1.3400218486785889, "learning_rate": 1.699556541019956e-05, "loss": 0.1262, "step": 1533 }, { "epoch": 0.17036379769299023, "grad_norm": 1.536643624305725, "learning_rate": 1.7028824833702882e-05, "loss": 0.0955, "step": 1536 }, { "epoch": 0.17069653948535937, "grad_norm": 1.3329426050186157, "learning_rate": 1.706208425720621e-05, "loss": 0.1003, "step": 1539 }, { "epoch": 0.17102928127772848, "grad_norm": 1.4992012977600098, "learning_rate": 1.7095343680709534e-05, "loss": 0.0756, "step": 1542 }, { "epoch": 0.1713620230700976, "grad_norm": 1.1840115785598755, "learning_rate": 1.712860310421286e-05, "loss": 0.0698, "step": 1545 }, { "epoch": 0.17169476486246674, "grad_norm": 1.9921666383743286, "learning_rate": 1.7161862527716185e-05, "loss": 0.1012, "step": 1548 }, { "epoch": 0.17202750665483585, "grad_norm": 1.538561463356018, "learning_rate": 1.7195121951219512e-05, "loss": 0.1055, "step": 1551 }, { "epoch": 0.17236024844720496, "grad_norm": 1.748213529586792, "learning_rate": 1.722838137472284e-05, "loss": 0.0805, "step": 1554 }, { "epoch": 0.1726929902395741, "grad_norm": 1.3871899843215942, "learning_rate": 1.7261640798226164e-05, "loss": 0.0789, "step": 1557 }, { "epoch": 0.1730257320319432, "grad_norm": 0.9328947067260742, "learning_rate": 1.729490022172949e-05, "loss": 0.0854, "step": 1560 }, { "epoch": 0.17335847382431233, "grad_norm": 0.6234099864959717, "learning_rate": 1.732815964523282e-05, "loss": 0.0906, "step": 1563 }, { "epoch": 0.17369121561668147, "grad_norm": 0.8881933689117432, "learning_rate": 1.7361419068736142e-05, "loss": 0.0715, "step": 1566 }, { "epoch": 0.17402395740905058, "grad_norm": 1.136181354522705, "learning_rate": 1.739467849223947e-05, "loss": 0.0776, "step": 1569 }, { "epoch": 0.1743566992014197, "grad_norm": 1.2839716672897339, "learning_rate": 1.7427937915742794e-05, "loss": 0.0974, "step": 1572 }, { "epoch": 0.17468944099378883, "grad_norm": 0.6437132954597473, "learning_rate": 1.746119733924612e-05, "loss": 0.1222, "step": 1575 }, { "epoch": 0.17502218278615794, "grad_norm": 1.5346838235855103, "learning_rate": 1.749445676274945e-05, "loss": 0.1427, "step": 1578 }, { "epoch": 0.17535492457852705, "grad_norm": 0.7953895330429077, "learning_rate": 1.7527716186252772e-05, "loss": 0.1301, "step": 1581 }, { "epoch": 0.1756876663708962, "grad_norm": 0.8569965362548828, "learning_rate": 1.75609756097561e-05, "loss": 0.088, "step": 1584 }, { "epoch": 0.1760204081632653, "grad_norm": 0.7296930551528931, "learning_rate": 1.7594235033259427e-05, "loss": 0.0749, "step": 1587 }, { "epoch": 0.17635314995563442, "grad_norm": 0.9114010334014893, "learning_rate": 1.762749445676275e-05, "loss": 0.0875, "step": 1590 }, { "epoch": 0.17668589174800356, "grad_norm": 0.785427987575531, "learning_rate": 1.766075388026608e-05, "loss": 0.1049, "step": 1593 }, { "epoch": 0.17701863354037267, "grad_norm": 1.1411901712417603, "learning_rate": 1.7694013303769402e-05, "loss": 0.0869, "step": 1596 }, { "epoch": 0.17735137533274178, "grad_norm": 1.332963228225708, "learning_rate": 1.772727272727273e-05, "loss": 0.0772, "step": 1599 }, { "epoch": 0.17768411712511092, "grad_norm": 0.9477018713951111, "learning_rate": 1.7760532150776054e-05, "loss": 0.0469, "step": 1602 }, { "epoch": 0.17801685891748004, "grad_norm": 1.370919942855835, "learning_rate": 1.779379157427938e-05, "loss": 0.0832, "step": 1605 }, { "epoch": 0.17834960070984915, "grad_norm": 0.7993923425674438, "learning_rate": 1.7827050997782705e-05, "loss": 0.032, "step": 1608 }, { "epoch": 0.1786823425022183, "grad_norm": 3.0080225467681885, "learning_rate": 1.7860310421286032e-05, "loss": 0.1381, "step": 1611 }, { "epoch": 0.1790150842945874, "grad_norm": 1.223503589630127, "learning_rate": 1.7893569844789356e-05, "loss": 0.0976, "step": 1614 }, { "epoch": 0.1793478260869565, "grad_norm": 1.5700201988220215, "learning_rate": 1.7926829268292684e-05, "loss": 0.0857, "step": 1617 }, { "epoch": 0.17968056787932565, "grad_norm": 0.9695276021957397, "learning_rate": 1.796008869179601e-05, "loss": 0.1001, "step": 1620 }, { "epoch": 0.18001330967169477, "grad_norm": 1.0046852827072144, "learning_rate": 1.7993348115299335e-05, "loss": 0.1125, "step": 1623 }, { "epoch": 0.18034605146406388, "grad_norm": 0.5635235905647278, "learning_rate": 1.8026607538802662e-05, "loss": 0.0797, "step": 1626 }, { "epoch": 0.18067879325643302, "grad_norm": 2.1704487800598145, "learning_rate": 1.8059866962305986e-05, "loss": 0.0991, "step": 1629 }, { "epoch": 0.18101153504880213, "grad_norm": 0.9466676115989685, "learning_rate": 1.8093126385809314e-05, "loss": 0.0754, "step": 1632 }, { "epoch": 0.18134427684117124, "grad_norm": 0.5863909721374512, "learning_rate": 1.812638580931264e-05, "loss": 0.0526, "step": 1635 }, { "epoch": 0.18167701863354038, "grad_norm": 0.7421858310699463, "learning_rate": 1.8159645232815965e-05, "loss": 0.065, "step": 1638 }, { "epoch": 0.1820097604259095, "grad_norm": 3.3749873638153076, "learning_rate": 1.8192904656319292e-05, "loss": 0.1505, "step": 1641 }, { "epoch": 0.1823425022182786, "grad_norm": 1.2545760869979858, "learning_rate": 1.8226164079822616e-05, "loss": 0.0549, "step": 1644 }, { "epoch": 0.18267524401064775, "grad_norm": 0.8090751767158508, "learning_rate": 1.8259423503325944e-05, "loss": 0.0411, "step": 1647 }, { "epoch": 0.18300798580301686, "grad_norm": 0.6511043906211853, "learning_rate": 1.829268292682927e-05, "loss": 0.0658, "step": 1650 }, { "epoch": 0.18334072759538597, "grad_norm": 0.7397048473358154, "learning_rate": 1.8325942350332595e-05, "loss": 0.0725, "step": 1653 }, { "epoch": 0.1836734693877551, "grad_norm": 1.1153345108032227, "learning_rate": 1.8359201773835922e-05, "loss": 0.0829, "step": 1656 }, { "epoch": 0.18400621118012422, "grad_norm": 1.675535798072815, "learning_rate": 1.839246119733925e-05, "loss": 0.0731, "step": 1659 }, { "epoch": 0.18433895297249334, "grad_norm": 1.0902782678604126, "learning_rate": 1.8425720620842574e-05, "loss": 0.0603, "step": 1662 }, { "epoch": 0.18467169476486248, "grad_norm": 1.1933726072311401, "learning_rate": 1.84589800443459e-05, "loss": 0.0997, "step": 1665 }, { "epoch": 0.1850044365572316, "grad_norm": 1.2702038288116455, "learning_rate": 1.8492239467849225e-05, "loss": 0.0609, "step": 1668 }, { "epoch": 0.1853371783496007, "grad_norm": 1.9480712413787842, "learning_rate": 1.8525498891352552e-05, "loss": 0.0767, "step": 1671 }, { "epoch": 0.18566992014196984, "grad_norm": 0.8623412847518921, "learning_rate": 1.855875831485588e-05, "loss": 0.0754, "step": 1674 }, { "epoch": 0.18600266193433895, "grad_norm": 0.6660126447677612, "learning_rate": 1.8592017738359204e-05, "loss": 0.0601, "step": 1677 }, { "epoch": 0.18633540372670807, "grad_norm": 0.6740169525146484, "learning_rate": 1.862527716186253e-05, "loss": 0.0804, "step": 1680 }, { "epoch": 0.1866681455190772, "grad_norm": 3.831813335418701, "learning_rate": 1.8658536585365855e-05, "loss": 0.0662, "step": 1683 }, { "epoch": 0.18700088731144632, "grad_norm": 0.6091939210891724, "learning_rate": 1.8691796008869182e-05, "loss": 0.0469, "step": 1686 }, { "epoch": 0.18733362910381543, "grad_norm": 0.8407537937164307, "learning_rate": 1.8725055432372506e-05, "loss": 0.065, "step": 1689 }, { "epoch": 0.18766637089618457, "grad_norm": 0.948610246181488, "learning_rate": 1.8758314855875834e-05, "loss": 0.028, "step": 1692 }, { "epoch": 0.18799911268855368, "grad_norm": 1.0049147605895996, "learning_rate": 1.8791574279379158e-05, "loss": 0.0585, "step": 1695 }, { "epoch": 0.1883318544809228, "grad_norm": 0.9311211109161377, "learning_rate": 1.8824833702882485e-05, "loss": 0.0551, "step": 1698 }, { "epoch": 0.18866459627329193, "grad_norm": 1.029343843460083, "learning_rate": 1.885809312638581e-05, "loss": 0.0544, "step": 1701 }, { "epoch": 0.18899733806566105, "grad_norm": 2.7237510681152344, "learning_rate": 1.8891352549889136e-05, "loss": 0.1068, "step": 1704 }, { "epoch": 0.18933007985803016, "grad_norm": 1.5521245002746582, "learning_rate": 1.8924611973392464e-05, "loss": 0.1054, "step": 1707 }, { "epoch": 0.1896628216503993, "grad_norm": 2.9119269847869873, "learning_rate": 1.8957871396895788e-05, "loss": 0.0929, "step": 1710 }, { "epoch": 0.1899955634427684, "grad_norm": 1.5868738889694214, "learning_rate": 1.8991130820399115e-05, "loss": 0.0855, "step": 1713 }, { "epoch": 0.19032830523513752, "grad_norm": 1.1308763027191162, "learning_rate": 1.902439024390244e-05, "loss": 0.0685, "step": 1716 }, { "epoch": 0.19066104702750666, "grad_norm": 0.8427382707595825, "learning_rate": 1.9057649667405766e-05, "loss": 0.0877, "step": 1719 }, { "epoch": 0.19099378881987578, "grad_norm": 0.7581518292427063, "learning_rate": 1.9090909090909094e-05, "loss": 0.0543, "step": 1722 }, { "epoch": 0.1913265306122449, "grad_norm": 0.7596389055252075, "learning_rate": 1.9124168514412418e-05, "loss": 0.0755, "step": 1725 }, { "epoch": 0.19165927240461403, "grad_norm": 0.393960565328598, "learning_rate": 1.9157427937915745e-05, "loss": 0.0628, "step": 1728 }, { "epoch": 0.19199201419698314, "grad_norm": 1.003400206565857, "learning_rate": 1.9190687361419072e-05, "loss": 0.0615, "step": 1731 }, { "epoch": 0.19232475598935225, "grad_norm": 0.43695637583732605, "learning_rate": 1.9223946784922396e-05, "loss": 0.0313, "step": 1734 }, { "epoch": 0.1926574977817214, "grad_norm": 1.1501188278198242, "learning_rate": 1.9257206208425724e-05, "loss": 0.0533, "step": 1737 }, { "epoch": 0.1929902395740905, "grad_norm": 1.1870830059051514, "learning_rate": 1.9290465631929047e-05, "loss": 0.0386, "step": 1740 }, { "epoch": 0.19332298136645962, "grad_norm": 0.3313629627227783, "learning_rate": 1.9323725055432375e-05, "loss": 0.0466, "step": 1743 }, { "epoch": 0.19365572315882876, "grad_norm": 1.685494303703308, "learning_rate": 1.9356984478935702e-05, "loss": 0.0776, "step": 1746 }, { "epoch": 0.19398846495119787, "grad_norm": 0.9488961696624756, "learning_rate": 1.9390243902439026e-05, "loss": 0.047, "step": 1749 }, { "epoch": 0.19432120674356698, "grad_norm": 1.6315311193466187, "learning_rate": 1.9423503325942354e-05, "loss": 0.0396, "step": 1752 }, { "epoch": 0.19465394853593612, "grad_norm": 0.9489657878875732, "learning_rate": 1.9456762749445677e-05, "loss": 0.0554, "step": 1755 }, { "epoch": 0.19498669032830523, "grad_norm": 0.8608976602554321, "learning_rate": 1.9490022172949005e-05, "loss": 0.0392, "step": 1758 }, { "epoch": 0.19531943212067435, "grad_norm": 0.7633286714553833, "learning_rate": 1.952328159645233e-05, "loss": 0.0718, "step": 1761 }, { "epoch": 0.1956521739130435, "grad_norm": 0.48051804304122925, "learning_rate": 1.9556541019955656e-05, "loss": 0.0482, "step": 1764 }, { "epoch": 0.1959849157054126, "grad_norm": 1.0360554456710815, "learning_rate": 1.958980044345898e-05, "loss": 0.1043, "step": 1767 }, { "epoch": 0.1963176574977817, "grad_norm": 0.5213366150856018, "learning_rate": 1.9623059866962307e-05, "loss": 0.0587, "step": 1770 }, { "epoch": 0.19665039929015085, "grad_norm": 0.5328545570373535, "learning_rate": 1.965631929046563e-05, "loss": 0.0418, "step": 1773 }, { "epoch": 0.19698314108251996, "grad_norm": 1.0951993465423584, "learning_rate": 1.968957871396896e-05, "loss": 0.1039, "step": 1776 }, { "epoch": 0.19731588287488908, "grad_norm": 2.154322385787964, "learning_rate": 1.9722838137472283e-05, "loss": 0.0927, "step": 1779 }, { "epoch": 0.19764862466725822, "grad_norm": 1.0912052392959595, "learning_rate": 1.975609756097561e-05, "loss": 0.0734, "step": 1782 }, { "epoch": 0.19798136645962733, "grad_norm": 1.93555748462677, "learning_rate": 1.9789356984478937e-05, "loss": 0.0801, "step": 1785 }, { "epoch": 0.19831410825199644, "grad_norm": 0.6059892177581787, "learning_rate": 1.982261640798226e-05, "loss": 0.066, "step": 1788 }, { "epoch": 0.19864685004436558, "grad_norm": 0.759278416633606, "learning_rate": 1.985587583148559e-05, "loss": 0.0338, "step": 1791 }, { "epoch": 0.1989795918367347, "grad_norm": 1.9972269535064697, "learning_rate": 1.9889135254988916e-05, "loss": 0.0838, "step": 1794 }, { "epoch": 0.1993123336291038, "grad_norm": 1.591235876083374, "learning_rate": 1.992239467849224e-05, "loss": 0.0679, "step": 1797 }, { "epoch": 0.19964507542147295, "grad_norm": 1.204803466796875, "learning_rate": 1.9955654101995567e-05, "loss": 0.0814, "step": 1800 }, { "epoch": 0.19997781721384206, "grad_norm": 0.8379703760147095, "learning_rate": 1.998891352549889e-05, "loss": 0.0821, "step": 1803 }, { "epoch": 0.20031055900621117, "grad_norm": 1.0270556211471558, "learning_rate": 1.9997535124476217e-05, "loss": 0.1211, "step": 1806 }, { "epoch": 0.2006433007985803, "grad_norm": 1.1367464065551758, "learning_rate": 1.9993837811190537e-05, "loss": 0.0808, "step": 1809 }, { "epoch": 0.20097604259094942, "grad_norm": 1.2980434894561768, "learning_rate": 1.9990140497904858e-05, "loss": 0.0774, "step": 1812 }, { "epoch": 0.20130878438331853, "grad_norm": 0.8563756346702576, "learning_rate": 1.998644318461918e-05, "loss": 0.0552, "step": 1815 }, { "epoch": 0.20164152617568767, "grad_norm": 1.5977102518081665, "learning_rate": 1.99827458713335e-05, "loss": 0.0762, "step": 1818 }, { "epoch": 0.2019742679680568, "grad_norm": 0.5069833993911743, "learning_rate": 1.997904855804782e-05, "loss": 0.0317, "step": 1821 }, { "epoch": 0.2023070097604259, "grad_norm": 1.0647510290145874, "learning_rate": 1.997535124476214e-05, "loss": 0.0816, "step": 1824 }, { "epoch": 0.20263975155279504, "grad_norm": 0.6669637560844421, "learning_rate": 1.9971653931476462e-05, "loss": 0.0368, "step": 1827 }, { "epoch": 0.20297249334516415, "grad_norm": 1.4038233757019043, "learning_rate": 1.9967956618190783e-05, "loss": 0.0907, "step": 1830 }, { "epoch": 0.20330523513753326, "grad_norm": 1.0725094079971313, "learning_rate": 1.9964259304905104e-05, "loss": 0.0941, "step": 1833 }, { "epoch": 0.2036379769299024, "grad_norm": 2.3538780212402344, "learning_rate": 1.9960561991619425e-05, "loss": 0.1091, "step": 1836 }, { "epoch": 0.20397071872227152, "grad_norm": 0.8055842518806458, "learning_rate": 1.9956864678333746e-05, "loss": 0.086, "step": 1839 }, { "epoch": 0.20430346051464063, "grad_norm": 1.5095528364181519, "learning_rate": 1.9953167365048066e-05, "loss": 0.0651, "step": 1842 }, { "epoch": 0.20463620230700977, "grad_norm": 0.8168131113052368, "learning_rate": 1.9949470051762387e-05, "loss": 0.078, "step": 1845 }, { "epoch": 0.20496894409937888, "grad_norm": 0.6577603816986084, "learning_rate": 1.9945772738476708e-05, "loss": 0.0769, "step": 1848 }, { "epoch": 0.205301685891748, "grad_norm": 0.5910478830337524, "learning_rate": 1.994207542519103e-05, "loss": 0.0866, "step": 1851 }, { "epoch": 0.20563442768411713, "grad_norm": 0.5141116976737976, "learning_rate": 1.993837811190535e-05, "loss": 0.0917, "step": 1854 }, { "epoch": 0.20596716947648624, "grad_norm": 1.8222280740737915, "learning_rate": 1.993468079861967e-05, "loss": 0.1144, "step": 1857 }, { "epoch": 0.20629991126885536, "grad_norm": 0.9868729710578918, "learning_rate": 1.993098348533399e-05, "loss": 0.0656, "step": 1860 }, { "epoch": 0.2066326530612245, "grad_norm": 1.4309096336364746, "learning_rate": 1.9927286172048312e-05, "loss": 0.0549, "step": 1863 }, { "epoch": 0.2069653948535936, "grad_norm": 1.047324299812317, "learning_rate": 1.9923588858762633e-05, "loss": 0.0604, "step": 1866 }, { "epoch": 0.20729813664596272, "grad_norm": 0.7866485118865967, "learning_rate": 1.9919891545476954e-05, "loss": 0.0583, "step": 1869 }, { "epoch": 0.20763087843833186, "grad_norm": 1.5920039415359497, "learning_rate": 1.9916194232191275e-05, "loss": 0.0918, "step": 1872 }, { "epoch": 0.20796362023070097, "grad_norm": 1.070116400718689, "learning_rate": 1.99124969189056e-05, "loss": 0.0724, "step": 1875 }, { "epoch": 0.2082963620230701, "grad_norm": 0.5457541346549988, "learning_rate": 1.9908799605619916e-05, "loss": 0.0524, "step": 1878 }, { "epoch": 0.20862910381543923, "grad_norm": 1.581435203552246, "learning_rate": 1.990510229233424e-05, "loss": 0.0995, "step": 1881 }, { "epoch": 0.20896184560780834, "grad_norm": 0.7186698317527771, "learning_rate": 1.990140497904856e-05, "loss": 0.0951, "step": 1884 }, { "epoch": 0.20929458740017745, "grad_norm": 0.8162978887557983, "learning_rate": 1.989770766576288e-05, "loss": 0.0448, "step": 1887 }, { "epoch": 0.2096273291925466, "grad_norm": 0.6919522285461426, "learning_rate": 1.9894010352477203e-05, "loss": 0.0563, "step": 1890 }, { "epoch": 0.2099600709849157, "grad_norm": 1.3932820558547974, "learning_rate": 1.9890313039191524e-05, "loss": 0.0773, "step": 1893 }, { "epoch": 0.21029281277728482, "grad_norm": 0.5151865482330322, "learning_rate": 1.988661572590584e-05, "loss": 0.0421, "step": 1896 }, { "epoch": 0.21062555456965396, "grad_norm": 2.778536558151245, "learning_rate": 1.9882918412620165e-05, "loss": 0.1315, "step": 1899 }, { "epoch": 0.21095829636202307, "grad_norm": 1.1439509391784668, "learning_rate": 1.9879221099334486e-05, "loss": 0.0615, "step": 1902 }, { "epoch": 0.21129103815439218, "grad_norm": 0.7477930188179016, "learning_rate": 1.9875523786048807e-05, "loss": 0.0738, "step": 1905 }, { "epoch": 0.21162377994676132, "grad_norm": 0.5724446177482605, "learning_rate": 1.9871826472763128e-05, "loss": 0.077, "step": 1908 }, { "epoch": 0.21195652173913043, "grad_norm": 1.0049103498458862, "learning_rate": 1.986812915947745e-05, "loss": 0.0798, "step": 1911 }, { "epoch": 0.21228926353149954, "grad_norm": 0.3736647963523865, "learning_rate": 1.986443184619177e-05, "loss": 0.0638, "step": 1914 }, { "epoch": 0.21262200532386868, "grad_norm": 0.8310322165489197, "learning_rate": 1.986073453290609e-05, "loss": 0.0681, "step": 1917 }, { "epoch": 0.2129547471162378, "grad_norm": 0.6784893274307251, "learning_rate": 1.985703721962041e-05, "loss": 0.053, "step": 1920 }, { "epoch": 0.2132874889086069, "grad_norm": 0.5100345611572266, "learning_rate": 1.9853339906334732e-05, "loss": 0.0623, "step": 1923 }, { "epoch": 0.21362023070097605, "grad_norm": 0.5457683801651001, "learning_rate": 1.9849642593049053e-05, "loss": 0.0524, "step": 1926 }, { "epoch": 0.21395297249334516, "grad_norm": 0.6275314092636108, "learning_rate": 1.9845945279763374e-05, "loss": 0.0964, "step": 1929 }, { "epoch": 0.21428571428571427, "grad_norm": 1.0508310794830322, "learning_rate": 1.9842247966477694e-05, "loss": 0.064, "step": 1932 }, { "epoch": 0.2146184560780834, "grad_norm": 0.8478765487670898, "learning_rate": 1.9838550653192015e-05, "loss": 0.0377, "step": 1935 }, { "epoch": 0.21495119787045253, "grad_norm": 0.6189776062965393, "learning_rate": 1.9834853339906336e-05, "loss": 0.0871, "step": 1938 }, { "epoch": 0.21528393966282164, "grad_norm": 0.7202403545379639, "learning_rate": 1.9831156026620657e-05, "loss": 0.0783, "step": 1941 }, { "epoch": 0.21561668145519078, "grad_norm": 0.8559534549713135, "learning_rate": 1.9827458713334978e-05, "loss": 0.0611, "step": 1944 }, { "epoch": 0.2159494232475599, "grad_norm": 0.46736812591552734, "learning_rate": 1.98237614000493e-05, "loss": 0.0571, "step": 1947 }, { "epoch": 0.216282165039929, "grad_norm": 0.8284899592399597, "learning_rate": 1.982006408676362e-05, "loss": 0.1107, "step": 1950 }, { "epoch": 0.21661490683229814, "grad_norm": 0.8727707862854004, "learning_rate": 1.981636677347794e-05, "loss": 0.0592, "step": 1953 }, { "epoch": 0.21694764862466726, "grad_norm": 1.2675083875656128, "learning_rate": 1.981266946019226e-05, "loss": 0.095, "step": 1956 }, { "epoch": 0.21728039041703637, "grad_norm": 0.829780638217926, "learning_rate": 1.9808972146906582e-05, "loss": 0.051, "step": 1959 }, { "epoch": 0.2176131322094055, "grad_norm": 1.0676740407943726, "learning_rate": 1.9805274833620906e-05, "loss": 0.111, "step": 1962 }, { "epoch": 0.21794587400177462, "grad_norm": 0.2878391444683075, "learning_rate": 1.9801577520335224e-05, "loss": 0.0467, "step": 1965 }, { "epoch": 0.21827861579414373, "grad_norm": 0.8337568640708923, "learning_rate": 1.9797880207049544e-05, "loss": 0.075, "step": 1968 }, { "epoch": 0.21861135758651287, "grad_norm": 0.6597438454627991, "learning_rate": 1.979418289376387e-05, "loss": 0.0523, "step": 1971 }, { "epoch": 0.21894409937888198, "grad_norm": 0.9186668992042542, "learning_rate": 1.9790485580478186e-05, "loss": 0.0767, "step": 1974 }, { "epoch": 0.2192768411712511, "grad_norm": 0.9256647229194641, "learning_rate": 1.978678826719251e-05, "loss": 0.0603, "step": 1977 }, { "epoch": 0.21960958296362024, "grad_norm": 0.8006538152694702, "learning_rate": 1.978309095390683e-05, "loss": 0.089, "step": 1980 }, { "epoch": 0.21994232475598935, "grad_norm": 0.5475347638130188, "learning_rate": 1.977939364062115e-05, "loss": 0.0548, "step": 1983 }, { "epoch": 0.22027506654835846, "grad_norm": 1.42274808883667, "learning_rate": 1.9775696327335473e-05, "loss": 0.0898, "step": 1986 }, { "epoch": 0.2206078083407276, "grad_norm": 1.091268539428711, "learning_rate": 1.9771999014049793e-05, "loss": 0.0358, "step": 1989 }, { "epoch": 0.2209405501330967, "grad_norm": 1.13118577003479, "learning_rate": 1.976830170076411e-05, "loss": 0.0939, "step": 1992 }, { "epoch": 0.22127329192546583, "grad_norm": 0.8882220983505249, "learning_rate": 1.9764604387478435e-05, "loss": 0.0706, "step": 1995 }, { "epoch": 0.22160603371783497, "grad_norm": 0.8605800271034241, "learning_rate": 1.9760907074192756e-05, "loss": 0.0567, "step": 1998 }, { "epoch": 0.22193877551020408, "grad_norm": 0.5770094990730286, "learning_rate": 1.9757209760907077e-05, "loss": 0.0753, "step": 2001 }, { "epoch": 0.2222715173025732, "grad_norm": 0.48515406250953674, "learning_rate": 1.9753512447621398e-05, "loss": 0.0547, "step": 2004 }, { "epoch": 0.22260425909494233, "grad_norm": 0.6738666296005249, "learning_rate": 1.974981513433572e-05, "loss": 0.0563, "step": 2007 }, { "epoch": 0.22293700088731144, "grad_norm": 1.349905252456665, "learning_rate": 1.974611782105004e-05, "loss": 0.0501, "step": 2010 }, { "epoch": 0.22326974267968056, "grad_norm": 1.3771185874938965, "learning_rate": 1.974242050776436e-05, "loss": 0.0862, "step": 2013 }, { "epoch": 0.2236024844720497, "grad_norm": 0.9702403545379639, "learning_rate": 1.973872319447868e-05, "loss": 0.0822, "step": 2016 }, { "epoch": 0.2239352262644188, "grad_norm": 2.028946876525879, "learning_rate": 1.9735025881193e-05, "loss": 0.0978, "step": 2019 }, { "epoch": 0.22426796805678792, "grad_norm": 0.6262312531471252, "learning_rate": 1.9731328567907322e-05, "loss": 0.0529, "step": 2022 }, { "epoch": 0.22460070984915706, "grad_norm": 0.4341925084590912, "learning_rate": 1.9727631254621643e-05, "loss": 0.0437, "step": 2025 }, { "epoch": 0.22493345164152617, "grad_norm": 1.4538863897323608, "learning_rate": 1.9723933941335964e-05, "loss": 0.0946, "step": 2028 }, { "epoch": 0.22526619343389528, "grad_norm": 0.456615149974823, "learning_rate": 1.9720236628050285e-05, "loss": 0.0458, "step": 2031 }, { "epoch": 0.22559893522626442, "grad_norm": 1.0852501392364502, "learning_rate": 1.9716539314764606e-05, "loss": 0.0788, "step": 2034 }, { "epoch": 0.22593167701863354, "grad_norm": 0.8469471335411072, "learning_rate": 1.9712842001478927e-05, "loss": 0.0818, "step": 2037 }, { "epoch": 0.22626441881100265, "grad_norm": 0.9655031561851501, "learning_rate": 1.9709144688193247e-05, "loss": 0.0311, "step": 2040 }, { "epoch": 0.2265971606033718, "grad_norm": 0.6236404180526733, "learning_rate": 1.9705447374907568e-05, "loss": 0.0589, "step": 2043 }, { "epoch": 0.2269299023957409, "grad_norm": 0.357272207736969, "learning_rate": 1.970175006162189e-05, "loss": 0.0928, "step": 2046 }, { "epoch": 0.22726264418811, "grad_norm": 0.9189961552619934, "learning_rate": 1.9698052748336213e-05, "loss": 0.0635, "step": 2049 }, { "epoch": 0.22759538598047915, "grad_norm": 0.3791709840297699, "learning_rate": 1.969435543505053e-05, "loss": 0.0379, "step": 2052 }, { "epoch": 0.22792812777284827, "grad_norm": 1.2580230236053467, "learning_rate": 1.969065812176485e-05, "loss": 0.081, "step": 2055 }, { "epoch": 0.22826086956521738, "grad_norm": 0.6685125827789307, "learning_rate": 1.9686960808479176e-05, "loss": 0.0756, "step": 2058 }, { "epoch": 0.22859361135758652, "grad_norm": 0.5911781191825867, "learning_rate": 1.9683263495193493e-05, "loss": 0.0894, "step": 2061 }, { "epoch": 0.22892635314995563, "grad_norm": 0.8878726363182068, "learning_rate": 1.9679566181907814e-05, "loss": 0.1069, "step": 2064 }, { "epoch": 0.22925909494232474, "grad_norm": 1.0746994018554688, "learning_rate": 1.9675868868622138e-05, "loss": 0.0794, "step": 2067 }, { "epoch": 0.22959183673469388, "grad_norm": 0.881909191608429, "learning_rate": 1.9672171555336456e-05, "loss": 0.0639, "step": 2070 }, { "epoch": 0.229924578527063, "grad_norm": 0.4917859137058258, "learning_rate": 1.966847424205078e-05, "loss": 0.0617, "step": 2073 }, { "epoch": 0.2302573203194321, "grad_norm": 0.7469626069068909, "learning_rate": 1.96647769287651e-05, "loss": 0.0415, "step": 2076 }, { "epoch": 0.23059006211180125, "grad_norm": 0.7138136029243469, "learning_rate": 1.9661079615479418e-05, "loss": 0.0663, "step": 2079 }, { "epoch": 0.23092280390417036, "grad_norm": 1.2402080297470093, "learning_rate": 1.9657382302193742e-05, "loss": 0.0645, "step": 2082 }, { "epoch": 0.23125554569653947, "grad_norm": 0.8617576956748962, "learning_rate": 1.9653684988908063e-05, "loss": 0.0551, "step": 2085 }, { "epoch": 0.2315882874889086, "grad_norm": 1.2075835466384888, "learning_rate": 1.964998767562238e-05, "loss": 0.0551, "step": 2088 }, { "epoch": 0.23192102928127772, "grad_norm": 0.5005934238433838, "learning_rate": 1.9646290362336705e-05, "loss": 0.0654, "step": 2091 }, { "epoch": 0.23225377107364684, "grad_norm": 1.3452153205871582, "learning_rate": 1.9642593049051026e-05, "loss": 0.0779, "step": 2094 }, { "epoch": 0.23258651286601598, "grad_norm": 0.6829342842102051, "learning_rate": 1.9638895735765346e-05, "loss": 0.0292, "step": 2097 }, { "epoch": 0.2329192546583851, "grad_norm": 0.4742944836616516, "learning_rate": 1.9635198422479667e-05, "loss": 0.0584, "step": 2100 }, { "epoch": 0.2332519964507542, "grad_norm": 1.1028013229370117, "learning_rate": 1.9631501109193988e-05, "loss": 0.0557, "step": 2103 }, { "epoch": 0.23358473824312334, "grad_norm": 1.3418288230895996, "learning_rate": 1.962780379590831e-05, "loss": 0.0768, "step": 2106 }, { "epoch": 0.23391748003549245, "grad_norm": 0.6676630973815918, "learning_rate": 1.962410648262263e-05, "loss": 0.048, "step": 2109 }, { "epoch": 0.23425022182786157, "grad_norm": 0.5586355328559875, "learning_rate": 1.962040916933695e-05, "loss": 0.0261, "step": 2112 }, { "epoch": 0.2345829636202307, "grad_norm": 1.5230592489242554, "learning_rate": 1.961671185605127e-05, "loss": 0.0435, "step": 2115 }, { "epoch": 0.23491570541259982, "grad_norm": 0.8380368947982788, "learning_rate": 1.9613014542765592e-05, "loss": 0.0854, "step": 2118 }, { "epoch": 0.23524844720496896, "grad_norm": 0.8411734700202942, "learning_rate": 1.9609317229479913e-05, "loss": 0.0737, "step": 2121 }, { "epoch": 0.23558118899733807, "grad_norm": 1.0250523090362549, "learning_rate": 1.9605619916194234e-05, "loss": 0.066, "step": 2124 }, { "epoch": 0.23591393078970718, "grad_norm": 0.9919677376747131, "learning_rate": 1.9601922602908555e-05, "loss": 0.0805, "step": 2127 }, { "epoch": 0.23624667258207632, "grad_norm": 0.7227186560630798, "learning_rate": 1.9598225289622875e-05, "loss": 0.0588, "step": 2130 }, { "epoch": 0.23657941437444543, "grad_norm": 1.0386593341827393, "learning_rate": 1.9594527976337196e-05, "loss": 0.1065, "step": 2133 }, { "epoch": 0.23691215616681455, "grad_norm": 0.4986654222011566, "learning_rate": 1.9590830663051517e-05, "loss": 0.0599, "step": 2136 }, { "epoch": 0.2372448979591837, "grad_norm": 0.7021065950393677, "learning_rate": 1.9587133349765838e-05, "loss": 0.0517, "step": 2139 }, { "epoch": 0.2375776397515528, "grad_norm": 0.5577269792556763, "learning_rate": 1.958343603648016e-05, "loss": 0.0638, "step": 2142 }, { "epoch": 0.2379103815439219, "grad_norm": 0.9565064907073975, "learning_rate": 1.9579738723194483e-05, "loss": 0.0864, "step": 2145 }, { "epoch": 0.23824312333629105, "grad_norm": 0.5262056589126587, "learning_rate": 1.95760414099088e-05, "loss": 0.0488, "step": 2148 }, { "epoch": 0.23857586512866016, "grad_norm": 0.8435450196266174, "learning_rate": 1.957234409662312e-05, "loss": 0.0512, "step": 2151 }, { "epoch": 0.23890860692102928, "grad_norm": 0.9660298824310303, "learning_rate": 1.9568646783337445e-05, "loss": 0.0464, "step": 2154 }, { "epoch": 0.23924134871339842, "grad_norm": 1.0983567237854004, "learning_rate": 1.9564949470051763e-05, "loss": 0.0325, "step": 2157 }, { "epoch": 0.23957409050576753, "grad_norm": 2.104891777038574, "learning_rate": 1.9561252156766084e-05, "loss": 0.1256, "step": 2160 }, { "epoch": 0.23990683229813664, "grad_norm": 1.3818904161453247, "learning_rate": 1.9557554843480408e-05, "loss": 0.0823, "step": 2163 }, { "epoch": 0.24023957409050578, "grad_norm": 1.2880382537841797, "learning_rate": 1.9553857530194725e-05, "loss": 0.059, "step": 2166 }, { "epoch": 0.2405723158828749, "grad_norm": 1.1482425928115845, "learning_rate": 1.955016021690905e-05, "loss": 0.046, "step": 2169 }, { "epoch": 0.240905057675244, "grad_norm": 0.20881745219230652, "learning_rate": 1.954646290362337e-05, "loss": 0.0373, "step": 2172 }, { "epoch": 0.24123779946761315, "grad_norm": 0.2179490476846695, "learning_rate": 1.9542765590337688e-05, "loss": 0.0572, "step": 2175 }, { "epoch": 0.24157054125998226, "grad_norm": 1.2042059898376465, "learning_rate": 1.9539068277052012e-05, "loss": 0.063, "step": 2178 }, { "epoch": 0.24190328305235137, "grad_norm": 0.6907942295074463, "learning_rate": 1.9535370963766333e-05, "loss": 0.0748, "step": 2181 }, { "epoch": 0.2422360248447205, "grad_norm": 0.8266691565513611, "learning_rate": 1.953167365048065e-05, "loss": 0.0425, "step": 2184 }, { "epoch": 0.24256876663708962, "grad_norm": 0.8956523537635803, "learning_rate": 1.9527976337194974e-05, "loss": 0.0429, "step": 2187 }, { "epoch": 0.24290150842945873, "grad_norm": 0.9546002149581909, "learning_rate": 1.9524279023909295e-05, "loss": 0.109, "step": 2190 }, { "epoch": 0.24323425022182787, "grad_norm": 0.9478087425231934, "learning_rate": 1.9520581710623616e-05, "loss": 0.0498, "step": 2193 }, { "epoch": 0.243566992014197, "grad_norm": 0.7246885895729065, "learning_rate": 1.9516884397337937e-05, "loss": 0.0297, "step": 2196 }, { "epoch": 0.2438997338065661, "grad_norm": 0.21815499663352966, "learning_rate": 1.9513187084052258e-05, "loss": 0.0484, "step": 2199 }, { "epoch": 0.24423247559893524, "grad_norm": 0.7129120230674744, "learning_rate": 1.950948977076658e-05, "loss": 0.0365, "step": 2202 }, { "epoch": 0.24456521739130435, "grad_norm": 0.5119603276252747, "learning_rate": 1.95057924574809e-05, "loss": 0.0745, "step": 2205 }, { "epoch": 0.24489795918367346, "grad_norm": 0.4234178066253662, "learning_rate": 1.950209514419522e-05, "loss": 0.0394, "step": 2208 }, { "epoch": 0.2452307009760426, "grad_norm": 0.772867739200592, "learning_rate": 1.949839783090954e-05, "loss": 0.0597, "step": 2211 }, { "epoch": 0.24556344276841172, "grad_norm": 0.8054187893867493, "learning_rate": 1.9494700517623862e-05, "loss": 0.0603, "step": 2214 }, { "epoch": 0.24589618456078083, "grad_norm": 0.7277831435203552, "learning_rate": 1.9491003204338183e-05, "loss": 0.0632, "step": 2217 }, { "epoch": 0.24622892635314997, "grad_norm": 0.741657018661499, "learning_rate": 1.9487305891052503e-05, "loss": 0.055, "step": 2220 }, { "epoch": 0.24656166814551908, "grad_norm": 0.8727126717567444, "learning_rate": 1.9483608577766824e-05, "loss": 0.0861, "step": 2223 }, { "epoch": 0.2468944099378882, "grad_norm": 0.9698671102523804, "learning_rate": 1.9479911264481145e-05, "loss": 0.0626, "step": 2226 }, { "epoch": 0.24722715173025733, "grad_norm": 0.6069017052650452, "learning_rate": 1.9476213951195466e-05, "loss": 0.0684, "step": 2229 }, { "epoch": 0.24755989352262645, "grad_norm": 0.5673190951347351, "learning_rate": 1.9472516637909787e-05, "loss": 0.0787, "step": 2232 }, { "epoch": 0.24789263531499556, "grad_norm": 0.6465863585472107, "learning_rate": 1.9468819324624107e-05, "loss": 0.0456, "step": 2235 }, { "epoch": 0.2482253771073647, "grad_norm": 1.3087613582611084, "learning_rate": 1.946512201133843e-05, "loss": 0.0583, "step": 2238 }, { "epoch": 0.2485581188997338, "grad_norm": 0.5403178930282593, "learning_rate": 1.9461424698052753e-05, "loss": 0.0622, "step": 2241 }, { "epoch": 0.24889086069210292, "grad_norm": 0.40616196393966675, "learning_rate": 1.945772738476707e-05, "loss": 0.0633, "step": 2244 }, { "epoch": 0.24922360248447206, "grad_norm": 0.8137437105178833, "learning_rate": 1.945403007148139e-05, "loss": 0.0538, "step": 2247 }, { "epoch": 0.24955634427684117, "grad_norm": 1.3155440092086792, "learning_rate": 1.9450332758195715e-05, "loss": 0.0445, "step": 2250 }, { "epoch": 0.2498890860692103, "grad_norm": 0.9610825777053833, "learning_rate": 1.9446635444910032e-05, "loss": 0.0537, "step": 2253 }, { "epoch": 0.2502218278615794, "grad_norm": 0.7824046015739441, "learning_rate": 1.9442938131624353e-05, "loss": 0.064, "step": 2256 }, { "epoch": 0.2505545696539485, "grad_norm": 0.8565216064453125, "learning_rate": 1.9439240818338677e-05, "loss": 0.0832, "step": 2259 }, { "epoch": 0.25088731144631765, "grad_norm": 0.6441604495048523, "learning_rate": 1.9435543505052995e-05, "loss": 0.0419, "step": 2262 }, { "epoch": 0.2512200532386868, "grad_norm": 0.7871369123458862, "learning_rate": 1.9431846191767316e-05, "loss": 0.0796, "step": 2265 }, { "epoch": 0.2515527950310559, "grad_norm": 0.7528088688850403, "learning_rate": 1.942814887848164e-05, "loss": 0.0392, "step": 2268 }, { "epoch": 0.251885536823425, "grad_norm": 0.6693938970565796, "learning_rate": 1.9424451565195957e-05, "loss": 0.052, "step": 2271 }, { "epoch": 0.25221827861579416, "grad_norm": 1.5403079986572266, "learning_rate": 1.942075425191028e-05, "loss": 0.0541, "step": 2274 }, { "epoch": 0.25255102040816324, "grad_norm": 0.6135419607162476, "learning_rate": 1.9417056938624602e-05, "loss": 0.0552, "step": 2277 }, { "epoch": 0.2528837622005324, "grad_norm": 1.380359172821045, "learning_rate": 1.941335962533892e-05, "loss": 0.0836, "step": 2280 }, { "epoch": 0.2532165039929015, "grad_norm": 0.9621865153312683, "learning_rate": 1.9409662312053244e-05, "loss": 0.0794, "step": 2283 }, { "epoch": 0.2535492457852706, "grad_norm": 0.798099935054779, "learning_rate": 1.9405964998767565e-05, "loss": 0.0407, "step": 2286 }, { "epoch": 0.25388198757763975, "grad_norm": 0.8459501266479492, "learning_rate": 1.9402267685481882e-05, "loss": 0.0438, "step": 2289 }, { "epoch": 0.2542147293700089, "grad_norm": 0.5273587107658386, "learning_rate": 1.9398570372196206e-05, "loss": 0.097, "step": 2292 }, { "epoch": 0.25454747116237797, "grad_norm": 0.7755928635597229, "learning_rate": 1.9394873058910527e-05, "loss": 0.1102, "step": 2295 }, { "epoch": 0.2548802129547471, "grad_norm": 0.6923895478248596, "learning_rate": 1.9391175745624848e-05, "loss": 0.0709, "step": 2298 }, { "epoch": 0.25521295474711625, "grad_norm": 0.5582640767097473, "learning_rate": 1.938747843233917e-05, "loss": 0.0336, "step": 2301 }, { "epoch": 0.25554569653948533, "grad_norm": 0.74805748462677, "learning_rate": 1.938378111905349e-05, "loss": 0.0515, "step": 2304 }, { "epoch": 0.2558784383318545, "grad_norm": 0.6663379669189453, "learning_rate": 1.938008380576781e-05, "loss": 0.0639, "step": 2307 }, { "epoch": 0.2562111801242236, "grad_norm": 0.8004273772239685, "learning_rate": 1.937638649248213e-05, "loss": 0.0709, "step": 2310 }, { "epoch": 0.2565439219165927, "grad_norm": 1.1386858224868774, "learning_rate": 1.9372689179196452e-05, "loss": 0.0977, "step": 2313 }, { "epoch": 0.25687666370896184, "grad_norm": 0.44749006628990173, "learning_rate": 1.9368991865910773e-05, "loss": 0.0427, "step": 2316 }, { "epoch": 0.257209405501331, "grad_norm": 0.6668152213096619, "learning_rate": 1.9365294552625094e-05, "loss": 0.0487, "step": 2319 }, { "epoch": 0.25754214729370006, "grad_norm": 0.5316388010978699, "learning_rate": 1.9361597239339415e-05, "loss": 0.0418, "step": 2322 }, { "epoch": 0.2578748890860692, "grad_norm": 0.5631162524223328, "learning_rate": 1.9357899926053735e-05, "loss": 0.1031, "step": 2325 }, { "epoch": 0.25820763087843834, "grad_norm": 0.9314045906066895, "learning_rate": 1.9354202612768056e-05, "loss": 0.0459, "step": 2328 }, { "epoch": 0.25854037267080743, "grad_norm": 0.7432307004928589, "learning_rate": 1.9350505299482377e-05, "loss": 0.0824, "step": 2331 }, { "epoch": 0.25887311446317657, "grad_norm": 0.9146548509597778, "learning_rate": 1.9346807986196698e-05, "loss": 0.0656, "step": 2334 }, { "epoch": 0.2592058562555457, "grad_norm": 1.4115939140319824, "learning_rate": 1.934311067291102e-05, "loss": 0.0845, "step": 2337 }, { "epoch": 0.2595385980479148, "grad_norm": 1.4611786603927612, "learning_rate": 1.933941335962534e-05, "loss": 0.0718, "step": 2340 }, { "epoch": 0.25987133984028393, "grad_norm": 0.8505660891532898, "learning_rate": 1.933571604633966e-05, "loss": 0.1012, "step": 2343 }, { "epoch": 0.2602040816326531, "grad_norm": 0.6535142660140991, "learning_rate": 1.9332018733053985e-05, "loss": 0.0646, "step": 2346 }, { "epoch": 0.26053682342502216, "grad_norm": 0.38412073254585266, "learning_rate": 1.9328321419768302e-05, "loss": 0.0395, "step": 2349 }, { "epoch": 0.2608695652173913, "grad_norm": 1.1341195106506348, "learning_rate": 1.9324624106482623e-05, "loss": 0.0686, "step": 2352 }, { "epoch": 0.26120230700976044, "grad_norm": 0.33791735768318176, "learning_rate": 1.9320926793196947e-05, "loss": 0.035, "step": 2355 }, { "epoch": 0.2615350488021295, "grad_norm": 0.7765600085258484, "learning_rate": 1.9317229479911265e-05, "loss": 0.0755, "step": 2358 }, { "epoch": 0.26186779059449866, "grad_norm": 1.1713680028915405, "learning_rate": 1.9313532166625585e-05, "loss": 0.0565, "step": 2361 }, { "epoch": 0.2622005323868678, "grad_norm": 0.7793876528739929, "learning_rate": 1.930983485333991e-05, "loss": 0.0452, "step": 2364 }, { "epoch": 0.2625332741792369, "grad_norm": 1.5912864208221436, "learning_rate": 1.9306137540054227e-05, "loss": 0.0752, "step": 2367 }, { "epoch": 0.262866015971606, "grad_norm": 0.6959377527236938, "learning_rate": 1.930244022676855e-05, "loss": 0.0493, "step": 2370 }, { "epoch": 0.26319875776397517, "grad_norm": 0.7494945526123047, "learning_rate": 1.9298742913482872e-05, "loss": 0.0475, "step": 2373 }, { "epoch": 0.26353149955634425, "grad_norm": 1.1948646306991577, "learning_rate": 1.929504560019719e-05, "loss": 0.0651, "step": 2376 }, { "epoch": 0.2638642413487134, "grad_norm": 0.8735336065292358, "learning_rate": 1.9291348286911514e-05, "loss": 0.0883, "step": 2379 }, { "epoch": 0.26419698314108253, "grad_norm": 0.40724876523017883, "learning_rate": 1.9287650973625834e-05, "loss": 0.0543, "step": 2382 }, { "epoch": 0.2645297249334516, "grad_norm": 0.4705963730812073, "learning_rate": 1.9283953660340152e-05, "loss": 0.0432, "step": 2385 }, { "epoch": 0.26486246672582076, "grad_norm": 0.3115015923976898, "learning_rate": 1.9280256347054476e-05, "loss": 0.0448, "step": 2388 }, { "epoch": 0.2651952085181899, "grad_norm": 0.2946385145187378, "learning_rate": 1.9276559033768797e-05, "loss": 0.0416, "step": 2391 }, { "epoch": 0.265527950310559, "grad_norm": 0.6723032593727112, "learning_rate": 1.9272861720483118e-05, "loss": 0.0593, "step": 2394 }, { "epoch": 0.2658606921029281, "grad_norm": 0.22190292179584503, "learning_rate": 1.926916440719744e-05, "loss": 0.0715, "step": 2397 }, { "epoch": 0.26619343389529726, "grad_norm": 0.567344605922699, "learning_rate": 1.926546709391176e-05, "loss": 0.0976, "step": 2400 }, { "epoch": 0.26652617568766634, "grad_norm": 1.2799891233444214, "learning_rate": 1.926176978062608e-05, "loss": 0.0418, "step": 2403 }, { "epoch": 0.2668589174800355, "grad_norm": 0.9109591245651245, "learning_rate": 1.92580724673404e-05, "loss": 0.052, "step": 2406 }, { "epoch": 0.2671916592724046, "grad_norm": 0.8677592873573303, "learning_rate": 1.9254375154054722e-05, "loss": 0.082, "step": 2409 }, { "epoch": 0.2675244010647737, "grad_norm": 1.6945568323135376, "learning_rate": 1.9250677840769043e-05, "loss": 0.0554, "step": 2412 }, { "epoch": 0.26785714285714285, "grad_norm": 0.5041748285293579, "learning_rate": 1.9246980527483363e-05, "loss": 0.0607, "step": 2415 }, { "epoch": 0.268189884649512, "grad_norm": 0.8741277456283569, "learning_rate": 1.9243283214197684e-05, "loss": 0.0829, "step": 2418 }, { "epoch": 0.2685226264418811, "grad_norm": 0.7464337944984436, "learning_rate": 1.9239585900912005e-05, "loss": 0.0461, "step": 2421 }, { "epoch": 0.2688553682342502, "grad_norm": 1.0820220708847046, "learning_rate": 1.9235888587626326e-05, "loss": 0.0736, "step": 2424 }, { "epoch": 0.26918811002661935, "grad_norm": 1.4900410175323486, "learning_rate": 1.9232191274340647e-05, "loss": 0.1042, "step": 2427 }, { "epoch": 0.26952085181898844, "grad_norm": 0.627911388874054, "learning_rate": 1.9228493961054968e-05, "loss": 0.0684, "step": 2430 }, { "epoch": 0.2698535936113576, "grad_norm": 0.5567779541015625, "learning_rate": 1.922479664776929e-05, "loss": 0.052, "step": 2433 }, { "epoch": 0.2701863354037267, "grad_norm": 0.9577146768569946, "learning_rate": 1.922109933448361e-05, "loss": 0.0537, "step": 2436 }, { "epoch": 0.2705190771960958, "grad_norm": 0.7689815163612366, "learning_rate": 1.921740202119793e-05, "loss": 0.0647, "step": 2439 }, { "epoch": 0.27085181898846494, "grad_norm": 0.9823862314224243, "learning_rate": 1.9213704707912254e-05, "loss": 0.0862, "step": 2442 }, { "epoch": 0.2711845607808341, "grad_norm": 0.8488829135894775, "learning_rate": 1.921000739462657e-05, "loss": 0.0521, "step": 2445 }, { "epoch": 0.27151730257320317, "grad_norm": 0.7293717265129089, "learning_rate": 1.9206310081340893e-05, "loss": 0.0851, "step": 2448 }, { "epoch": 0.2718500443655723, "grad_norm": 0.5914182066917419, "learning_rate": 1.9202612768055217e-05, "loss": 0.072, "step": 2451 }, { "epoch": 0.27218278615794145, "grad_norm": 1.202878713607788, "learning_rate": 1.9198915454769534e-05, "loss": 0.0838, "step": 2454 }, { "epoch": 0.27251552795031053, "grad_norm": 0.7631196975708008, "learning_rate": 1.9195218141483855e-05, "loss": 0.0747, "step": 2457 }, { "epoch": 0.27284826974267967, "grad_norm": 0.9376946091651917, "learning_rate": 1.919152082819818e-05, "loss": 0.053, "step": 2460 }, { "epoch": 0.2731810115350488, "grad_norm": 0.4363947808742523, "learning_rate": 1.9187823514912497e-05, "loss": 0.0635, "step": 2463 }, { "epoch": 0.2735137533274179, "grad_norm": 0.3375091850757599, "learning_rate": 1.918412620162682e-05, "loss": 0.0783, "step": 2466 }, { "epoch": 0.27384649511978704, "grad_norm": 1.0196411609649658, "learning_rate": 1.918042888834114e-05, "loss": 0.0787, "step": 2469 }, { "epoch": 0.2741792369121562, "grad_norm": 0.5412901639938354, "learning_rate": 1.917673157505546e-05, "loss": 0.0692, "step": 2472 }, { "epoch": 0.27451197870452526, "grad_norm": 0.8565272092819214, "learning_rate": 1.9173034261769783e-05, "loss": 0.0565, "step": 2475 }, { "epoch": 0.2748447204968944, "grad_norm": 0.7125255465507507, "learning_rate": 1.9169336948484104e-05, "loss": 0.0448, "step": 2478 }, { "epoch": 0.27517746228926354, "grad_norm": 0.5161442160606384, "learning_rate": 1.916563963519842e-05, "loss": 0.0946, "step": 2481 }, { "epoch": 0.2755102040816326, "grad_norm": 0.42078039050102234, "learning_rate": 1.9161942321912746e-05, "loss": 0.0358, "step": 2484 }, { "epoch": 0.27584294587400177, "grad_norm": 0.3587859272956848, "learning_rate": 1.9158245008627067e-05, "loss": 0.054, "step": 2487 }, { "epoch": 0.2761756876663709, "grad_norm": 0.7641632556915283, "learning_rate": 1.9154547695341387e-05, "loss": 0.051, "step": 2490 }, { "epoch": 0.27650842945874, "grad_norm": 1.310001254081726, "learning_rate": 1.9150850382055708e-05, "loss": 0.0573, "step": 2493 }, { "epoch": 0.27684117125110913, "grad_norm": 1.064342975616455, "learning_rate": 1.914715306877003e-05, "loss": 0.0941, "step": 2496 }, { "epoch": 0.27717391304347827, "grad_norm": 0.5678613781929016, "learning_rate": 1.914345575548435e-05, "loss": 0.0635, "step": 2499 }, { "epoch": 0.27750665483584736, "grad_norm": 1.0358970165252686, "learning_rate": 1.913975844219867e-05, "loss": 0.0713, "step": 2502 }, { "epoch": 0.2778393966282165, "grad_norm": 0.8945434093475342, "learning_rate": 1.913606112891299e-05, "loss": 0.0521, "step": 2505 }, { "epoch": 0.27817213842058564, "grad_norm": 0.21948325634002686, "learning_rate": 1.9132363815627312e-05, "loss": 0.0329, "step": 2508 }, { "epoch": 0.2785048802129547, "grad_norm": 0.4656367599964142, "learning_rate": 1.9128666502341633e-05, "loss": 0.0528, "step": 2511 }, { "epoch": 0.27883762200532386, "grad_norm": 0.6004865765571594, "learning_rate": 1.9124969189055954e-05, "loss": 0.0931, "step": 2514 }, { "epoch": 0.279170363797693, "grad_norm": 0.9988088607788086, "learning_rate": 1.9121271875770275e-05, "loss": 0.0696, "step": 2517 }, { "epoch": 0.2795031055900621, "grad_norm": 0.6354143619537354, "learning_rate": 1.9117574562484596e-05, "loss": 0.0522, "step": 2520 }, { "epoch": 0.2798358473824312, "grad_norm": 0.8368982076644897, "learning_rate": 1.9113877249198916e-05, "loss": 0.0659, "step": 2523 }, { "epoch": 0.28016858917480036, "grad_norm": 0.9057294726371765, "learning_rate": 1.9110179935913237e-05, "loss": 0.0618, "step": 2526 }, { "epoch": 0.28050133096716945, "grad_norm": 0.9997484683990479, "learning_rate": 1.9106482622627558e-05, "loss": 0.0897, "step": 2529 }, { "epoch": 0.2808340727595386, "grad_norm": 0.32672780752182007, "learning_rate": 1.910278530934188e-05, "loss": 0.0382, "step": 2532 }, { "epoch": 0.28116681455190773, "grad_norm": 0.8742029070854187, "learning_rate": 1.90990879960562e-05, "loss": 0.0694, "step": 2535 }, { "epoch": 0.28149955634427687, "grad_norm": 0.42675644159317017, "learning_rate": 1.9095390682770524e-05, "loss": 0.041, "step": 2538 }, { "epoch": 0.28183229813664595, "grad_norm": 0.5001950263977051, "learning_rate": 1.909169336948484e-05, "loss": 0.0352, "step": 2541 }, { "epoch": 0.2821650399290151, "grad_norm": 1.2744576930999756, "learning_rate": 1.9087996056199162e-05, "loss": 0.0705, "step": 2544 }, { "epoch": 0.28249778172138423, "grad_norm": 0.551737368106842, "learning_rate": 1.9084298742913486e-05, "loss": 0.0499, "step": 2547 }, { "epoch": 0.2828305235137533, "grad_norm": 0.6501047015190125, "learning_rate": 1.9080601429627804e-05, "loss": 0.0494, "step": 2550 }, { "epoch": 0.28316326530612246, "grad_norm": 0.48653414845466614, "learning_rate": 1.9076904116342125e-05, "loss": 0.0454, "step": 2553 }, { "epoch": 0.2834960070984916, "grad_norm": 1.1439030170440674, "learning_rate": 1.907320680305645e-05, "loss": 0.078, "step": 2556 }, { "epoch": 0.2838287488908607, "grad_norm": 1.0533591508865356, "learning_rate": 1.9069509489770766e-05, "loss": 0.0643, "step": 2559 }, { "epoch": 0.2841614906832298, "grad_norm": 1.2341842651367188, "learning_rate": 1.906581217648509e-05, "loss": 0.0586, "step": 2562 }, { "epoch": 0.28449423247559896, "grad_norm": 0.7049636244773865, "learning_rate": 1.906211486319941e-05, "loss": 0.0685, "step": 2565 }, { "epoch": 0.28482697426796805, "grad_norm": 0.943742573261261, "learning_rate": 1.905841754991373e-05, "loss": 0.0691, "step": 2568 }, { "epoch": 0.2851597160603372, "grad_norm": 0.7329608798027039, "learning_rate": 1.9054720236628053e-05, "loss": 0.0589, "step": 2571 }, { "epoch": 0.2854924578527063, "grad_norm": 0.4950379431247711, "learning_rate": 1.9051022923342374e-05, "loss": 0.0547, "step": 2574 }, { "epoch": 0.2858251996450754, "grad_norm": 0.7328525185585022, "learning_rate": 1.904732561005669e-05, "loss": 0.0705, "step": 2577 }, { "epoch": 0.28615794143744455, "grad_norm": 0.7032660245895386, "learning_rate": 1.9043628296771015e-05, "loss": 0.0475, "step": 2580 }, { "epoch": 0.2864906832298137, "grad_norm": 0.5802333950996399, "learning_rate": 1.9039930983485336e-05, "loss": 0.0714, "step": 2583 }, { "epoch": 0.2868234250221828, "grad_norm": 1.7617226839065552, "learning_rate": 1.9036233670199657e-05, "loss": 0.1235, "step": 2586 }, { "epoch": 0.2871561668145519, "grad_norm": 0.5295485854148865, "learning_rate": 1.9032536356913978e-05, "loss": 0.0764, "step": 2589 }, { "epoch": 0.28748890860692106, "grad_norm": 1.1503182649612427, "learning_rate": 1.90288390436283e-05, "loss": 0.0658, "step": 2592 }, { "epoch": 0.28782165039929014, "grad_norm": 0.8864259719848633, "learning_rate": 1.902514173034262e-05, "loss": 0.0585, "step": 2595 }, { "epoch": 0.2881543921916593, "grad_norm": 0.6619686484336853, "learning_rate": 1.902144441705694e-05, "loss": 0.089, "step": 2598 }, { "epoch": 0.2884871339840284, "grad_norm": 1.1683807373046875, "learning_rate": 1.901774710377126e-05, "loss": 0.0588, "step": 2601 }, { "epoch": 0.2888198757763975, "grad_norm": 0.570400595664978, "learning_rate": 1.9014049790485582e-05, "loss": 0.0699, "step": 2604 }, { "epoch": 0.28915261756876665, "grad_norm": 0.4544609785079956, "learning_rate": 1.9010352477199903e-05, "loss": 0.0528, "step": 2607 }, { "epoch": 0.2894853593611358, "grad_norm": 0.6532331109046936, "learning_rate": 1.9006655163914224e-05, "loss": 0.0851, "step": 2610 }, { "epoch": 0.28981810115350487, "grad_norm": 0.5925371646881104, "learning_rate": 1.9002957850628544e-05, "loss": 0.06, "step": 2613 }, { "epoch": 0.290150842945874, "grad_norm": 0.5388638973236084, "learning_rate": 1.8999260537342865e-05, "loss": 0.0576, "step": 2616 }, { "epoch": 0.29048358473824315, "grad_norm": 0.44572585821151733, "learning_rate": 1.8995563224057186e-05, "loss": 0.0793, "step": 2619 }, { "epoch": 0.29081632653061223, "grad_norm": 0.7116209864616394, "learning_rate": 1.8991865910771507e-05, "loss": 0.0571, "step": 2622 }, { "epoch": 0.2911490683229814, "grad_norm": 0.6069960594177246, "learning_rate": 1.8988168597485828e-05, "loss": 0.0351, "step": 2625 }, { "epoch": 0.2914818101153505, "grad_norm": 0.7012966275215149, "learning_rate": 1.898447128420015e-05, "loss": 0.0629, "step": 2628 }, { "epoch": 0.2918145519077196, "grad_norm": 0.6790446639060974, "learning_rate": 1.898077397091447e-05, "loss": 0.0659, "step": 2631 }, { "epoch": 0.29214729370008874, "grad_norm": 0.9958663582801819, "learning_rate": 1.8977076657628794e-05, "loss": 0.0611, "step": 2634 }, { "epoch": 0.2924800354924579, "grad_norm": 0.6715267300605774, "learning_rate": 1.897337934434311e-05, "loss": 0.0497, "step": 2637 }, { "epoch": 0.29281277728482696, "grad_norm": 1.0760561227798462, "learning_rate": 1.8969682031057432e-05, "loss": 0.0551, "step": 2640 }, { "epoch": 0.2931455190771961, "grad_norm": 0.69921875, "learning_rate": 1.8965984717771756e-05, "loss": 0.0649, "step": 2643 }, { "epoch": 0.29347826086956524, "grad_norm": 0.49696269631385803, "learning_rate": 1.8962287404486073e-05, "loss": 0.0308, "step": 2646 }, { "epoch": 0.29381100266193433, "grad_norm": 1.5586237907409668, "learning_rate": 1.8958590091200394e-05, "loss": 0.0602, "step": 2649 }, { "epoch": 0.29414374445430347, "grad_norm": 0.49473321437835693, "learning_rate": 1.895489277791472e-05, "loss": 0.0586, "step": 2652 }, { "epoch": 0.2944764862466726, "grad_norm": 0.8783785700798035, "learning_rate": 1.8951195464629036e-05, "loss": 0.0752, "step": 2655 }, { "epoch": 0.2948092280390417, "grad_norm": 0.5477330088615417, "learning_rate": 1.894749815134336e-05, "loss": 0.0531, "step": 2658 }, { "epoch": 0.29514196983141083, "grad_norm": 0.7200953364372253, "learning_rate": 1.894380083805768e-05, "loss": 0.0572, "step": 2661 }, { "epoch": 0.29547471162378, "grad_norm": 0.6766883134841919, "learning_rate": 1.8940103524772e-05, "loss": 0.0543, "step": 2664 }, { "epoch": 0.29580745341614906, "grad_norm": 1.0133357048034668, "learning_rate": 1.8936406211486323e-05, "loss": 0.0711, "step": 2667 }, { "epoch": 0.2961401952085182, "grad_norm": 1.081597089767456, "learning_rate": 1.8932708898200643e-05, "loss": 0.0564, "step": 2670 }, { "epoch": 0.29647293700088734, "grad_norm": 0.9823821783065796, "learning_rate": 1.8929011584914964e-05, "loss": 0.0692, "step": 2673 }, { "epoch": 0.2968056787932564, "grad_norm": 0.731998860836029, "learning_rate": 1.8925314271629285e-05, "loss": 0.0634, "step": 2676 }, { "epoch": 0.29713842058562556, "grad_norm": 0.32308724522590637, "learning_rate": 1.8921616958343606e-05, "loss": 0.0763, "step": 2679 }, { "epoch": 0.2974711623779947, "grad_norm": 0.3876584768295288, "learning_rate": 1.8917919645057927e-05, "loss": 0.0477, "step": 2682 }, { "epoch": 0.2978039041703638, "grad_norm": 0.748982310295105, "learning_rate": 1.8914222331772247e-05, "loss": 0.0847, "step": 2685 }, { "epoch": 0.2981366459627329, "grad_norm": 0.9676303863525391, "learning_rate": 1.8910525018486568e-05, "loss": 0.0814, "step": 2688 }, { "epoch": 0.29846938775510207, "grad_norm": 1.228436827659607, "learning_rate": 1.890682770520089e-05, "loss": 0.0806, "step": 2691 }, { "epoch": 0.29880212954747115, "grad_norm": 0.5400599837303162, "learning_rate": 1.890313039191521e-05, "loss": 0.0517, "step": 2694 }, { "epoch": 0.2991348713398403, "grad_norm": 0.937278151512146, "learning_rate": 1.889943307862953e-05, "loss": 0.0883, "step": 2697 }, { "epoch": 0.29946761313220943, "grad_norm": 1.1797727346420288, "learning_rate": 1.889573576534385e-05, "loss": 0.0749, "step": 2700 }, { "epoch": 0.2998003549245785, "grad_norm": 0.9097458720207214, "learning_rate": 1.8892038452058172e-05, "loss": 0.0907, "step": 2703 }, { "epoch": 0.30013309671694766, "grad_norm": 0.818001389503479, "learning_rate": 1.8888341138772493e-05, "loss": 0.0342, "step": 2706 }, { "epoch": 0.3004658385093168, "grad_norm": 0.9021499752998352, "learning_rate": 1.8884643825486814e-05, "loss": 0.0831, "step": 2709 }, { "epoch": 0.3007985803016859, "grad_norm": 0.4656893312931061, "learning_rate": 1.8880946512201135e-05, "loss": 0.0338, "step": 2712 }, { "epoch": 0.301131322094055, "grad_norm": 0.6076945066452026, "learning_rate": 1.8877249198915456e-05, "loss": 0.0504, "step": 2715 }, { "epoch": 0.30146406388642416, "grad_norm": 0.8216308355331421, "learning_rate": 1.8873551885629776e-05, "loss": 0.0279, "step": 2718 }, { "epoch": 0.30179680567879325, "grad_norm": 0.6028926968574524, "learning_rate": 1.8869854572344097e-05, "loss": 0.0897, "step": 2721 }, { "epoch": 0.3021295474711624, "grad_norm": 0.8631056547164917, "learning_rate": 1.8866157259058418e-05, "loss": 0.0455, "step": 2724 }, { "epoch": 0.3024622892635315, "grad_norm": 0.7998207807540894, "learning_rate": 1.886245994577274e-05, "loss": 0.0883, "step": 2727 }, { "epoch": 0.3027950310559006, "grad_norm": 1.000796914100647, "learning_rate": 1.885876263248706e-05, "loss": 0.0859, "step": 2730 }, { "epoch": 0.30312777284826975, "grad_norm": 0.9148082733154297, "learning_rate": 1.885506531920138e-05, "loss": 0.0611, "step": 2733 }, { "epoch": 0.3034605146406389, "grad_norm": 0.8558679819107056, "learning_rate": 1.88513680059157e-05, "loss": 0.0759, "step": 2736 }, { "epoch": 0.303793256433008, "grad_norm": 0.7146349549293518, "learning_rate": 1.8847670692630026e-05, "loss": 0.0658, "step": 2739 }, { "epoch": 0.3041259982253771, "grad_norm": 0.7332908511161804, "learning_rate": 1.8843973379344343e-05, "loss": 0.0491, "step": 2742 }, { "epoch": 0.30445874001774625, "grad_norm": 0.565182089805603, "learning_rate": 1.8840276066058664e-05, "loss": 0.0874, "step": 2745 }, { "epoch": 0.30479148181011534, "grad_norm": 0.9798440337181091, "learning_rate": 1.8836578752772988e-05, "loss": 0.0931, "step": 2748 }, { "epoch": 0.3051242236024845, "grad_norm": 0.6186816096305847, "learning_rate": 1.8832881439487306e-05, "loss": 0.0713, "step": 2751 }, { "epoch": 0.3054569653948536, "grad_norm": 0.6589553952217102, "learning_rate": 1.8829184126201626e-05, "loss": 0.0418, "step": 2754 }, { "epoch": 0.3057897071872227, "grad_norm": 0.6093562245368958, "learning_rate": 1.882548681291595e-05, "loss": 0.0708, "step": 2757 }, { "epoch": 0.30612244897959184, "grad_norm": 1.2100138664245605, "learning_rate": 1.882178949963027e-05, "loss": 0.076, "step": 2760 }, { "epoch": 0.306455190771961, "grad_norm": 0.6600022315979004, "learning_rate": 1.8818092186344592e-05, "loss": 0.0675, "step": 2763 }, { "epoch": 0.30678793256433007, "grad_norm": 0.7239612936973572, "learning_rate": 1.8814394873058913e-05, "loss": 0.0763, "step": 2766 }, { "epoch": 0.3071206743566992, "grad_norm": 0.45550718903541565, "learning_rate": 1.8810697559773234e-05, "loss": 0.0559, "step": 2769 }, { "epoch": 0.30745341614906835, "grad_norm": 0.931496262550354, "learning_rate": 1.8807000246487555e-05, "loss": 0.0646, "step": 2772 }, { "epoch": 0.30778615794143743, "grad_norm": 0.6480677127838135, "learning_rate": 1.8803302933201875e-05, "loss": 0.0639, "step": 2775 }, { "epoch": 0.3081188997338066, "grad_norm": 0.7240896224975586, "learning_rate": 1.8799605619916196e-05, "loss": 0.0415, "step": 2778 }, { "epoch": 0.3084516415261757, "grad_norm": 2.9924631118774414, "learning_rate": 1.8795908306630517e-05, "loss": 0.1286, "step": 2781 }, { "epoch": 0.3087843833185448, "grad_norm": 0.6245113611221313, "learning_rate": 1.8792210993344838e-05, "loss": 0.0714, "step": 2784 }, { "epoch": 0.30911712511091394, "grad_norm": 0.7358962893486023, "learning_rate": 1.878851368005916e-05, "loss": 0.1041, "step": 2787 }, { "epoch": 0.3094498669032831, "grad_norm": 0.42506420612335205, "learning_rate": 1.878481636677348e-05, "loss": 0.0633, "step": 2790 }, { "epoch": 0.30978260869565216, "grad_norm": 0.6131051778793335, "learning_rate": 1.87811190534878e-05, "loss": 0.0681, "step": 2793 }, { "epoch": 0.3101153504880213, "grad_norm": 1.4154939651489258, "learning_rate": 1.877742174020212e-05, "loss": 0.0608, "step": 2796 }, { "epoch": 0.31044809228039044, "grad_norm": 0.9665355682373047, "learning_rate": 1.8773724426916442e-05, "loss": 0.0473, "step": 2799 }, { "epoch": 0.3107808340727595, "grad_norm": 0.8604215383529663, "learning_rate": 1.8770027113630763e-05, "loss": 0.0444, "step": 2802 }, { "epoch": 0.31111357586512867, "grad_norm": 0.3589310050010681, "learning_rate": 1.8766329800345084e-05, "loss": 0.0358, "step": 2805 }, { "epoch": 0.3114463176574978, "grad_norm": 1.2165489196777344, "learning_rate": 1.8762632487059404e-05, "loss": 0.0531, "step": 2808 }, { "epoch": 0.3117790594498669, "grad_norm": 1.1086772680282593, "learning_rate": 1.8758935173773725e-05, "loss": 0.0893, "step": 2811 }, { "epoch": 0.31211180124223603, "grad_norm": 0.7608538866043091, "learning_rate": 1.8755237860488046e-05, "loss": 0.0729, "step": 2814 }, { "epoch": 0.31244454303460517, "grad_norm": 0.785419762134552, "learning_rate": 1.8751540547202367e-05, "loss": 0.065, "step": 2817 }, { "epoch": 0.31277728482697426, "grad_norm": 1.1765352487564087, "learning_rate": 1.8747843233916688e-05, "loss": 0.0889, "step": 2820 }, { "epoch": 0.3131100266193434, "grad_norm": 1.0763872861862183, "learning_rate": 1.874414592063101e-05, "loss": 0.0798, "step": 2823 }, { "epoch": 0.31344276841171254, "grad_norm": 0.4707179367542267, "learning_rate": 1.874044860734533e-05, "loss": 0.0513, "step": 2826 }, { "epoch": 0.3137755102040816, "grad_norm": 1.0982511043548584, "learning_rate": 1.873675129405965e-05, "loss": 0.0915, "step": 2829 }, { "epoch": 0.31410825199645076, "grad_norm": 0.507606565952301, "learning_rate": 1.873305398077397e-05, "loss": 0.0925, "step": 2832 }, { "epoch": 0.3144409937888199, "grad_norm": 0.5819019079208374, "learning_rate": 1.8729356667488295e-05, "loss": 0.0445, "step": 2835 }, { "epoch": 0.314773735581189, "grad_norm": 0.7455946803092957, "learning_rate": 1.8725659354202616e-05, "loss": 0.0939, "step": 2838 }, { "epoch": 0.3151064773735581, "grad_norm": 1.2425296306610107, "learning_rate": 1.8721962040916934e-05, "loss": 0.0464, "step": 2841 }, { "epoch": 0.31543921916592726, "grad_norm": 0.8738220930099487, "learning_rate": 1.8718264727631258e-05, "loss": 0.0888, "step": 2844 }, { "epoch": 0.31577196095829635, "grad_norm": 0.4375736117362976, "learning_rate": 1.871456741434558e-05, "loss": 0.0904, "step": 2847 }, { "epoch": 0.3161047027506655, "grad_norm": 0.9705800414085388, "learning_rate": 1.8710870101059896e-05, "loss": 0.0797, "step": 2850 }, { "epoch": 0.31643744454303463, "grad_norm": 0.47795581817626953, "learning_rate": 1.870717278777422e-05, "loss": 0.0437, "step": 2853 }, { "epoch": 0.3167701863354037, "grad_norm": 0.47983092069625854, "learning_rate": 1.870347547448854e-05, "loss": 0.0583, "step": 2856 }, { "epoch": 0.31710292812777285, "grad_norm": 0.48118361830711365, "learning_rate": 1.8699778161202862e-05, "loss": 0.0525, "step": 2859 }, { "epoch": 0.317435669920142, "grad_norm": 0.559276819229126, "learning_rate": 1.8696080847917183e-05, "loss": 0.0556, "step": 2862 }, { "epoch": 0.3177684117125111, "grad_norm": 0.885802686214447, "learning_rate": 1.8692383534631503e-05, "loss": 0.0591, "step": 2865 }, { "epoch": 0.3181011535048802, "grad_norm": 0.5892051458358765, "learning_rate": 1.8688686221345824e-05, "loss": 0.0729, "step": 2868 }, { "epoch": 0.31843389529724936, "grad_norm": 1.0395808219909668, "learning_rate": 1.8684988908060145e-05, "loss": 0.0517, "step": 2871 }, { "epoch": 0.31876663708961844, "grad_norm": 0.2998667359352112, "learning_rate": 1.8681291594774466e-05, "loss": 0.0719, "step": 2874 }, { "epoch": 0.3190993788819876, "grad_norm": 0.794590413570404, "learning_rate": 1.8677594281488787e-05, "loss": 0.0875, "step": 2877 }, { "epoch": 0.3194321206743567, "grad_norm": 0.42278173565864563, "learning_rate": 1.8673896968203108e-05, "loss": 0.0597, "step": 2880 }, { "epoch": 0.3197648624667258, "grad_norm": 0.8409509062767029, "learning_rate": 1.867019965491743e-05, "loss": 0.0796, "step": 2883 }, { "epoch": 0.32009760425909495, "grad_norm": 0.5023472309112549, "learning_rate": 1.866650234163175e-05, "loss": 0.0387, "step": 2886 }, { "epoch": 0.3204303460514641, "grad_norm": 0.5487576723098755, "learning_rate": 1.866280502834607e-05, "loss": 0.048, "step": 2889 }, { "epoch": 0.3207630878438332, "grad_norm": 0.7946828603744507, "learning_rate": 1.865910771506039e-05, "loss": 0.086, "step": 2892 }, { "epoch": 0.3210958296362023, "grad_norm": 0.556452214717865, "learning_rate": 1.865541040177471e-05, "loss": 0.0849, "step": 2895 }, { "epoch": 0.32142857142857145, "grad_norm": 0.6310257315635681, "learning_rate": 1.8651713088489032e-05, "loss": 0.0665, "step": 2898 }, { "epoch": 0.32176131322094054, "grad_norm": 0.6094896197319031, "learning_rate": 1.8648015775203353e-05, "loss": 0.0514, "step": 2901 }, { "epoch": 0.3220940550133097, "grad_norm": 0.8641183376312256, "learning_rate": 1.8644318461917674e-05, "loss": 0.0585, "step": 2904 }, { "epoch": 0.3224267968056788, "grad_norm": 0.4487057030200958, "learning_rate": 1.8640621148631995e-05, "loss": 0.0615, "step": 2907 }, { "epoch": 0.3227595385980479, "grad_norm": 0.7011855840682983, "learning_rate": 1.8636923835346316e-05, "loss": 0.061, "step": 2910 }, { "epoch": 0.32309228039041704, "grad_norm": 1.1589741706848145, "learning_rate": 1.8633226522060637e-05, "loss": 0.0635, "step": 2913 }, { "epoch": 0.3234250221827862, "grad_norm": 0.5549185276031494, "learning_rate": 1.8629529208774957e-05, "loss": 0.0531, "step": 2916 }, { "epoch": 0.32375776397515527, "grad_norm": 1.1407326459884644, "learning_rate": 1.8625831895489278e-05, "loss": 0.0723, "step": 2919 }, { "epoch": 0.3240905057675244, "grad_norm": 0.9473936557769775, "learning_rate": 1.86221345822036e-05, "loss": 0.0423, "step": 2922 }, { "epoch": 0.32442324755989355, "grad_norm": 0.7252969145774841, "learning_rate": 1.8618437268917923e-05, "loss": 0.0485, "step": 2925 }, { "epoch": 0.32475598935226263, "grad_norm": 0.7186986804008484, "learning_rate": 1.861473995563224e-05, "loss": 0.1046, "step": 2928 }, { "epoch": 0.32508873114463177, "grad_norm": 0.5465221405029297, "learning_rate": 1.8611042642346565e-05, "loss": 0.0828, "step": 2931 }, { "epoch": 0.3254214729370009, "grad_norm": 0.590917706489563, "learning_rate": 1.8607345329060886e-05, "loss": 0.0477, "step": 2934 }, { "epoch": 0.32575421472937, "grad_norm": 0.7593115568161011, "learning_rate": 1.8603648015775203e-05, "loss": 0.045, "step": 2937 }, { "epoch": 0.32608695652173914, "grad_norm": 1.0391074419021606, "learning_rate": 1.8599950702489527e-05, "loss": 0.0675, "step": 2940 }, { "epoch": 0.3264196983141083, "grad_norm": 0.44134989380836487, "learning_rate": 1.8596253389203848e-05, "loss": 0.0734, "step": 2943 }, { "epoch": 0.32675244010647736, "grad_norm": 0.5312244296073914, "learning_rate": 1.8592556075918166e-05, "loss": 0.0367, "step": 2946 }, { "epoch": 0.3270851818988465, "grad_norm": 0.2013341635465622, "learning_rate": 1.858885876263249e-05, "loss": 0.0621, "step": 2949 }, { "epoch": 0.32741792369121564, "grad_norm": 0.4060787558555603, "learning_rate": 1.858516144934681e-05, "loss": 0.0483, "step": 2952 }, { "epoch": 0.3277506654835847, "grad_norm": 0.758514404296875, "learning_rate": 1.858146413606113e-05, "loss": 0.0544, "step": 2955 }, { "epoch": 0.32808340727595386, "grad_norm": 0.9805461168289185, "learning_rate": 1.8577766822775452e-05, "loss": 0.049, "step": 2958 }, { "epoch": 0.328416149068323, "grad_norm": 0.7362799048423767, "learning_rate": 1.8574069509489773e-05, "loss": 0.081, "step": 2961 }, { "epoch": 0.3287488908606921, "grad_norm": 1.0321418046951294, "learning_rate": 1.8570372196204094e-05, "loss": 0.0662, "step": 2964 }, { "epoch": 0.32908163265306123, "grad_norm": 0.6665422916412354, "learning_rate": 1.8566674882918415e-05, "loss": 0.0477, "step": 2967 }, { "epoch": 0.32941437444543037, "grad_norm": 0.8438568115234375, "learning_rate": 1.8562977569632736e-05, "loss": 0.0381, "step": 2970 }, { "epoch": 0.32974711623779945, "grad_norm": 0.9471597671508789, "learning_rate": 1.8559280256347056e-05, "loss": 0.0555, "step": 2973 }, { "epoch": 0.3300798580301686, "grad_norm": 0.8542424440383911, "learning_rate": 1.8555582943061377e-05, "loss": 0.0545, "step": 2976 }, { "epoch": 0.33041259982253773, "grad_norm": 0.8964595794677734, "learning_rate": 1.8551885629775698e-05, "loss": 0.0628, "step": 2979 }, { "epoch": 0.3307453416149068, "grad_norm": 0.9441608190536499, "learning_rate": 1.854818831649002e-05, "loss": 0.0507, "step": 2982 }, { "epoch": 0.33107808340727596, "grad_norm": 1.6162463426589966, "learning_rate": 1.854449100320434e-05, "loss": 0.0597, "step": 2985 }, { "epoch": 0.3314108251996451, "grad_norm": 0.9303036332130432, "learning_rate": 1.854079368991866e-05, "loss": 0.08, "step": 2988 }, { "epoch": 0.3317435669920142, "grad_norm": 0.6820287108421326, "learning_rate": 1.853709637663298e-05, "loss": 0.0869, "step": 2991 }, { "epoch": 0.3320763087843833, "grad_norm": 0.713672399520874, "learning_rate": 1.8533399063347302e-05, "loss": 0.0577, "step": 2994 }, { "epoch": 0.33240905057675246, "grad_norm": 0.49177172780036926, "learning_rate": 1.8529701750061623e-05, "loss": 0.0322, "step": 2997 }, { "epoch": 0.33274179236912155, "grad_norm": 0.881343424320221, "learning_rate": 1.8526004436775944e-05, "loss": 0.0612, "step": 3000 }, { "epoch": 0.3330745341614907, "grad_norm": 0.5112404227256775, "learning_rate": 1.8522307123490268e-05, "loss": 0.0407, "step": 3003 }, { "epoch": 0.3334072759538598, "grad_norm": 0.3444555699825287, "learning_rate": 1.8518609810204585e-05, "loss": 0.0456, "step": 3006 }, { "epoch": 0.3337400177462289, "grad_norm": 0.7625641822814941, "learning_rate": 1.8514912496918906e-05, "loss": 0.0707, "step": 3009 }, { "epoch": 0.33407275953859805, "grad_norm": 0.5832079648971558, "learning_rate": 1.851121518363323e-05, "loss": 0.0548, "step": 3012 }, { "epoch": 0.3344055013309672, "grad_norm": 0.5985016822814941, "learning_rate": 1.8507517870347548e-05, "loss": 0.0577, "step": 3015 }, { "epoch": 0.3347382431233363, "grad_norm": 0.8900985717773438, "learning_rate": 1.850382055706187e-05, "loss": 0.0658, "step": 3018 }, { "epoch": 0.3350709849157054, "grad_norm": 0.6362046003341675, "learning_rate": 1.8500123243776193e-05, "loss": 0.0479, "step": 3021 }, { "epoch": 0.33540372670807456, "grad_norm": 0.6196051836013794, "learning_rate": 1.849642593049051e-05, "loss": 0.0887, "step": 3024 }, { "epoch": 0.33573646850044364, "grad_norm": 0.5766664147377014, "learning_rate": 1.8492728617204835e-05, "loss": 0.0618, "step": 3027 }, { "epoch": 0.3360692102928128, "grad_norm": 0.6710587739944458, "learning_rate": 1.8489031303919155e-05, "loss": 0.0648, "step": 3030 }, { "epoch": 0.3364019520851819, "grad_norm": 0.6429928541183472, "learning_rate": 1.8485333990633473e-05, "loss": 0.0943, "step": 3033 }, { "epoch": 0.336734693877551, "grad_norm": 0.46570590138435364, "learning_rate": 1.8481636677347797e-05, "loss": 0.0679, "step": 3036 }, { "epoch": 0.33706743566992015, "grad_norm": 0.8101410269737244, "learning_rate": 1.8477939364062118e-05, "loss": 0.0437, "step": 3039 }, { "epoch": 0.3374001774622893, "grad_norm": 0.5311592221260071, "learning_rate": 1.8474242050776435e-05, "loss": 0.0392, "step": 3042 }, { "epoch": 0.33773291925465837, "grad_norm": 0.3554839789867401, "learning_rate": 1.847054473749076e-05, "loss": 0.0455, "step": 3045 }, { "epoch": 0.3380656610470275, "grad_norm": 1.8063290119171143, "learning_rate": 1.846684742420508e-05, "loss": 0.033, "step": 3048 }, { "epoch": 0.33839840283939665, "grad_norm": 1.0300588607788086, "learning_rate": 1.84631501109194e-05, "loss": 0.0783, "step": 3051 }, { "epoch": 0.33873114463176573, "grad_norm": 0.790448784828186, "learning_rate": 1.8459452797633722e-05, "loss": 0.0821, "step": 3054 }, { "epoch": 0.3390638864241349, "grad_norm": 0.7456640005111694, "learning_rate": 1.8455755484348043e-05, "loss": 0.0408, "step": 3057 }, { "epoch": 0.339396628216504, "grad_norm": 0.48435062170028687, "learning_rate": 1.8452058171062364e-05, "loss": 0.0252, "step": 3060 }, { "epoch": 0.3397293700088731, "grad_norm": 0.6230789422988892, "learning_rate": 1.8448360857776684e-05, "loss": 0.0488, "step": 3063 }, { "epoch": 0.34006211180124224, "grad_norm": 0.9650553464889526, "learning_rate": 1.8444663544491005e-05, "loss": 0.0587, "step": 3066 }, { "epoch": 0.3403948535936114, "grad_norm": 0.6961053609848022, "learning_rate": 1.8440966231205326e-05, "loss": 0.058, "step": 3069 }, { "epoch": 0.34072759538598046, "grad_norm": 1.014679193496704, "learning_rate": 1.8437268917919647e-05, "loss": 0.0977, "step": 3072 }, { "epoch": 0.3410603371783496, "grad_norm": 0.4624837040901184, "learning_rate": 1.8433571604633968e-05, "loss": 0.0464, "step": 3075 }, { "epoch": 0.34139307897071874, "grad_norm": 0.5442583560943604, "learning_rate": 1.842987429134829e-05, "loss": 0.0517, "step": 3078 }, { "epoch": 0.34172582076308783, "grad_norm": 0.559546709060669, "learning_rate": 1.842617697806261e-05, "loss": 0.0526, "step": 3081 }, { "epoch": 0.34205856255545697, "grad_norm": 1.5782984495162964, "learning_rate": 1.842247966477693e-05, "loss": 0.0465, "step": 3084 }, { "epoch": 0.3423913043478261, "grad_norm": 0.8984056115150452, "learning_rate": 1.841878235149125e-05, "loss": 0.0892, "step": 3087 }, { "epoch": 0.3427240461401952, "grad_norm": 0.8382478952407837, "learning_rate": 1.8415085038205572e-05, "loss": 0.0447, "step": 3090 }, { "epoch": 0.34305678793256433, "grad_norm": 0.533497154712677, "learning_rate": 1.8411387724919893e-05, "loss": 0.0628, "step": 3093 }, { "epoch": 0.3433895297249335, "grad_norm": 0.6629424095153809, "learning_rate": 1.8407690411634213e-05, "loss": 0.0658, "step": 3096 }, { "epoch": 0.34372227151730256, "grad_norm": 1.0164272785186768, "learning_rate": 1.8403993098348538e-05, "loss": 0.0899, "step": 3099 }, { "epoch": 0.3440550133096717, "grad_norm": 0.7909255623817444, "learning_rate": 1.8400295785062855e-05, "loss": 0.0515, "step": 3102 }, { "epoch": 0.34438775510204084, "grad_norm": 0.4436783194541931, "learning_rate": 1.8396598471777176e-05, "loss": 0.0423, "step": 3105 }, { "epoch": 0.3447204968944099, "grad_norm": 1.1336710453033447, "learning_rate": 1.83929011584915e-05, "loss": 0.057, "step": 3108 }, { "epoch": 0.34505323868677906, "grad_norm": 0.35963669419288635, "learning_rate": 1.8389203845205817e-05, "loss": 0.0565, "step": 3111 }, { "epoch": 0.3453859804791482, "grad_norm": 0.8173931837081909, "learning_rate": 1.838550653192014e-05, "loss": 0.0681, "step": 3114 }, { "epoch": 0.3457187222715173, "grad_norm": 0.5774978995323181, "learning_rate": 1.8381809218634463e-05, "loss": 0.0655, "step": 3117 }, { "epoch": 0.3460514640638864, "grad_norm": 0.5434301495552063, "learning_rate": 1.837811190534878e-05, "loss": 0.065, "step": 3120 }, { "epoch": 0.34638420585625557, "grad_norm": 0.5795627236366272, "learning_rate": 1.8374414592063104e-05, "loss": 0.0337, "step": 3123 }, { "epoch": 0.34671694764862465, "grad_norm": 0.337461918592453, "learning_rate": 1.8370717278777425e-05, "loss": 0.0205, "step": 3126 }, { "epoch": 0.3470496894409938, "grad_norm": 1.2700674533843994, "learning_rate": 1.8367019965491742e-05, "loss": 0.0995, "step": 3129 }, { "epoch": 0.34738243123336293, "grad_norm": 0.5842007994651794, "learning_rate": 1.8363322652206067e-05, "loss": 0.0495, "step": 3132 }, { "epoch": 0.347715173025732, "grad_norm": 0.4923354983329773, "learning_rate": 1.8359625338920387e-05, "loss": 0.0305, "step": 3135 }, { "epoch": 0.34804791481810116, "grad_norm": 0.3931798040866852, "learning_rate": 1.8355928025634705e-05, "loss": 0.0588, "step": 3138 }, { "epoch": 0.3483806566104703, "grad_norm": 0.45947206020355225, "learning_rate": 1.835223071234903e-05, "loss": 0.0641, "step": 3141 }, { "epoch": 0.3487133984028394, "grad_norm": 0.7210850119590759, "learning_rate": 1.834853339906335e-05, "loss": 0.0402, "step": 3144 }, { "epoch": 0.3490461401952085, "grad_norm": 0.43192970752716064, "learning_rate": 1.834483608577767e-05, "loss": 0.0647, "step": 3147 }, { "epoch": 0.34937888198757766, "grad_norm": 0.5771254301071167, "learning_rate": 1.834113877249199e-05, "loss": 0.0369, "step": 3150 }, { "epoch": 0.34971162377994675, "grad_norm": 0.5738529562950134, "learning_rate": 1.8337441459206312e-05, "loss": 0.0566, "step": 3153 }, { "epoch": 0.3500443655723159, "grad_norm": 0.5538614392280579, "learning_rate": 1.8333744145920633e-05, "loss": 0.0506, "step": 3156 }, { "epoch": 0.350377107364685, "grad_norm": 0.9904794096946716, "learning_rate": 1.8330046832634954e-05, "loss": 0.0515, "step": 3159 }, { "epoch": 0.3507098491570541, "grad_norm": 0.7667285799980164, "learning_rate": 1.8326349519349275e-05, "loss": 0.0666, "step": 3162 }, { "epoch": 0.35104259094942325, "grad_norm": 0.2695339024066925, "learning_rate": 1.8322652206063596e-05, "loss": 0.04, "step": 3165 }, { "epoch": 0.3513753327417924, "grad_norm": 0.6790674328804016, "learning_rate": 1.8318954892777916e-05, "loss": 0.0791, "step": 3168 }, { "epoch": 0.3517080745341615, "grad_norm": 0.745039165019989, "learning_rate": 1.8315257579492237e-05, "loss": 0.0826, "step": 3171 }, { "epoch": 0.3520408163265306, "grad_norm": 0.3177897036075592, "learning_rate": 1.8311560266206558e-05, "loss": 0.0605, "step": 3174 }, { "epoch": 0.35237355811889975, "grad_norm": 0.9375316500663757, "learning_rate": 1.830786295292088e-05, "loss": 0.0658, "step": 3177 }, { "epoch": 0.35270629991126884, "grad_norm": 0.48140013217926025, "learning_rate": 1.83041656396352e-05, "loss": 0.043, "step": 3180 }, { "epoch": 0.353039041703638, "grad_norm": 1.10736083984375, "learning_rate": 1.830046832634952e-05, "loss": 0.0414, "step": 3183 }, { "epoch": 0.3533717834960071, "grad_norm": 0.4090302288532257, "learning_rate": 1.829677101306384e-05, "loss": 0.0777, "step": 3186 }, { "epoch": 0.3537045252883762, "grad_norm": 1.2782487869262695, "learning_rate": 1.8293073699778162e-05, "loss": 0.0579, "step": 3189 }, { "epoch": 0.35403726708074534, "grad_norm": 1.884108066558838, "learning_rate": 1.8289376386492483e-05, "loss": 0.1587, "step": 3192 }, { "epoch": 0.3543700088731145, "grad_norm": 0.8434795141220093, "learning_rate": 1.8285679073206804e-05, "loss": 0.045, "step": 3195 }, { "epoch": 0.35470275066548357, "grad_norm": 0.6141440272331238, "learning_rate": 1.8281981759921125e-05, "loss": 0.0676, "step": 3198 }, { "epoch": 0.3550354924578527, "grad_norm": 0.5078058242797852, "learning_rate": 1.8278284446635445e-05, "loss": 0.1063, "step": 3201 }, { "epoch": 0.35536823425022185, "grad_norm": 0.720430314540863, "learning_rate": 1.827458713334977e-05, "loss": 0.0552, "step": 3204 }, { "epoch": 0.35570097604259093, "grad_norm": 0.7049136757850647, "learning_rate": 1.8270889820064087e-05, "loss": 0.0454, "step": 3207 }, { "epoch": 0.3560337178349601, "grad_norm": 0.4564612805843353, "learning_rate": 1.8267192506778408e-05, "loss": 0.0583, "step": 3210 }, { "epoch": 0.3563664596273292, "grad_norm": 0.6002784371376038, "learning_rate": 1.8263495193492732e-05, "loss": 0.0652, "step": 3213 }, { "epoch": 0.3566992014196983, "grad_norm": 0.684148907661438, "learning_rate": 1.825979788020705e-05, "loss": 0.0972, "step": 3216 }, { "epoch": 0.35703194321206744, "grad_norm": 0.39668551087379456, "learning_rate": 1.825610056692137e-05, "loss": 0.0449, "step": 3219 }, { "epoch": 0.3573646850044366, "grad_norm": 0.8330065011978149, "learning_rate": 1.8252403253635695e-05, "loss": 0.0485, "step": 3222 }, { "epoch": 0.35769742679680566, "grad_norm": 0.39779818058013916, "learning_rate": 1.8248705940350012e-05, "loss": 0.0572, "step": 3225 }, { "epoch": 0.3580301685891748, "grad_norm": 0.6342604756355286, "learning_rate": 1.8245008627064336e-05, "loss": 0.0563, "step": 3228 }, { "epoch": 0.35836291038154394, "grad_norm": 0.40643060207366943, "learning_rate": 1.8241311313778657e-05, "loss": 0.0701, "step": 3231 }, { "epoch": 0.358695652173913, "grad_norm": 0.8875868916511536, "learning_rate": 1.8237614000492975e-05, "loss": 0.0855, "step": 3234 }, { "epoch": 0.35902839396628217, "grad_norm": 0.6023029088973999, "learning_rate": 1.82339166872073e-05, "loss": 0.0441, "step": 3237 }, { "epoch": 0.3593611357586513, "grad_norm": 0.9316222667694092, "learning_rate": 1.823021937392162e-05, "loss": 0.072, "step": 3240 }, { "epoch": 0.3596938775510204, "grad_norm": 0.6170175075531006, "learning_rate": 1.8226522060635937e-05, "loss": 0.0749, "step": 3243 }, { "epoch": 0.36002661934338953, "grad_norm": 0.5921156406402588, "learning_rate": 1.822282474735026e-05, "loss": 0.0539, "step": 3246 }, { "epoch": 0.36035936113575867, "grad_norm": 0.7311082482337952, "learning_rate": 1.8219127434064582e-05, "loss": 0.0526, "step": 3249 }, { "epoch": 0.36069210292812776, "grad_norm": 0.4799564480781555, "learning_rate": 1.8215430120778903e-05, "loss": 0.0383, "step": 3252 }, { "epoch": 0.3610248447204969, "grad_norm": 0.45811474323272705, "learning_rate": 1.8211732807493224e-05, "loss": 0.0706, "step": 3255 }, { "epoch": 0.36135758651286604, "grad_norm": 0.5112932920455933, "learning_rate": 1.8208035494207544e-05, "loss": 0.0822, "step": 3258 }, { "epoch": 0.3616903283052351, "grad_norm": 1.3661926984786987, "learning_rate": 1.8204338180921865e-05, "loss": 0.0561, "step": 3261 }, { "epoch": 0.36202307009760426, "grad_norm": 0.28669050335884094, "learning_rate": 1.8200640867636186e-05, "loss": 0.0524, "step": 3264 }, { "epoch": 0.3623558118899734, "grad_norm": 0.7224727272987366, "learning_rate": 1.8196943554350507e-05, "loss": 0.0649, "step": 3267 }, { "epoch": 0.3626885536823425, "grad_norm": 0.3878195285797119, "learning_rate": 1.8193246241064828e-05, "loss": 0.0755, "step": 3270 }, { "epoch": 0.3630212954747116, "grad_norm": 0.5289884209632874, "learning_rate": 1.818954892777915e-05, "loss": 0.027, "step": 3273 }, { "epoch": 0.36335403726708076, "grad_norm": 0.8436374664306641, "learning_rate": 1.818585161449347e-05, "loss": 0.0478, "step": 3276 }, { "epoch": 0.36368677905944985, "grad_norm": 1.5234328508377075, "learning_rate": 1.818215430120779e-05, "loss": 0.1045, "step": 3279 }, { "epoch": 0.364019520851819, "grad_norm": 0.8626707792282104, "learning_rate": 1.817845698792211e-05, "loss": 0.0541, "step": 3282 }, { "epoch": 0.36435226264418813, "grad_norm": 1.5206340551376343, "learning_rate": 1.8174759674636432e-05, "loss": 0.0464, "step": 3285 }, { "epoch": 0.3646850044365572, "grad_norm": 0.8989534974098206, "learning_rate": 1.8171062361350753e-05, "loss": 0.0842, "step": 3288 }, { "epoch": 0.36501774622892635, "grad_norm": 1.474443793296814, "learning_rate": 1.8167365048065073e-05, "loss": 0.1028, "step": 3291 }, { "epoch": 0.3653504880212955, "grad_norm": 0.7845585942268372, "learning_rate": 1.8163667734779394e-05, "loss": 0.0611, "step": 3294 }, { "epoch": 0.3656832298136646, "grad_norm": 1.3180601596832275, "learning_rate": 1.8159970421493715e-05, "loss": 0.0968, "step": 3297 }, { "epoch": 0.3660159716060337, "grad_norm": 0.7438332438468933, "learning_rate": 1.815627310820804e-05, "loss": 0.0389, "step": 3300 }, { "epoch": 0.36634871339840286, "grad_norm": 0.6601050496101379, "learning_rate": 1.8152575794922357e-05, "loss": 0.0471, "step": 3303 }, { "epoch": 0.36668145519077194, "grad_norm": 1.1433058977127075, "learning_rate": 1.8148878481636678e-05, "loss": 0.053, "step": 3306 }, { "epoch": 0.3670141969831411, "grad_norm": 0.49878254532814026, "learning_rate": 1.8145181168351002e-05, "loss": 0.0934, "step": 3309 }, { "epoch": 0.3673469387755102, "grad_norm": 0.36676275730133057, "learning_rate": 1.814148385506532e-05, "loss": 0.0285, "step": 3312 }, { "epoch": 0.3676796805678793, "grad_norm": 0.2649681568145752, "learning_rate": 1.813778654177964e-05, "loss": 0.0771, "step": 3315 }, { "epoch": 0.36801242236024845, "grad_norm": 0.8773745894432068, "learning_rate": 1.8134089228493964e-05, "loss": 0.0556, "step": 3318 }, { "epoch": 0.3683451641526176, "grad_norm": 0.5074605345726013, "learning_rate": 1.813039191520828e-05, "loss": 0.0585, "step": 3321 }, { "epoch": 0.3686779059449867, "grad_norm": 0.36075180768966675, "learning_rate": 1.8126694601922606e-05, "loss": 0.0657, "step": 3324 }, { "epoch": 0.3690106477373558, "grad_norm": 0.622616708278656, "learning_rate": 1.8122997288636927e-05, "loss": 0.0362, "step": 3327 }, { "epoch": 0.36934338952972495, "grad_norm": 0.7218829393386841, "learning_rate": 1.8119299975351244e-05, "loss": 0.0458, "step": 3330 }, { "epoch": 0.36967613132209404, "grad_norm": 0.23826220631599426, "learning_rate": 1.811560266206557e-05, "loss": 0.0495, "step": 3333 }, { "epoch": 0.3700088731144632, "grad_norm": 1.2963671684265137, "learning_rate": 1.811190534877989e-05, "loss": 0.1058, "step": 3336 }, { "epoch": 0.3703416149068323, "grad_norm": 3.397277593612671, "learning_rate": 1.8108208035494207e-05, "loss": 0.0788, "step": 3339 }, { "epoch": 0.3706743566992014, "grad_norm": 0.473672091960907, "learning_rate": 1.810451072220853e-05, "loss": 0.0585, "step": 3342 }, { "epoch": 0.37100709849157054, "grad_norm": 1.3728526830673218, "learning_rate": 1.810081340892285e-05, "loss": 0.1065, "step": 3345 }, { "epoch": 0.3713398402839397, "grad_norm": 0.6538196802139282, "learning_rate": 1.8097116095637172e-05, "loss": 0.0529, "step": 3348 }, { "epoch": 0.37167258207630877, "grad_norm": 0.5487240552902222, "learning_rate": 1.8093418782351493e-05, "loss": 0.062, "step": 3351 }, { "epoch": 0.3720053238686779, "grad_norm": 0.759758472442627, "learning_rate": 1.8089721469065814e-05, "loss": 0.0813, "step": 3354 }, { "epoch": 0.37233806566104705, "grad_norm": 0.6185902953147888, "learning_rate": 1.8086024155780135e-05, "loss": 0.0401, "step": 3357 }, { "epoch": 0.37267080745341613, "grad_norm": 0.39080774784088135, "learning_rate": 1.8082326842494456e-05, "loss": 0.0423, "step": 3360 }, { "epoch": 0.37300354924578527, "grad_norm": 0.2491219937801361, "learning_rate": 1.8078629529208777e-05, "loss": 0.0565, "step": 3363 }, { "epoch": 0.3733362910381544, "grad_norm": 0.8301553726196289, "learning_rate": 1.8074932215923097e-05, "loss": 0.0286, "step": 3366 }, { "epoch": 0.3736690328305235, "grad_norm": 0.36005425453186035, "learning_rate": 1.8071234902637418e-05, "loss": 0.0681, "step": 3369 }, { "epoch": 0.37400177462289264, "grad_norm": 0.5035695433616638, "learning_rate": 1.806753758935174e-05, "loss": 0.0539, "step": 3372 }, { "epoch": 0.3743345164152618, "grad_norm": 0.30439072847366333, "learning_rate": 1.806384027606606e-05, "loss": 0.0215, "step": 3375 }, { "epoch": 0.37466725820763086, "grad_norm": 1.0396480560302734, "learning_rate": 1.806014296278038e-05, "loss": 0.0495, "step": 3378 }, { "epoch": 0.375, "grad_norm": 0.5580437779426575, "learning_rate": 1.80564456494947e-05, "loss": 0.04, "step": 3381 }, { "epoch": 0.37533274179236914, "grad_norm": 0.8583841919898987, "learning_rate": 1.8052748336209022e-05, "loss": 0.0649, "step": 3384 }, { "epoch": 0.3756654835847382, "grad_norm": 1.7994314432144165, "learning_rate": 1.8049051022923343e-05, "loss": 0.1237, "step": 3387 }, { "epoch": 0.37599822537710736, "grad_norm": 1.1027811765670776, "learning_rate": 1.8045353709637664e-05, "loss": 0.059, "step": 3390 }, { "epoch": 0.3763309671694765, "grad_norm": 0.7907701730728149, "learning_rate": 1.8041656396351985e-05, "loss": 0.0487, "step": 3393 }, { "epoch": 0.3766637089618456, "grad_norm": 1.1048544645309448, "learning_rate": 1.803795908306631e-05, "loss": 0.1057, "step": 3396 }, { "epoch": 0.37699645075421473, "grad_norm": 2.0303831100463867, "learning_rate": 1.8034261769780626e-05, "loss": 0.0556, "step": 3399 }, { "epoch": 0.37732919254658387, "grad_norm": 0.7282113432884216, "learning_rate": 1.8030564456494947e-05, "loss": 0.0783, "step": 3402 }, { "epoch": 0.37766193433895295, "grad_norm": 0.8433974981307983, "learning_rate": 1.802686714320927e-05, "loss": 0.0707, "step": 3405 }, { "epoch": 0.3779946761313221, "grad_norm": 0.3144012689590454, "learning_rate": 1.802316982992359e-05, "loss": 0.0371, "step": 3408 }, { "epoch": 0.37832741792369123, "grad_norm": 0.6913673877716064, "learning_rate": 1.801947251663791e-05, "loss": 0.0435, "step": 3411 }, { "epoch": 0.3786601597160603, "grad_norm": 0.9416883587837219, "learning_rate": 1.8015775203352234e-05, "loss": 0.0542, "step": 3414 }, { "epoch": 0.37899290150842946, "grad_norm": 0.9501073956489563, "learning_rate": 1.801207789006655e-05, "loss": 0.0887, "step": 3417 }, { "epoch": 0.3793256433007986, "grad_norm": 1.200939655303955, "learning_rate": 1.8008380576780876e-05, "loss": 0.0511, "step": 3420 }, { "epoch": 0.3796583850931677, "grad_norm": 0.7048559784889221, "learning_rate": 1.8004683263495196e-05, "loss": 0.0509, "step": 3423 }, { "epoch": 0.3799911268855368, "grad_norm": 0.7115128040313721, "learning_rate": 1.8000985950209514e-05, "loss": 0.0371, "step": 3426 }, { "epoch": 0.38032386867790596, "grad_norm": 0.16066934168338776, "learning_rate": 1.7997288636923838e-05, "loss": 0.0209, "step": 3429 }, { "epoch": 0.38065661047027505, "grad_norm": 1.423419713973999, "learning_rate": 1.799359132363816e-05, "loss": 0.0786, "step": 3432 }, { "epoch": 0.3809893522626442, "grad_norm": 0.6574434041976929, "learning_rate": 1.7989894010352476e-05, "loss": 0.0472, "step": 3435 }, { "epoch": 0.3813220940550133, "grad_norm": 0.3235493302345276, "learning_rate": 1.79861966970668e-05, "loss": 0.0486, "step": 3438 }, { "epoch": 0.3816548358473824, "grad_norm": 0.446844220161438, "learning_rate": 1.798249938378112e-05, "loss": 0.0572, "step": 3441 }, { "epoch": 0.38198757763975155, "grad_norm": 1.4328289031982422, "learning_rate": 1.7978802070495442e-05, "loss": 0.067, "step": 3444 }, { "epoch": 0.3823203194321207, "grad_norm": 0.9448025822639465, "learning_rate": 1.7975104757209763e-05, "loss": 0.0851, "step": 3447 }, { "epoch": 0.3826530612244898, "grad_norm": 0.5035567879676819, "learning_rate": 1.7971407443924084e-05, "loss": 0.0822, "step": 3450 }, { "epoch": 0.3829858030168589, "grad_norm": 0.3910475969314575, "learning_rate": 1.7967710130638405e-05, "loss": 0.0377, "step": 3453 }, { "epoch": 0.38331854480922806, "grad_norm": 0.8109917044639587, "learning_rate": 1.7964012817352725e-05, "loss": 0.0597, "step": 3456 }, { "epoch": 0.38365128660159714, "grad_norm": 0.7839170098304749, "learning_rate": 1.7960315504067046e-05, "loss": 0.0678, "step": 3459 }, { "epoch": 0.3839840283939663, "grad_norm": 0.4514918923377991, "learning_rate": 1.7956618190781367e-05, "loss": 0.0516, "step": 3462 }, { "epoch": 0.3843167701863354, "grad_norm": 0.45707622170448303, "learning_rate": 1.7952920877495688e-05, "loss": 0.0749, "step": 3465 }, { "epoch": 0.3846495119787045, "grad_norm": 0.7012194991111755, "learning_rate": 1.794922356421001e-05, "loss": 0.0521, "step": 3468 }, { "epoch": 0.38498225377107365, "grad_norm": 0.7776585817337036, "learning_rate": 1.794552625092433e-05, "loss": 0.0767, "step": 3471 }, { "epoch": 0.3853149955634428, "grad_norm": 0.6300773024559021, "learning_rate": 1.794182893763865e-05, "loss": 0.0866, "step": 3474 }, { "epoch": 0.38564773735581187, "grad_norm": 0.6011560559272766, "learning_rate": 1.793813162435297e-05, "loss": 0.0596, "step": 3477 }, { "epoch": 0.385980479148181, "grad_norm": 0.5219936370849609, "learning_rate": 1.7934434311067292e-05, "loss": 0.0458, "step": 3480 }, { "epoch": 0.38631322094055015, "grad_norm": 0.7272171378135681, "learning_rate": 1.7930736997781613e-05, "loss": 0.0465, "step": 3483 }, { "epoch": 0.38664596273291924, "grad_norm": 0.9067965149879456, "learning_rate": 1.7927039684495934e-05, "loss": 0.0579, "step": 3486 }, { "epoch": 0.3869787045252884, "grad_norm": 0.7664728164672852, "learning_rate": 1.7923342371210254e-05, "loss": 0.0623, "step": 3489 }, { "epoch": 0.3873114463176575, "grad_norm": 0.8000106811523438, "learning_rate": 1.791964505792458e-05, "loss": 0.069, "step": 3492 }, { "epoch": 0.3876441881100266, "grad_norm": 0.9728484153747559, "learning_rate": 1.7915947744638896e-05, "loss": 0.0467, "step": 3495 }, { "epoch": 0.38797692990239574, "grad_norm": 0.9625365138053894, "learning_rate": 1.7912250431353217e-05, "loss": 0.0637, "step": 3498 }, { "epoch": 0.3883096716947649, "grad_norm": 0.8005168437957764, "learning_rate": 1.790855311806754e-05, "loss": 0.0386, "step": 3501 }, { "epoch": 0.38864241348713396, "grad_norm": 0.8739514946937561, "learning_rate": 1.790485580478186e-05, "loss": 0.0556, "step": 3504 }, { "epoch": 0.3889751552795031, "grad_norm": 0.6235243678092957, "learning_rate": 1.790115849149618e-05, "loss": 0.0429, "step": 3507 }, { "epoch": 0.38930789707187224, "grad_norm": 1.2025806903839111, "learning_rate": 1.7897461178210504e-05, "loss": 0.0393, "step": 3510 }, { "epoch": 0.38964063886424133, "grad_norm": 1.1207146644592285, "learning_rate": 1.789376386492482e-05, "loss": 0.065, "step": 3513 }, { "epoch": 0.38997338065661047, "grad_norm": 0.7098045349121094, "learning_rate": 1.7890066551639145e-05, "loss": 0.0565, "step": 3516 }, { "epoch": 0.3903061224489796, "grad_norm": 0.6282660961151123, "learning_rate": 1.7886369238353466e-05, "loss": 0.0489, "step": 3519 }, { "epoch": 0.3906388642413487, "grad_norm": 0.44596895575523376, "learning_rate": 1.7882671925067783e-05, "loss": 0.0286, "step": 3522 }, { "epoch": 0.39097160603371783, "grad_norm": 1.2647145986557007, "learning_rate": 1.7878974611782108e-05, "loss": 0.0794, "step": 3525 }, { "epoch": 0.391304347826087, "grad_norm": 0.6745908856391907, "learning_rate": 1.787527729849643e-05, "loss": 0.0684, "step": 3528 }, { "epoch": 0.39163708961845606, "grad_norm": 1.2374485731124878, "learning_rate": 1.7871579985210746e-05, "loss": 0.0616, "step": 3531 }, { "epoch": 0.3919698314108252, "grad_norm": 0.638905942440033, "learning_rate": 1.786788267192507e-05, "loss": 0.0353, "step": 3534 }, { "epoch": 0.39230257320319434, "grad_norm": 0.557572603225708, "learning_rate": 1.786418535863939e-05, "loss": 0.0371, "step": 3537 }, { "epoch": 0.3926353149955634, "grad_norm": 1.206235408782959, "learning_rate": 1.7860488045353712e-05, "loss": 0.064, "step": 3540 }, { "epoch": 0.39296805678793256, "grad_norm": 0.9012829065322876, "learning_rate": 1.7856790732068033e-05, "loss": 0.0707, "step": 3543 }, { "epoch": 0.3933007985803017, "grad_norm": 0.9433451294898987, "learning_rate": 1.7853093418782353e-05, "loss": 0.0712, "step": 3546 }, { "epoch": 0.3936335403726708, "grad_norm": 0.4888119399547577, "learning_rate": 1.7849396105496674e-05, "loss": 0.0408, "step": 3549 }, { "epoch": 0.3939662821650399, "grad_norm": 0.5229927897453308, "learning_rate": 1.7845698792210995e-05, "loss": 0.048, "step": 3552 }, { "epoch": 0.39429902395740907, "grad_norm": 0.7366634011268616, "learning_rate": 1.7842001478925316e-05, "loss": 0.0715, "step": 3555 }, { "epoch": 0.39463176574977815, "grad_norm": 1.197596788406372, "learning_rate": 1.7838304165639637e-05, "loss": 0.0699, "step": 3558 }, { "epoch": 0.3949645075421473, "grad_norm": 0.4324573278427124, "learning_rate": 1.7834606852353957e-05, "loss": 0.0486, "step": 3561 }, { "epoch": 0.39529724933451643, "grad_norm": 0.5101915001869202, "learning_rate": 1.7830909539068278e-05, "loss": 0.0767, "step": 3564 }, { "epoch": 0.3956299911268855, "grad_norm": 1.3277356624603271, "learning_rate": 1.78272122257826e-05, "loss": 0.1211, "step": 3567 }, { "epoch": 0.39596273291925466, "grad_norm": 0.5068720579147339, "learning_rate": 1.782351491249692e-05, "loss": 0.0621, "step": 3570 }, { "epoch": 0.3962954747116238, "grad_norm": 0.7046349048614502, "learning_rate": 1.781981759921124e-05, "loss": 0.0546, "step": 3573 }, { "epoch": 0.3966282165039929, "grad_norm": 1.3062855005264282, "learning_rate": 1.781612028592556e-05, "loss": 0.0908, "step": 3576 }, { "epoch": 0.396960958296362, "grad_norm": 0.766740620136261, "learning_rate": 1.7812422972639882e-05, "loss": 0.0566, "step": 3579 }, { "epoch": 0.39729370008873116, "grad_norm": 1.7100414037704468, "learning_rate": 1.7808725659354203e-05, "loss": 0.0375, "step": 3582 }, { "epoch": 0.39762644188110025, "grad_norm": 0.33701378107070923, "learning_rate": 1.7805028346068524e-05, "loss": 0.0377, "step": 3585 }, { "epoch": 0.3979591836734694, "grad_norm": 0.5611671209335327, "learning_rate": 1.7801331032782848e-05, "loss": 0.0536, "step": 3588 }, { "epoch": 0.3982919254658385, "grad_norm": 1.0105739831924438, "learning_rate": 1.7797633719497166e-05, "loss": 0.0997, "step": 3591 }, { "epoch": 0.3986246672582076, "grad_norm": 0.6941092610359192, "learning_rate": 1.7793936406211486e-05, "loss": 0.057, "step": 3594 }, { "epoch": 0.39895740905057675, "grad_norm": 0.4937894940376282, "learning_rate": 1.779023909292581e-05, "loss": 0.0686, "step": 3597 }, { "epoch": 0.3992901508429459, "grad_norm": 0.6589914560317993, "learning_rate": 1.7786541779640128e-05, "loss": 0.0546, "step": 3600 }, { "epoch": 0.399622892635315, "grad_norm": 0.8047364354133606, "learning_rate": 1.778284446635445e-05, "loss": 0.061, "step": 3603 }, { "epoch": 0.3999556344276841, "grad_norm": 0.5173530578613281, "learning_rate": 1.7779147153068773e-05, "loss": 0.0637, "step": 3606 }, { "epoch": 0.40028837622005325, "grad_norm": 0.527626633644104, "learning_rate": 1.777544983978309e-05, "loss": 0.0475, "step": 3609 }, { "epoch": 0.40062111801242234, "grad_norm": 0.6563243865966797, "learning_rate": 1.777175252649741e-05, "loss": 0.0534, "step": 3612 }, { "epoch": 0.4009538598047915, "grad_norm": 0.9174769520759583, "learning_rate": 1.7768055213211736e-05, "loss": 0.057, "step": 3615 }, { "epoch": 0.4012866015971606, "grad_norm": 0.9062615633010864, "learning_rate": 1.7764357899926053e-05, "loss": 0.0469, "step": 3618 }, { "epoch": 0.4016193433895297, "grad_norm": 0.5150889754295349, "learning_rate": 1.7760660586640377e-05, "loss": 0.0358, "step": 3621 }, { "epoch": 0.40195208518189884, "grad_norm": 0.14240069687366486, "learning_rate": 1.7756963273354698e-05, "loss": 0.0297, "step": 3624 }, { "epoch": 0.402284826974268, "grad_norm": 1.0859086513519287, "learning_rate": 1.7753265960069016e-05, "loss": 0.1202, "step": 3627 }, { "epoch": 0.40261756876663707, "grad_norm": 0.9697964787483215, "learning_rate": 1.774956864678334e-05, "loss": 0.0544, "step": 3630 }, { "epoch": 0.4029503105590062, "grad_norm": 0.6184844374656677, "learning_rate": 1.774587133349766e-05, "loss": 0.0485, "step": 3633 }, { "epoch": 0.40328305235137535, "grad_norm": 0.715046763420105, "learning_rate": 1.7742174020211978e-05, "loss": 0.0708, "step": 3636 }, { "epoch": 0.40361579414374443, "grad_norm": 0.5666434168815613, "learning_rate": 1.7738476706926302e-05, "loss": 0.049, "step": 3639 }, { "epoch": 0.4039485359361136, "grad_norm": 0.6715440154075623, "learning_rate": 1.7734779393640623e-05, "loss": 0.0794, "step": 3642 }, { "epoch": 0.4042812777284827, "grad_norm": 0.8222674131393433, "learning_rate": 1.7731082080354944e-05, "loss": 0.0496, "step": 3645 }, { "epoch": 0.4046140195208518, "grad_norm": 0.6746175289154053, "learning_rate": 1.7727384767069265e-05, "loss": 0.0484, "step": 3648 }, { "epoch": 0.40494676131322094, "grad_norm": 1.5096570253372192, "learning_rate": 1.7723687453783585e-05, "loss": 0.0894, "step": 3651 }, { "epoch": 0.4052795031055901, "grad_norm": 0.9180783033370972, "learning_rate": 1.7719990140497906e-05, "loss": 0.1087, "step": 3654 }, { "epoch": 0.40561224489795916, "grad_norm": 0.7353931665420532, "learning_rate": 1.7716292827212227e-05, "loss": 0.0707, "step": 3657 }, { "epoch": 0.4059449866903283, "grad_norm": 0.4709693491458893, "learning_rate": 1.7712595513926548e-05, "loss": 0.0603, "step": 3660 }, { "epoch": 0.40627772848269744, "grad_norm": 0.7231876254081726, "learning_rate": 1.770889820064087e-05, "loss": 0.0537, "step": 3663 }, { "epoch": 0.4066104702750665, "grad_norm": 0.8927720189094543, "learning_rate": 1.770520088735519e-05, "loss": 0.0719, "step": 3666 }, { "epoch": 0.40694321206743567, "grad_norm": 0.5319972634315491, "learning_rate": 1.770150357406951e-05, "loss": 0.053, "step": 3669 }, { "epoch": 0.4072759538598048, "grad_norm": 1.1626750230789185, "learning_rate": 1.769780626078383e-05, "loss": 0.0872, "step": 3672 }, { "epoch": 0.4076086956521739, "grad_norm": 0.6986317038536072, "learning_rate": 1.7694108947498152e-05, "loss": 0.0429, "step": 3675 }, { "epoch": 0.40794143744454303, "grad_norm": 0.38828516006469727, "learning_rate": 1.7690411634212473e-05, "loss": 0.0725, "step": 3678 }, { "epoch": 0.40827417923691217, "grad_norm": 0.2901146411895752, "learning_rate": 1.7686714320926794e-05, "loss": 0.0737, "step": 3681 }, { "epoch": 0.40860692102928126, "grad_norm": 0.4641319513320923, "learning_rate": 1.7683017007641114e-05, "loss": 0.0456, "step": 3684 }, { "epoch": 0.4089396628216504, "grad_norm": 0.52308189868927, "learning_rate": 1.7679319694355435e-05, "loss": 0.0492, "step": 3687 }, { "epoch": 0.40927240461401954, "grad_norm": 1.0038647651672363, "learning_rate": 1.7675622381069756e-05, "loss": 0.0734, "step": 3690 }, { "epoch": 0.4096051464063886, "grad_norm": 0.45110708475112915, "learning_rate": 1.767192506778408e-05, "loss": 0.047, "step": 3693 }, { "epoch": 0.40993788819875776, "grad_norm": 1.0589408874511719, "learning_rate": 1.7668227754498398e-05, "loss": 0.0637, "step": 3696 }, { "epoch": 0.4102706299911269, "grad_norm": 0.6150688529014587, "learning_rate": 1.766453044121272e-05, "loss": 0.0415, "step": 3699 }, { "epoch": 0.410603371783496, "grad_norm": 0.34581321477890015, "learning_rate": 1.7660833127927043e-05, "loss": 0.0653, "step": 3702 }, { "epoch": 0.4109361135758651, "grad_norm": 0.5197736620903015, "learning_rate": 1.765713581464136e-05, "loss": 0.0597, "step": 3705 }, { "epoch": 0.41126885536823427, "grad_norm": 0.5862632989883423, "learning_rate": 1.765343850135568e-05, "loss": 0.0665, "step": 3708 }, { "epoch": 0.41160159716060335, "grad_norm": 0.8498165011405945, "learning_rate": 1.7649741188070005e-05, "loss": 0.028, "step": 3711 }, { "epoch": 0.4119343389529725, "grad_norm": 0.7075044512748718, "learning_rate": 1.7646043874784323e-05, "loss": 0.0665, "step": 3714 }, { "epoch": 0.41226708074534163, "grad_norm": 0.5417982339859009, "learning_rate": 1.7642346561498647e-05, "loss": 0.0537, "step": 3717 }, { "epoch": 0.4125998225377107, "grad_norm": 0.7758318781852722, "learning_rate": 1.7638649248212968e-05, "loss": 0.0343, "step": 3720 }, { "epoch": 0.41293256433007985, "grad_norm": 0.5769911408424377, "learning_rate": 1.763495193492729e-05, "loss": 0.0459, "step": 3723 }, { "epoch": 0.413265306122449, "grad_norm": 0.863990068435669, "learning_rate": 1.763125462164161e-05, "loss": 0.0506, "step": 3726 }, { "epoch": 0.4135980479148181, "grad_norm": 0.6523183584213257, "learning_rate": 1.762755730835593e-05, "loss": 0.081, "step": 3729 }, { "epoch": 0.4139307897071872, "grad_norm": 0.8907710313796997, "learning_rate": 1.762385999507025e-05, "loss": 0.0558, "step": 3732 }, { "epoch": 0.41426353149955636, "grad_norm": 1.1462241411209106, "learning_rate": 1.7620162681784572e-05, "loss": 0.0796, "step": 3735 }, { "epoch": 0.41459627329192544, "grad_norm": 0.5367715358734131, "learning_rate": 1.7616465368498893e-05, "loss": 0.0521, "step": 3738 }, { "epoch": 0.4149290150842946, "grad_norm": 0.8814707398414612, "learning_rate": 1.7612768055213213e-05, "loss": 0.0546, "step": 3741 }, { "epoch": 0.4152617568766637, "grad_norm": 0.3784535527229309, "learning_rate": 1.7609070741927534e-05, "loss": 0.0518, "step": 3744 }, { "epoch": 0.4155944986690328, "grad_norm": 1.1409324407577515, "learning_rate": 1.7605373428641855e-05, "loss": 0.0647, "step": 3747 }, { "epoch": 0.41592724046140195, "grad_norm": 0.39415717124938965, "learning_rate": 1.7601676115356176e-05, "loss": 0.0546, "step": 3750 }, { "epoch": 0.4162599822537711, "grad_norm": 0.3977600634098053, "learning_rate": 1.7597978802070497e-05, "loss": 0.0523, "step": 3753 }, { "epoch": 0.4165927240461402, "grad_norm": 0.6946219205856323, "learning_rate": 1.7594281488784818e-05, "loss": 0.0441, "step": 3756 }, { "epoch": 0.4169254658385093, "grad_norm": 0.7950144410133362, "learning_rate": 1.759058417549914e-05, "loss": 0.0486, "step": 3759 }, { "epoch": 0.41725820763087845, "grad_norm": 1.0804157257080078, "learning_rate": 1.758688686221346e-05, "loss": 0.05, "step": 3762 }, { "epoch": 0.41759094942324754, "grad_norm": 0.7784591317176819, "learning_rate": 1.758318954892778e-05, "loss": 0.0463, "step": 3765 }, { "epoch": 0.4179236912156167, "grad_norm": 0.6022929549217224, "learning_rate": 1.75794922356421e-05, "loss": 0.0783, "step": 3768 }, { "epoch": 0.4182564330079858, "grad_norm": 1.0312753915786743, "learning_rate": 1.757579492235642e-05, "loss": 0.0879, "step": 3771 }, { "epoch": 0.4185891748003549, "grad_norm": 0.9056363105773926, "learning_rate": 1.7572097609070742e-05, "loss": 0.0533, "step": 3774 }, { "epoch": 0.41892191659272404, "grad_norm": 0.5045592784881592, "learning_rate": 1.7568400295785063e-05, "loss": 0.0566, "step": 3777 }, { "epoch": 0.4192546583850932, "grad_norm": 0.8076795935630798, "learning_rate": 1.7564702982499384e-05, "loss": 0.0782, "step": 3780 }, { "epoch": 0.41958740017746227, "grad_norm": 0.7119938731193542, "learning_rate": 1.7561005669213705e-05, "loss": 0.0488, "step": 3783 }, { "epoch": 0.4199201419698314, "grad_norm": 0.34946173429489136, "learning_rate": 1.7557308355928026e-05, "loss": 0.0528, "step": 3786 }, { "epoch": 0.42025288376220055, "grad_norm": 0.6055382490158081, "learning_rate": 1.755361104264235e-05, "loss": 0.0324, "step": 3789 }, { "epoch": 0.42058562555456963, "grad_norm": 1.0226588249206543, "learning_rate": 1.7549913729356667e-05, "loss": 0.0591, "step": 3792 }, { "epoch": 0.42091836734693877, "grad_norm": 0.6380976438522339, "learning_rate": 1.7546216416070988e-05, "loss": 0.0709, "step": 3795 }, { "epoch": 0.4212511091393079, "grad_norm": 0.47581905126571655, "learning_rate": 1.7542519102785312e-05, "loss": 0.0453, "step": 3798 }, { "epoch": 0.421583850931677, "grad_norm": 0.938947319984436, "learning_rate": 1.753882178949963e-05, "loss": 0.0756, "step": 3801 }, { "epoch": 0.42191659272404614, "grad_norm": 0.8422492742538452, "learning_rate": 1.753512447621395e-05, "loss": 0.0612, "step": 3804 }, { "epoch": 0.4222493345164153, "grad_norm": 0.6790269017219543, "learning_rate": 1.7531427162928275e-05, "loss": 0.0323, "step": 3807 }, { "epoch": 0.42258207630878436, "grad_norm": 0.8498281240463257, "learning_rate": 1.7527729849642596e-05, "loss": 0.0496, "step": 3810 }, { "epoch": 0.4229148181011535, "grad_norm": 0.7459282279014587, "learning_rate": 1.7524032536356917e-05, "loss": 0.0712, "step": 3813 }, { "epoch": 0.42324755989352264, "grad_norm": 0.746533989906311, "learning_rate": 1.7520335223071237e-05, "loss": 0.0569, "step": 3816 }, { "epoch": 0.4235803016858917, "grad_norm": 0.8518890142440796, "learning_rate": 1.7516637909785558e-05, "loss": 0.0913, "step": 3819 }, { "epoch": 0.42391304347826086, "grad_norm": 0.5791594982147217, "learning_rate": 1.751294059649988e-05, "loss": 0.0224, "step": 3822 }, { "epoch": 0.42424578527063, "grad_norm": 0.480285108089447, "learning_rate": 1.75092432832142e-05, "loss": 0.077, "step": 3825 }, { "epoch": 0.4245785270629991, "grad_norm": 0.48974260687828064, "learning_rate": 1.750554596992852e-05, "loss": 0.0334, "step": 3828 }, { "epoch": 0.42491126885536823, "grad_norm": 1.1002445220947266, "learning_rate": 1.750184865664284e-05, "loss": 0.0609, "step": 3831 }, { "epoch": 0.42524401064773737, "grad_norm": 1.1830039024353027, "learning_rate": 1.7498151343357162e-05, "loss": 0.0612, "step": 3834 }, { "epoch": 0.42557675244010645, "grad_norm": 0.9262354969978333, "learning_rate": 1.7494454030071483e-05, "loss": 0.0772, "step": 3837 }, { "epoch": 0.4259094942324756, "grad_norm": 0.6812543272972107, "learning_rate": 1.7490756716785804e-05, "loss": 0.0421, "step": 3840 }, { "epoch": 0.42624223602484473, "grad_norm": 1.111580491065979, "learning_rate": 1.7487059403500125e-05, "loss": 0.0712, "step": 3843 }, { "epoch": 0.4265749778172138, "grad_norm": 0.4315716624259949, "learning_rate": 1.7483362090214446e-05, "loss": 0.0737, "step": 3846 }, { "epoch": 0.42690771960958296, "grad_norm": 0.9142162203788757, "learning_rate": 1.7479664776928766e-05, "loss": 0.071, "step": 3849 }, { "epoch": 0.4272404614019521, "grad_norm": 0.7653815150260925, "learning_rate": 1.7475967463643087e-05, "loss": 0.0646, "step": 3852 }, { "epoch": 0.4275732031943212, "grad_norm": 0.7537217736244202, "learning_rate": 1.7472270150357408e-05, "loss": 0.0934, "step": 3855 }, { "epoch": 0.4279059449866903, "grad_norm": 0.7603818774223328, "learning_rate": 1.746857283707173e-05, "loss": 0.062, "step": 3858 }, { "epoch": 0.42823868677905946, "grad_norm": 0.7106790542602539, "learning_rate": 1.746487552378605e-05, "loss": 0.0596, "step": 3861 }, { "epoch": 0.42857142857142855, "grad_norm": 0.9363869428634644, "learning_rate": 1.746117821050037e-05, "loss": 0.0993, "step": 3864 }, { "epoch": 0.4289041703637977, "grad_norm": 1.6872508525848389, "learning_rate": 1.745748089721469e-05, "loss": 0.0482, "step": 3867 }, { "epoch": 0.4292369121561668, "grad_norm": 0.4166531264781952, "learning_rate": 1.7453783583929012e-05, "loss": 0.053, "step": 3870 }, { "epoch": 0.4295696539485359, "grad_norm": 0.4297630488872528, "learning_rate": 1.7450086270643333e-05, "loss": 0.0539, "step": 3873 }, { "epoch": 0.42990239574090505, "grad_norm": 0.41438037157058716, "learning_rate": 1.7446388957357654e-05, "loss": 0.0548, "step": 3876 }, { "epoch": 0.4302351375332742, "grad_norm": 0.8310109376907349, "learning_rate": 1.7442691644071975e-05, "loss": 0.0568, "step": 3879 }, { "epoch": 0.4305678793256433, "grad_norm": 0.5064313411712646, "learning_rate": 1.7438994330786295e-05, "loss": 0.0396, "step": 3882 }, { "epoch": 0.4309006211180124, "grad_norm": 0.4814201891422272, "learning_rate": 1.743529701750062e-05, "loss": 0.056, "step": 3885 }, { "epoch": 0.43123336291038156, "grad_norm": 1.1425631046295166, "learning_rate": 1.743159970421494e-05, "loss": 0.054, "step": 3888 }, { "epoch": 0.43156610470275064, "grad_norm": 0.3250797688961029, "learning_rate": 1.7427902390929258e-05, "loss": 0.055, "step": 3891 }, { "epoch": 0.4318988464951198, "grad_norm": 0.4442632496356964, "learning_rate": 1.7424205077643582e-05, "loss": 0.0469, "step": 3894 }, { "epoch": 0.4322315882874889, "grad_norm": 0.9708724021911621, "learning_rate": 1.7420507764357903e-05, "loss": 0.0702, "step": 3897 }, { "epoch": 0.432564330079858, "grad_norm": 0.7600905895233154, "learning_rate": 1.741681045107222e-05, "loss": 0.0517, "step": 3900 }, { "epoch": 0.43289707187222715, "grad_norm": 0.7299237251281738, "learning_rate": 1.7413113137786545e-05, "loss": 0.0323, "step": 3903 }, { "epoch": 0.4332298136645963, "grad_norm": 1.6565771102905273, "learning_rate": 1.7409415824500865e-05, "loss": 0.0653, "step": 3906 }, { "epoch": 0.43356255545696537, "grad_norm": 0.6974082589149475, "learning_rate": 1.7405718511215186e-05, "loss": 0.058, "step": 3909 }, { "epoch": 0.4338952972493345, "grad_norm": 0.5380098223686218, "learning_rate": 1.7402021197929507e-05, "loss": 0.0513, "step": 3912 }, { "epoch": 0.43422803904170365, "grad_norm": 0.29043346643447876, "learning_rate": 1.7398323884643828e-05, "loss": 0.0266, "step": 3915 }, { "epoch": 0.43456078083407274, "grad_norm": 0.43383288383483887, "learning_rate": 1.739462657135815e-05, "loss": 0.0496, "step": 3918 }, { "epoch": 0.4348935226264419, "grad_norm": 0.7824199795722961, "learning_rate": 1.739092925807247e-05, "loss": 0.056, "step": 3921 }, { "epoch": 0.435226264418811, "grad_norm": 0.4503268897533417, "learning_rate": 1.738723194478679e-05, "loss": 0.034, "step": 3924 }, { "epoch": 0.4355590062111801, "grad_norm": 0.7555467486381531, "learning_rate": 1.738353463150111e-05, "loss": 0.0768, "step": 3927 }, { "epoch": 0.43589174800354924, "grad_norm": 0.8225175738334656, "learning_rate": 1.7379837318215432e-05, "loss": 0.0452, "step": 3930 }, { "epoch": 0.4362244897959184, "grad_norm": 0.3591526746749878, "learning_rate": 1.7376140004929753e-05, "loss": 0.0422, "step": 3933 }, { "epoch": 0.43655723158828746, "grad_norm": 0.8831102252006531, "learning_rate": 1.7372442691644074e-05, "loss": 0.0469, "step": 3936 }, { "epoch": 0.4368899733806566, "grad_norm": 0.5070654153823853, "learning_rate": 1.7368745378358394e-05, "loss": 0.0385, "step": 3939 }, { "epoch": 0.43722271517302574, "grad_norm": 0.36940258741378784, "learning_rate": 1.7365048065072715e-05, "loss": 0.0454, "step": 3942 }, { "epoch": 0.43755545696539483, "grad_norm": 0.64142245054245, "learning_rate": 1.7361350751787036e-05, "loss": 0.0607, "step": 3945 }, { "epoch": 0.43788819875776397, "grad_norm": 0.2682092487812042, "learning_rate": 1.7357653438501357e-05, "loss": 0.0386, "step": 3948 }, { "epoch": 0.4382209405501331, "grad_norm": 0.2845715582370758, "learning_rate": 1.7353956125215678e-05, "loss": 0.0485, "step": 3951 }, { "epoch": 0.4385536823425022, "grad_norm": 0.8124598860740662, "learning_rate": 1.735025881193e-05, "loss": 0.0307, "step": 3954 }, { "epoch": 0.43888642413487133, "grad_norm": 0.29823020100593567, "learning_rate": 1.734656149864432e-05, "loss": 0.0608, "step": 3957 }, { "epoch": 0.4392191659272405, "grad_norm": 0.5400733351707458, "learning_rate": 1.734286418535864e-05, "loss": 0.0284, "step": 3960 }, { "epoch": 0.43955190771960956, "grad_norm": 1.197150707244873, "learning_rate": 1.733916687207296e-05, "loss": 0.096, "step": 3963 }, { "epoch": 0.4398846495119787, "grad_norm": 0.30925944447517395, "learning_rate": 1.7335469558787282e-05, "loss": 0.0563, "step": 3966 }, { "epoch": 0.44021739130434784, "grad_norm": 0.6817483305931091, "learning_rate": 1.7331772245501603e-05, "loss": 0.0411, "step": 3969 }, { "epoch": 0.4405501330967169, "grad_norm": 0.7902812957763672, "learning_rate": 1.7328074932215923e-05, "loss": 0.0525, "step": 3972 }, { "epoch": 0.44088287488908606, "grad_norm": 0.37704625725746155, "learning_rate": 1.7324377618930248e-05, "loss": 0.065, "step": 3975 }, { "epoch": 0.4412156166814552, "grad_norm": 0.3233267068862915, "learning_rate": 1.7320680305644565e-05, "loss": 0.056, "step": 3978 }, { "epoch": 0.4415483584738243, "grad_norm": 1.2641096115112305, "learning_rate": 1.731698299235889e-05, "loss": 0.0663, "step": 3981 }, { "epoch": 0.4418811002661934, "grad_norm": 0.7362521290779114, "learning_rate": 1.731328567907321e-05, "loss": 0.0714, "step": 3984 }, { "epoch": 0.44221384205856257, "grad_norm": 0.6692554354667664, "learning_rate": 1.7309588365787527e-05, "loss": 0.0666, "step": 3987 }, { "epoch": 0.44254658385093165, "grad_norm": 0.6485722661018372, "learning_rate": 1.730589105250185e-05, "loss": 0.0429, "step": 3990 }, { "epoch": 0.4428793256433008, "grad_norm": 0.4360663592815399, "learning_rate": 1.7302193739216173e-05, "loss": 0.0346, "step": 3993 }, { "epoch": 0.44321206743566993, "grad_norm": 0.8531869649887085, "learning_rate": 1.729849642593049e-05, "loss": 0.0811, "step": 3996 }, { "epoch": 0.443544809228039, "grad_norm": 0.4099574387073517, "learning_rate": 1.7294799112644814e-05, "loss": 0.0768, "step": 3999 }, { "epoch": 0.44387755102040816, "grad_norm": 0.4004191756248474, "learning_rate": 1.7291101799359135e-05, "loss": 0.0487, "step": 4002 }, { "epoch": 0.4442102928127773, "grad_norm": 0.6145786046981812, "learning_rate": 1.7287404486073456e-05, "loss": 0.0707, "step": 4005 }, { "epoch": 0.4445430346051464, "grad_norm": 1.8027136325836182, "learning_rate": 1.7283707172787777e-05, "loss": 0.0543, "step": 4008 }, { "epoch": 0.4448757763975155, "grad_norm": 0.5146512389183044, "learning_rate": 1.7280009859502097e-05, "loss": 0.0488, "step": 4011 }, { "epoch": 0.44520851818988466, "grad_norm": 0.621516227722168, "learning_rate": 1.7276312546216418e-05, "loss": 0.0516, "step": 4014 }, { "epoch": 0.44554125998225375, "grad_norm": 0.43434441089630127, "learning_rate": 1.727261523293074e-05, "loss": 0.0881, "step": 4017 }, { "epoch": 0.4458740017746229, "grad_norm": 0.5239928364753723, "learning_rate": 1.726891791964506e-05, "loss": 0.0532, "step": 4020 }, { "epoch": 0.446206743566992, "grad_norm": 0.5276407599449158, "learning_rate": 1.726522060635938e-05, "loss": 0.0493, "step": 4023 }, { "epoch": 0.4465394853593611, "grad_norm": 0.45583435893058777, "learning_rate": 1.72615232930737e-05, "loss": 0.0265, "step": 4026 }, { "epoch": 0.44687222715173025, "grad_norm": 0.44824516773223877, "learning_rate": 1.7257825979788022e-05, "loss": 0.0551, "step": 4029 }, { "epoch": 0.4472049689440994, "grad_norm": 0.579571008682251, "learning_rate": 1.7254128666502343e-05, "loss": 0.0308, "step": 4032 }, { "epoch": 0.4475377107364685, "grad_norm": 0.344570130109787, "learning_rate": 1.7250431353216664e-05, "loss": 0.0266, "step": 4035 }, { "epoch": 0.4478704525288376, "grad_norm": 0.4176628589630127, "learning_rate": 1.7246734039930985e-05, "loss": 0.0316, "step": 4038 }, { "epoch": 0.44820319432120675, "grad_norm": 0.4543634355068207, "learning_rate": 1.7243036726645306e-05, "loss": 0.066, "step": 4041 }, { "epoch": 0.44853593611357584, "grad_norm": 0.8310489654541016, "learning_rate": 1.7239339413359626e-05, "loss": 0.0432, "step": 4044 }, { "epoch": 0.448868677905945, "grad_norm": 0.3702966272830963, "learning_rate": 1.7235642100073947e-05, "loss": 0.0475, "step": 4047 }, { "epoch": 0.4492014196983141, "grad_norm": 0.85706627368927, "learning_rate": 1.7231944786788268e-05, "loss": 0.0297, "step": 4050 }, { "epoch": 0.4495341614906832, "grad_norm": 0.7225920557975769, "learning_rate": 1.7228247473502592e-05, "loss": 0.0579, "step": 4053 }, { "epoch": 0.44986690328305234, "grad_norm": 0.32323360443115234, "learning_rate": 1.722455016021691e-05, "loss": 0.0558, "step": 4056 }, { "epoch": 0.4501996450754215, "grad_norm": 0.9024665951728821, "learning_rate": 1.722085284693123e-05, "loss": 0.0346, "step": 4059 }, { "epoch": 0.45053238686779057, "grad_norm": 0.584635853767395, "learning_rate": 1.7217155533645555e-05, "loss": 0.0334, "step": 4062 }, { "epoch": 0.4508651286601597, "grad_norm": 0.17207852005958557, "learning_rate": 1.7213458220359872e-05, "loss": 0.056, "step": 4065 }, { "epoch": 0.45119787045252885, "grad_norm": 1.446393609046936, "learning_rate": 1.7209760907074193e-05, "loss": 0.0517, "step": 4068 }, { "epoch": 0.45153061224489793, "grad_norm": 0.5318716764450073, "learning_rate": 1.7206063593788517e-05, "loss": 0.0423, "step": 4071 }, { "epoch": 0.4518633540372671, "grad_norm": 0.43053629994392395, "learning_rate": 1.7202366280502835e-05, "loss": 0.0644, "step": 4074 }, { "epoch": 0.4521960958296362, "grad_norm": 1.4116004705429077, "learning_rate": 1.7198668967217155e-05, "loss": 0.04, "step": 4077 }, { "epoch": 0.4525288376220053, "grad_norm": 0.7755500078201294, "learning_rate": 1.719497165393148e-05, "loss": 0.0697, "step": 4080 }, { "epoch": 0.45286157941437444, "grad_norm": 0.6876033544540405, "learning_rate": 1.7191274340645797e-05, "loss": 0.059, "step": 4083 }, { "epoch": 0.4531943212067436, "grad_norm": 0.7031375765800476, "learning_rate": 1.718757702736012e-05, "loss": 0.043, "step": 4086 }, { "epoch": 0.45352706299911266, "grad_norm": 1.4166208505630493, "learning_rate": 1.7183879714074442e-05, "loss": 0.0753, "step": 4089 }, { "epoch": 0.4538598047914818, "grad_norm": 0.6340405344963074, "learning_rate": 1.718018240078876e-05, "loss": 0.0456, "step": 4092 }, { "epoch": 0.45419254658385094, "grad_norm": 0.4632711410522461, "learning_rate": 1.7176485087503084e-05, "loss": 0.063, "step": 4095 }, { "epoch": 0.45452528837622, "grad_norm": 0.6647776365280151, "learning_rate": 1.7172787774217405e-05, "loss": 0.0788, "step": 4098 }, { "epoch": 0.45485803016858917, "grad_norm": 1.1480618715286255, "learning_rate": 1.7169090460931722e-05, "loss": 0.0832, "step": 4101 }, { "epoch": 0.4551907719609583, "grad_norm": 0.4367119371891022, "learning_rate": 1.7165393147646046e-05, "loss": 0.036, "step": 4104 }, { "epoch": 0.4555235137533274, "grad_norm": 0.7109994888305664, "learning_rate": 1.7161695834360367e-05, "loss": 0.0548, "step": 4107 }, { "epoch": 0.45585625554569653, "grad_norm": 0.3292393386363983, "learning_rate": 1.7157998521074688e-05, "loss": 0.0299, "step": 4110 }, { "epoch": 0.45618899733806567, "grad_norm": 0.6541246175765991, "learning_rate": 1.715430120778901e-05, "loss": 0.0242, "step": 4113 }, { "epoch": 0.45652173913043476, "grad_norm": 0.441533625125885, "learning_rate": 1.715060389450333e-05, "loss": 0.0215, "step": 4116 }, { "epoch": 0.4568544809228039, "grad_norm": 0.8239404559135437, "learning_rate": 1.714690658121765e-05, "loss": 0.0552, "step": 4119 }, { "epoch": 0.45718722271517304, "grad_norm": 0.8618807792663574, "learning_rate": 1.714320926793197e-05, "loss": 0.0667, "step": 4122 }, { "epoch": 0.4575199645075421, "grad_norm": 1.2294012308120728, "learning_rate": 1.7139511954646292e-05, "loss": 0.053, "step": 4125 }, { "epoch": 0.45785270629991126, "grad_norm": 0.7824467420578003, "learning_rate": 1.7135814641360613e-05, "loss": 0.0484, "step": 4128 }, { "epoch": 0.4581854480922804, "grad_norm": 0.6899873614311218, "learning_rate": 1.7132117328074934e-05, "loss": 0.0482, "step": 4131 }, { "epoch": 0.4585181898846495, "grad_norm": 0.33928051590919495, "learning_rate": 1.7128420014789254e-05, "loss": 0.0469, "step": 4134 }, { "epoch": 0.4588509316770186, "grad_norm": 0.5676553249359131, "learning_rate": 1.7124722701503575e-05, "loss": 0.0421, "step": 4137 }, { "epoch": 0.45918367346938777, "grad_norm": 0.8159649968147278, "learning_rate": 1.7121025388217896e-05, "loss": 0.0396, "step": 4140 }, { "epoch": 0.45951641526175685, "grad_norm": 0.9895645380020142, "learning_rate": 1.7117328074932217e-05, "loss": 0.0353, "step": 4143 }, { "epoch": 0.459849157054126, "grad_norm": 0.4628107249736786, "learning_rate": 1.7113630761646538e-05, "loss": 0.0504, "step": 4146 }, { "epoch": 0.46018189884649513, "grad_norm": 0.45586517453193665, "learning_rate": 1.710993344836086e-05, "loss": 0.078, "step": 4149 }, { "epoch": 0.4605146406388642, "grad_norm": 1.4054371118545532, "learning_rate": 1.710623613507518e-05, "loss": 0.0923, "step": 4152 }, { "epoch": 0.46084738243123335, "grad_norm": 0.5126363635063171, "learning_rate": 1.71025388217895e-05, "loss": 0.0511, "step": 4155 }, { "epoch": 0.4611801242236025, "grad_norm": 0.8845064043998718, "learning_rate": 1.7098841508503824e-05, "loss": 0.0807, "step": 4158 }, { "epoch": 0.4615128660159716, "grad_norm": 0.5239210724830627, "learning_rate": 1.7095144195218142e-05, "loss": 0.0785, "step": 4161 }, { "epoch": 0.4618456078083407, "grad_norm": 0.6703030467033386, "learning_rate": 1.7091446881932463e-05, "loss": 0.0499, "step": 4164 }, { "epoch": 0.46217834960070986, "grad_norm": 0.7956408262252808, "learning_rate": 1.7087749568646787e-05, "loss": 0.0572, "step": 4167 }, { "epoch": 0.46251109139307894, "grad_norm": 0.16909432411193848, "learning_rate": 1.7084052255361104e-05, "loss": 0.0264, "step": 4170 }, { "epoch": 0.4628438331854481, "grad_norm": 1.266257882118225, "learning_rate": 1.7080354942075425e-05, "loss": 0.0599, "step": 4173 }, { "epoch": 0.4631765749778172, "grad_norm": 0.45268452167510986, "learning_rate": 1.707665762878975e-05, "loss": 0.0538, "step": 4176 }, { "epoch": 0.4635093167701863, "grad_norm": 0.34743937849998474, "learning_rate": 1.7072960315504067e-05, "loss": 0.0589, "step": 4179 }, { "epoch": 0.46384205856255545, "grad_norm": 0.7616028785705566, "learning_rate": 1.706926300221839e-05, "loss": 0.0669, "step": 4182 }, { "epoch": 0.4641748003549246, "grad_norm": 0.3596990406513214, "learning_rate": 1.7065565688932712e-05, "loss": 0.0426, "step": 4185 }, { "epoch": 0.4645075421472937, "grad_norm": 0.4241790473461151, "learning_rate": 1.706186837564703e-05, "loss": 0.0299, "step": 4188 }, { "epoch": 0.4648402839396628, "grad_norm": 0.4181368947029114, "learning_rate": 1.7058171062361353e-05, "loss": 0.0274, "step": 4191 }, { "epoch": 0.46517302573203195, "grad_norm": 1.1305770874023438, "learning_rate": 1.7054473749075674e-05, "loss": 0.0502, "step": 4194 }, { "epoch": 0.46550576752440104, "grad_norm": 0.41604679822921753, "learning_rate": 1.705077643578999e-05, "loss": 0.1074, "step": 4197 }, { "epoch": 0.4658385093167702, "grad_norm": 0.9158381223678589, "learning_rate": 1.7047079122504316e-05, "loss": 0.078, "step": 4200 }, { "epoch": 0.4661712511091393, "grad_norm": 0.9910063743591309, "learning_rate": 1.7043381809218637e-05, "loss": 0.0402, "step": 4203 }, { "epoch": 0.4665039929015084, "grad_norm": 0.5099816918373108, "learning_rate": 1.7039684495932958e-05, "loss": 0.0459, "step": 4206 }, { "epoch": 0.46683673469387754, "grad_norm": 0.4148958623409271, "learning_rate": 1.703598718264728e-05, "loss": 0.0665, "step": 4209 }, { "epoch": 0.4671694764862467, "grad_norm": 0.7965304255485535, "learning_rate": 1.70322898693616e-05, "loss": 0.0687, "step": 4212 }, { "epoch": 0.46750221827861577, "grad_norm": 0.7472852468490601, "learning_rate": 1.702859255607592e-05, "loss": 0.0576, "step": 4215 }, { "epoch": 0.4678349600709849, "grad_norm": 0.5487693548202515, "learning_rate": 1.702489524279024e-05, "loss": 0.0491, "step": 4218 }, { "epoch": 0.46816770186335405, "grad_norm": 0.44171181321144104, "learning_rate": 1.702119792950456e-05, "loss": 0.0617, "step": 4221 }, { "epoch": 0.46850044365572313, "grad_norm": 0.3679957389831543, "learning_rate": 1.7017500616218882e-05, "loss": 0.063, "step": 4224 }, { "epoch": 0.46883318544809227, "grad_norm": 0.3553660809993744, "learning_rate": 1.7013803302933203e-05, "loss": 0.0516, "step": 4227 }, { "epoch": 0.4691659272404614, "grad_norm": 0.917134165763855, "learning_rate": 1.7010105989647524e-05, "loss": 0.0311, "step": 4230 }, { "epoch": 0.46949866903283055, "grad_norm": 0.7419776916503906, "learning_rate": 1.7006408676361845e-05, "loss": 0.0357, "step": 4233 }, { "epoch": 0.46983141082519964, "grad_norm": 1.4156334400177002, "learning_rate": 1.7002711363076166e-05, "loss": 0.0987, "step": 4236 }, { "epoch": 0.4701641526175688, "grad_norm": 1.0254919528961182, "learning_rate": 1.6999014049790487e-05, "loss": 0.0513, "step": 4239 }, { "epoch": 0.4704968944099379, "grad_norm": 0.5675105452537537, "learning_rate": 1.6995316736504807e-05, "loss": 0.0402, "step": 4242 }, { "epoch": 0.470829636202307, "grad_norm": 0.6920408606529236, "learning_rate": 1.6991619423219128e-05, "loss": 0.0585, "step": 4245 }, { "epoch": 0.47116237799467614, "grad_norm": 0.34908902645111084, "learning_rate": 1.698792210993345e-05, "loss": 0.0281, "step": 4248 }, { "epoch": 0.4714951197870453, "grad_norm": 1.041985273361206, "learning_rate": 1.698422479664777e-05, "loss": 0.1062, "step": 4251 }, { "epoch": 0.47182786157941436, "grad_norm": 0.5273538827896118, "learning_rate": 1.6980527483362094e-05, "loss": 0.0412, "step": 4254 }, { "epoch": 0.4721606033717835, "grad_norm": 0.9612470269203186, "learning_rate": 1.697683017007641e-05, "loss": 0.0891, "step": 4257 }, { "epoch": 0.47249334516415264, "grad_norm": 0.21431420743465424, "learning_rate": 1.6973132856790732e-05, "loss": 0.0857, "step": 4260 }, { "epoch": 0.47282608695652173, "grad_norm": 0.5161914229393005, "learning_rate": 1.6969435543505056e-05, "loss": 0.0431, "step": 4263 }, { "epoch": 0.47315882874889087, "grad_norm": 1.100512981414795, "learning_rate": 1.6965738230219374e-05, "loss": 0.1019, "step": 4266 }, { "epoch": 0.47349157054126, "grad_norm": 0.2657117545604706, "learning_rate": 1.6962040916933695e-05, "loss": 0.0204, "step": 4269 }, { "epoch": 0.4738243123336291, "grad_norm": 0.40320703387260437, "learning_rate": 1.695834360364802e-05, "loss": 0.0487, "step": 4272 }, { "epoch": 0.47415705412599823, "grad_norm": 0.664795994758606, "learning_rate": 1.6954646290362336e-05, "loss": 0.0607, "step": 4275 }, { "epoch": 0.4744897959183674, "grad_norm": 0.5744611024856567, "learning_rate": 1.695094897707666e-05, "loss": 0.0572, "step": 4278 }, { "epoch": 0.47482253771073646, "grad_norm": 0.9518013000488281, "learning_rate": 1.694725166379098e-05, "loss": 0.0682, "step": 4281 }, { "epoch": 0.4751552795031056, "grad_norm": 0.911171019077301, "learning_rate": 1.69435543505053e-05, "loss": 0.0465, "step": 4284 }, { "epoch": 0.47548802129547474, "grad_norm": 0.8909227252006531, "learning_rate": 1.6939857037219623e-05, "loss": 0.0586, "step": 4287 }, { "epoch": 0.4758207630878438, "grad_norm": 0.4024156928062439, "learning_rate": 1.6936159723933944e-05, "loss": 0.0471, "step": 4290 }, { "epoch": 0.47615350488021296, "grad_norm": 0.5227215886116028, "learning_rate": 1.693246241064826e-05, "loss": 0.0761, "step": 4293 }, { "epoch": 0.4764862466725821, "grad_norm": 0.4246692359447479, "learning_rate": 1.6928765097362586e-05, "loss": 0.0368, "step": 4296 }, { "epoch": 0.4768189884649512, "grad_norm": 0.6288511157035828, "learning_rate": 1.6925067784076906e-05, "loss": 0.0243, "step": 4299 }, { "epoch": 0.47715173025732033, "grad_norm": 0.7316569685935974, "learning_rate": 1.6921370470791227e-05, "loss": 0.0485, "step": 4302 }, { "epoch": 0.47748447204968947, "grad_norm": 0.5508032441139221, "learning_rate": 1.6917673157505548e-05, "loss": 0.0472, "step": 4305 }, { "epoch": 0.47781721384205855, "grad_norm": 0.5348746180534363, "learning_rate": 1.691397584421987e-05, "loss": 0.0756, "step": 4308 }, { "epoch": 0.4781499556344277, "grad_norm": 0.5743087530136108, "learning_rate": 1.691027853093419e-05, "loss": 0.0805, "step": 4311 }, { "epoch": 0.47848269742679683, "grad_norm": 0.6120237112045288, "learning_rate": 1.690658121764851e-05, "loss": 0.0755, "step": 4314 }, { "epoch": 0.4788154392191659, "grad_norm": 0.5951755046844482, "learning_rate": 1.690288390436283e-05, "loss": 0.0446, "step": 4317 }, { "epoch": 0.47914818101153506, "grad_norm": 1.0142602920532227, "learning_rate": 1.6899186591077152e-05, "loss": 0.0555, "step": 4320 }, { "epoch": 0.4794809228039042, "grad_norm": 0.40237757563591003, "learning_rate": 1.6895489277791473e-05, "loss": 0.0456, "step": 4323 }, { "epoch": 0.4798136645962733, "grad_norm": 1.102583408355713, "learning_rate": 1.6891791964505794e-05, "loss": 0.0816, "step": 4326 }, { "epoch": 0.4801464063886424, "grad_norm": 1.007786512374878, "learning_rate": 1.6888094651220115e-05, "loss": 0.0394, "step": 4329 }, { "epoch": 0.48047914818101156, "grad_norm": 0.9899337291717529, "learning_rate": 1.6884397337934435e-05, "loss": 0.0678, "step": 4332 }, { "epoch": 0.48081188997338065, "grad_norm": 0.7935881018638611, "learning_rate": 1.6880700024648756e-05, "loss": 0.0676, "step": 4335 }, { "epoch": 0.4811446317657498, "grad_norm": 0.43865635991096497, "learning_rate": 1.6877002711363077e-05, "loss": 0.0408, "step": 4338 }, { "epoch": 0.4814773735581189, "grad_norm": 0.32203537225723267, "learning_rate": 1.6873305398077398e-05, "loss": 0.0276, "step": 4341 }, { "epoch": 0.481810115350488, "grad_norm": 0.7618623971939087, "learning_rate": 1.686960808479172e-05, "loss": 0.0839, "step": 4344 }, { "epoch": 0.48214285714285715, "grad_norm": 0.7254458069801331, "learning_rate": 1.686591077150604e-05, "loss": 0.0643, "step": 4347 }, { "epoch": 0.4824755989352263, "grad_norm": 0.7063652873039246, "learning_rate": 1.6862213458220364e-05, "loss": 0.0766, "step": 4350 }, { "epoch": 0.4828083407275954, "grad_norm": 0.39580485224723816, "learning_rate": 1.685851614493468e-05, "loss": 0.081, "step": 4353 }, { "epoch": 0.4831410825199645, "grad_norm": 0.4491705298423767, "learning_rate": 1.6854818831649002e-05, "loss": 0.0389, "step": 4356 }, { "epoch": 0.48347382431233366, "grad_norm": 0.41502249240875244, "learning_rate": 1.6851121518363326e-05, "loss": 0.0359, "step": 4359 }, { "epoch": 0.48380656610470274, "grad_norm": 0.25004854798316956, "learning_rate": 1.6847424205077644e-05, "loss": 0.0479, "step": 4362 }, { "epoch": 0.4841393078970719, "grad_norm": 1.1288716793060303, "learning_rate": 1.6843726891791964e-05, "loss": 0.0704, "step": 4365 }, { "epoch": 0.484472049689441, "grad_norm": 0.5114402770996094, "learning_rate": 1.684002957850629e-05, "loss": 0.0597, "step": 4368 }, { "epoch": 0.4848047914818101, "grad_norm": 0.7215884327888489, "learning_rate": 1.6836332265220606e-05, "loss": 0.0654, "step": 4371 }, { "epoch": 0.48513753327417924, "grad_norm": 0.939883291721344, "learning_rate": 1.683263495193493e-05, "loss": 0.0988, "step": 4374 }, { "epoch": 0.4854702750665484, "grad_norm": 0.3372933864593506, "learning_rate": 1.682893763864925e-05, "loss": 0.0271, "step": 4377 }, { "epoch": 0.48580301685891747, "grad_norm": 0.37368032336235046, "learning_rate": 1.682524032536357e-05, "loss": 0.0463, "step": 4380 }, { "epoch": 0.4861357586512866, "grad_norm": 0.4592452049255371, "learning_rate": 1.6821543012077893e-05, "loss": 0.0467, "step": 4383 }, { "epoch": 0.48646850044365575, "grad_norm": 0.5345008969306946, "learning_rate": 1.6817845698792214e-05, "loss": 0.0573, "step": 4386 }, { "epoch": 0.48680124223602483, "grad_norm": 0.9038109183311462, "learning_rate": 1.681414838550653e-05, "loss": 0.0753, "step": 4389 }, { "epoch": 0.487133984028394, "grad_norm": 1.6051522493362427, "learning_rate": 1.6810451072220855e-05, "loss": 0.0805, "step": 4392 }, { "epoch": 0.4874667258207631, "grad_norm": 0.4553673565387726, "learning_rate": 1.6806753758935176e-05, "loss": 0.0417, "step": 4395 }, { "epoch": 0.4877994676131322, "grad_norm": 0.520794689655304, "learning_rate": 1.6803056445649497e-05, "loss": 0.0767, "step": 4398 }, { "epoch": 0.48813220940550134, "grad_norm": 0.6564691066741943, "learning_rate": 1.6799359132363818e-05, "loss": 0.0431, "step": 4401 }, { "epoch": 0.4884649511978705, "grad_norm": 0.5053410530090332, "learning_rate": 1.679566181907814e-05, "loss": 0.0629, "step": 4404 }, { "epoch": 0.48879769299023956, "grad_norm": 1.1924248933792114, "learning_rate": 1.679196450579246e-05, "loss": 0.0444, "step": 4407 }, { "epoch": 0.4891304347826087, "grad_norm": 0.34134775400161743, "learning_rate": 1.678826719250678e-05, "loss": 0.0392, "step": 4410 }, { "epoch": 0.48946317657497784, "grad_norm": 0.4818028211593628, "learning_rate": 1.67845698792211e-05, "loss": 0.0651, "step": 4413 }, { "epoch": 0.4897959183673469, "grad_norm": 0.353455513715744, "learning_rate": 1.6780872565935422e-05, "loss": 0.0682, "step": 4416 }, { "epoch": 0.49012866015971607, "grad_norm": 0.5427457690238953, "learning_rate": 1.6777175252649743e-05, "loss": 0.0504, "step": 4419 }, { "epoch": 0.4904614019520852, "grad_norm": 0.6281574368476868, "learning_rate": 1.6773477939364063e-05, "loss": 0.0507, "step": 4422 }, { "epoch": 0.4907941437444543, "grad_norm": 1.3127552270889282, "learning_rate": 1.6769780626078384e-05, "loss": 0.0414, "step": 4425 }, { "epoch": 0.49112688553682343, "grad_norm": 0.4508342146873474, "learning_rate": 1.6766083312792705e-05, "loss": 0.0372, "step": 4428 }, { "epoch": 0.49145962732919257, "grad_norm": 0.1727321743965149, "learning_rate": 1.6762385999507026e-05, "loss": 0.0475, "step": 4431 }, { "epoch": 0.49179236912156166, "grad_norm": 1.5217121839523315, "learning_rate": 1.6758688686221347e-05, "loss": 0.0617, "step": 4434 }, { "epoch": 0.4921251109139308, "grad_norm": 1.155497670173645, "learning_rate": 1.6754991372935667e-05, "loss": 0.0706, "step": 4437 }, { "epoch": 0.49245785270629994, "grad_norm": 0.32947006821632385, "learning_rate": 1.6751294059649988e-05, "loss": 0.0515, "step": 4440 }, { "epoch": 0.492790594498669, "grad_norm": 0.37474411725997925, "learning_rate": 1.674759674636431e-05, "loss": 0.0875, "step": 4443 }, { "epoch": 0.49312333629103816, "grad_norm": 0.3087087869644165, "learning_rate": 1.6743899433078633e-05, "loss": 0.053, "step": 4446 }, { "epoch": 0.4934560780834073, "grad_norm": 0.713209331035614, "learning_rate": 1.674020211979295e-05, "loss": 0.0456, "step": 4449 }, { "epoch": 0.4937888198757764, "grad_norm": 0.7906778454780579, "learning_rate": 1.673650480650727e-05, "loss": 0.0682, "step": 4452 }, { "epoch": 0.4941215616681455, "grad_norm": 0.7002046704292297, "learning_rate": 1.6732807493221596e-05, "loss": 0.0556, "step": 4455 }, { "epoch": 0.49445430346051467, "grad_norm": 0.37907874584198, "learning_rate": 1.6729110179935913e-05, "loss": 0.0533, "step": 4458 }, { "epoch": 0.49478704525288375, "grad_norm": 1.3043681383132935, "learning_rate": 1.6725412866650234e-05, "loss": 0.0505, "step": 4461 }, { "epoch": 0.4951197870452529, "grad_norm": 2.7196834087371826, "learning_rate": 1.6721715553364558e-05, "loss": 0.077, "step": 4464 }, { "epoch": 0.49545252883762203, "grad_norm": 0.41927704215049744, "learning_rate": 1.6718018240078876e-05, "loss": 0.0435, "step": 4467 }, { "epoch": 0.4957852706299911, "grad_norm": 1.7294107675552368, "learning_rate": 1.67143209267932e-05, "loss": 0.0733, "step": 4470 }, { "epoch": 0.49611801242236025, "grad_norm": 0.32694271206855774, "learning_rate": 1.671062361350752e-05, "loss": 0.0384, "step": 4473 }, { "epoch": 0.4964507542147294, "grad_norm": 0.7975589036941528, "learning_rate": 1.6706926300221838e-05, "loss": 0.0492, "step": 4476 }, { "epoch": 0.4967834960070985, "grad_norm": 0.3793935775756836, "learning_rate": 1.6703228986936162e-05, "loss": 0.0573, "step": 4479 }, { "epoch": 0.4971162377994676, "grad_norm": 0.33683711290359497, "learning_rate": 1.6699531673650483e-05, "loss": 0.0451, "step": 4482 }, { "epoch": 0.49744897959183676, "grad_norm": 1.3698158264160156, "learning_rate": 1.66958343603648e-05, "loss": 0.06, "step": 4485 }, { "epoch": 0.49778172138420584, "grad_norm": 0.6425526738166809, "learning_rate": 1.6692137047079125e-05, "loss": 0.0792, "step": 4488 }, { "epoch": 0.498114463176575, "grad_norm": 0.7697923183441162, "learning_rate": 1.6688439733793446e-05, "loss": 0.0661, "step": 4491 }, { "epoch": 0.4984472049689441, "grad_norm": 0.3150743246078491, "learning_rate": 1.6684742420507766e-05, "loss": 0.0487, "step": 4494 }, { "epoch": 0.4987799467613132, "grad_norm": 0.7236104607582092, "learning_rate": 1.6681045107222087e-05, "loss": 0.0481, "step": 4497 }, { "epoch": 0.49911268855368235, "grad_norm": 0.463837593793869, "learning_rate": 1.6677347793936408e-05, "loss": 0.0402, "step": 4500 }, { "epoch": 0.4994454303460515, "grad_norm": 0.46632885932922363, "learning_rate": 1.667365048065073e-05, "loss": 0.0553, "step": 4503 }, { "epoch": 0.4997781721384206, "grad_norm": 0.33060044050216675, "learning_rate": 1.666995316736505e-05, "loss": 0.077, "step": 4506 }, { "epoch": 0.5001109139307897, "grad_norm": 0.42432937026023865, "learning_rate": 1.666625585407937e-05, "loss": 0.0654, "step": 4509 }, { "epoch": 0.5004436557231589, "grad_norm": 0.6062284111976624, "learning_rate": 1.666255854079369e-05, "loss": 0.0291, "step": 4512 }, { "epoch": 0.5007763975155279, "grad_norm": 0.5591434240341187, "learning_rate": 1.6658861227508012e-05, "loss": 0.0474, "step": 4515 }, { "epoch": 0.501109139307897, "grad_norm": 0.4713459014892578, "learning_rate": 1.6655163914222333e-05, "loss": 0.046, "step": 4518 }, { "epoch": 0.5014418811002662, "grad_norm": 0.6001019477844238, "learning_rate": 1.6651466600936654e-05, "loss": 0.0825, "step": 4521 }, { "epoch": 0.5017746228926353, "grad_norm": 0.752199649810791, "learning_rate": 1.6647769287650975e-05, "loss": 0.0644, "step": 4524 }, { "epoch": 0.5021073646850044, "grad_norm": 0.43398723006248474, "learning_rate": 1.6644071974365295e-05, "loss": 0.0276, "step": 4527 }, { "epoch": 0.5024401064773736, "grad_norm": 0.8522276282310486, "learning_rate": 1.6640374661079616e-05, "loss": 0.0666, "step": 4530 }, { "epoch": 0.5027728482697427, "grad_norm": 0.32082700729370117, "learning_rate": 1.6636677347793937e-05, "loss": 0.0517, "step": 4533 }, { "epoch": 0.5031055900621118, "grad_norm": 0.6898975372314453, "learning_rate": 1.6632980034508258e-05, "loss": 0.0503, "step": 4536 }, { "epoch": 0.503438331854481, "grad_norm": 0.5893958210945129, "learning_rate": 1.662928272122258e-05, "loss": 0.0522, "step": 4539 }, { "epoch": 0.50377107364685, "grad_norm": 0.5386446118354797, "learning_rate": 1.66255854079369e-05, "loss": 0.0688, "step": 4542 }, { "epoch": 0.5041038154392191, "grad_norm": 0.8277572393417358, "learning_rate": 1.662188809465122e-05, "loss": 0.0721, "step": 4545 }, { "epoch": 0.5044365572315883, "grad_norm": 0.8094577193260193, "learning_rate": 1.661819078136554e-05, "loss": 0.06, "step": 4548 }, { "epoch": 0.5047692990239574, "grad_norm": 0.4037778973579407, "learning_rate": 1.6614493468079865e-05, "loss": 0.0492, "step": 4551 }, { "epoch": 0.5051020408163265, "grad_norm": 0.6802642941474915, "learning_rate": 1.6610796154794183e-05, "loss": 0.065, "step": 4554 }, { "epoch": 0.5054347826086957, "grad_norm": 0.3808431327342987, "learning_rate": 1.6607098841508504e-05, "loss": 0.0366, "step": 4557 }, { "epoch": 0.5057675244010648, "grad_norm": 0.6629760265350342, "learning_rate": 1.6603401528222828e-05, "loss": 0.0749, "step": 4560 }, { "epoch": 0.5061002661934338, "grad_norm": 0.2528640031814575, "learning_rate": 1.6599704214937145e-05, "loss": 0.0532, "step": 4563 }, { "epoch": 0.506433007985803, "grad_norm": 0.34534114599227905, "learning_rate": 1.6596006901651466e-05, "loss": 0.0481, "step": 4566 }, { "epoch": 0.5067657497781721, "grad_norm": 0.5990370512008667, "learning_rate": 1.659230958836579e-05, "loss": 0.0711, "step": 4569 }, { "epoch": 0.5070984915705412, "grad_norm": 0.6130734086036682, "learning_rate": 1.6588612275080108e-05, "loss": 0.0324, "step": 4572 }, { "epoch": 0.5074312333629104, "grad_norm": 0.8493300676345825, "learning_rate": 1.6584914961794432e-05, "loss": 0.0655, "step": 4575 }, { "epoch": 0.5077639751552795, "grad_norm": 0.6660122871398926, "learning_rate": 1.6581217648508753e-05, "loss": 0.0789, "step": 4578 }, { "epoch": 0.5080967169476486, "grad_norm": 0.6729857325553894, "learning_rate": 1.657752033522307e-05, "loss": 0.0477, "step": 4581 }, { "epoch": 0.5084294587400178, "grad_norm": 0.8302450776100159, "learning_rate": 1.6573823021937394e-05, "loss": 0.0551, "step": 4584 }, { "epoch": 0.5087622005323869, "grad_norm": 0.36229178309440613, "learning_rate": 1.6570125708651715e-05, "loss": 0.0706, "step": 4587 }, { "epoch": 0.5090949423247559, "grad_norm": 0.6343546509742737, "learning_rate": 1.6566428395366033e-05, "loss": 0.1082, "step": 4590 }, { "epoch": 0.5094276841171251, "grad_norm": 0.3220469653606415, "learning_rate": 1.6562731082080357e-05, "loss": 0.0528, "step": 4593 }, { "epoch": 0.5097604259094942, "grad_norm": 0.4004788398742676, "learning_rate": 1.6559033768794678e-05, "loss": 0.059, "step": 4596 }, { "epoch": 0.5100931677018633, "grad_norm": 1.2969952821731567, "learning_rate": 1.6555336455509e-05, "loss": 0.0544, "step": 4599 }, { "epoch": 0.5104259094942325, "grad_norm": 0.5225270390510559, "learning_rate": 1.655163914222332e-05, "loss": 0.0772, "step": 4602 }, { "epoch": 0.5107586512866016, "grad_norm": 1.5033198595046997, "learning_rate": 1.654794182893764e-05, "loss": 0.1022, "step": 4605 }, { "epoch": 0.5110913930789707, "grad_norm": 0.40494832396507263, "learning_rate": 1.654424451565196e-05, "loss": 0.0725, "step": 4608 }, { "epoch": 0.5114241348713399, "grad_norm": 0.27577438950538635, "learning_rate": 1.6540547202366282e-05, "loss": 0.0391, "step": 4611 }, { "epoch": 0.511756876663709, "grad_norm": 0.23806490004062653, "learning_rate": 1.6536849889080603e-05, "loss": 0.0398, "step": 4614 }, { "epoch": 0.512089618456078, "grad_norm": 0.3740651607513428, "learning_rate": 1.6533152575794923e-05, "loss": 0.0713, "step": 4617 }, { "epoch": 0.5124223602484472, "grad_norm": 0.855127215385437, "learning_rate": 1.6529455262509244e-05, "loss": 0.096, "step": 4620 }, { "epoch": 0.5127551020408163, "grad_norm": 0.6967151761054993, "learning_rate": 1.6525757949223565e-05, "loss": 0.0571, "step": 4623 }, { "epoch": 0.5130878438331854, "grad_norm": 1.3236134052276611, "learning_rate": 1.6522060635937886e-05, "loss": 0.0435, "step": 4626 }, { "epoch": 0.5134205856255546, "grad_norm": 0.3151557445526123, "learning_rate": 1.6518363322652207e-05, "loss": 0.0496, "step": 4629 }, { "epoch": 0.5137533274179237, "grad_norm": 0.7181231379508972, "learning_rate": 1.6514666009366528e-05, "loss": 0.0656, "step": 4632 }, { "epoch": 0.5140860692102928, "grad_norm": 1.0765810012817383, "learning_rate": 1.651096869608085e-05, "loss": 0.0764, "step": 4635 }, { "epoch": 0.514418811002662, "grad_norm": 0.32394856214523315, "learning_rate": 1.650727138279517e-05, "loss": 0.039, "step": 4638 }, { "epoch": 0.514751552795031, "grad_norm": 0.3234981298446655, "learning_rate": 1.650357406950949e-05, "loss": 0.0515, "step": 4641 }, { "epoch": 0.5150842945874001, "grad_norm": 0.6237750053405762, "learning_rate": 1.649987675622381e-05, "loss": 0.0482, "step": 4644 }, { "epoch": 0.5154170363797693, "grad_norm": 0.9343238472938538, "learning_rate": 1.6496179442938135e-05, "loss": 0.0608, "step": 4647 }, { "epoch": 0.5157497781721384, "grad_norm": 1.1724766492843628, "learning_rate": 1.6492482129652452e-05, "loss": 0.0761, "step": 4650 }, { "epoch": 0.5160825199645075, "grad_norm": 0.450942724943161, "learning_rate": 1.6488784816366773e-05, "loss": 0.0216, "step": 4653 }, { "epoch": 0.5164152617568767, "grad_norm": 0.6015228033065796, "learning_rate": 1.6485087503081097e-05, "loss": 0.0684, "step": 4656 }, { "epoch": 0.5167480035492458, "grad_norm": 0.5950390696525574, "learning_rate": 1.6481390189795415e-05, "loss": 0.075, "step": 4659 }, { "epoch": 0.5170807453416149, "grad_norm": 0.7465001940727234, "learning_rate": 1.6477692876509736e-05, "loss": 0.0783, "step": 4662 }, { "epoch": 0.517413487133984, "grad_norm": 0.5842232704162598, "learning_rate": 1.647399556322406e-05, "loss": 0.0641, "step": 4665 }, { "epoch": 0.5177462289263531, "grad_norm": 0.674278199672699, "learning_rate": 1.6470298249938377e-05, "loss": 0.0592, "step": 4668 }, { "epoch": 0.5180789707187222, "grad_norm": 1.0941298007965088, "learning_rate": 1.64666009366527e-05, "loss": 0.0515, "step": 4671 }, { "epoch": 0.5184117125110914, "grad_norm": 0.585843026638031, "learning_rate": 1.6462903623367022e-05, "loss": 0.0457, "step": 4674 }, { "epoch": 0.5187444543034605, "grad_norm": 0.3578494191169739, "learning_rate": 1.645920631008134e-05, "loss": 0.0422, "step": 4677 }, { "epoch": 0.5190771960958296, "grad_norm": 0.739766538143158, "learning_rate": 1.6455508996795664e-05, "loss": 0.0605, "step": 4680 }, { "epoch": 0.5194099378881988, "grad_norm": 0.5677748918533325, "learning_rate": 1.6451811683509985e-05, "loss": 0.0586, "step": 4683 }, { "epoch": 0.5197426796805679, "grad_norm": 0.6721118092536926, "learning_rate": 1.6448114370224306e-05, "loss": 0.0527, "step": 4686 }, { "epoch": 0.520075421472937, "grad_norm": 0.5788212418556213, "learning_rate": 1.6444417056938627e-05, "loss": 0.0608, "step": 4689 }, { "epoch": 0.5204081632653061, "grad_norm": 0.4841180741786957, "learning_rate": 1.6440719743652947e-05, "loss": 0.0543, "step": 4692 }, { "epoch": 0.5207409050576752, "grad_norm": 0.5512898564338684, "learning_rate": 1.6437022430367268e-05, "loss": 0.0419, "step": 4695 }, { "epoch": 0.5210736468500443, "grad_norm": 0.6989451050758362, "learning_rate": 1.643332511708159e-05, "loss": 0.0867, "step": 4698 }, { "epoch": 0.5214063886424135, "grad_norm": 0.6068373918533325, "learning_rate": 1.642962780379591e-05, "loss": 0.0492, "step": 4701 }, { "epoch": 0.5217391304347826, "grad_norm": 0.4367235600948334, "learning_rate": 1.642593049051023e-05, "loss": 0.0416, "step": 4704 }, { "epoch": 0.5220718722271517, "grad_norm": 0.4920384883880615, "learning_rate": 1.642223317722455e-05, "loss": 0.0602, "step": 4707 }, { "epoch": 0.5224046140195209, "grad_norm": 0.7583115696907043, "learning_rate": 1.6418535863938872e-05, "loss": 0.0516, "step": 4710 }, { "epoch": 0.52273735581189, "grad_norm": 0.5330613851547241, "learning_rate": 1.6414838550653193e-05, "loss": 0.0361, "step": 4713 }, { "epoch": 0.523070097604259, "grad_norm": 1.353100299835205, "learning_rate": 1.6411141237367514e-05, "loss": 0.089, "step": 4716 }, { "epoch": 0.5234028393966282, "grad_norm": 0.5581251382827759, "learning_rate": 1.6407443924081835e-05, "loss": 0.0456, "step": 4719 }, { "epoch": 0.5237355811889973, "grad_norm": 0.3370274305343628, "learning_rate": 1.6403746610796156e-05, "loss": 0.07, "step": 4722 }, { "epoch": 0.5240683229813664, "grad_norm": 0.5331799387931824, "learning_rate": 1.6400049297510476e-05, "loss": 0.085, "step": 4725 }, { "epoch": 0.5244010647737356, "grad_norm": 0.4525289237499237, "learning_rate": 1.6396351984224797e-05, "loss": 0.0731, "step": 4728 }, { "epoch": 0.5247338065661047, "grad_norm": 0.9478143453598022, "learning_rate": 1.6392654670939118e-05, "loss": 0.0545, "step": 4731 }, { "epoch": 0.5250665483584738, "grad_norm": 1.1077148914337158, "learning_rate": 1.638895735765344e-05, "loss": 0.0822, "step": 4734 }, { "epoch": 0.525399290150843, "grad_norm": 0.7768875956535339, "learning_rate": 1.638526004436776e-05, "loss": 0.0515, "step": 4737 }, { "epoch": 0.525732031943212, "grad_norm": 0.7468367218971252, "learning_rate": 1.638156273108208e-05, "loss": 0.0452, "step": 4740 }, { "epoch": 0.5260647737355811, "grad_norm": 0.33763355016708374, "learning_rate": 1.6377865417796405e-05, "loss": 0.0318, "step": 4743 }, { "epoch": 0.5263975155279503, "grad_norm": 0.6392717361450195, "learning_rate": 1.6374168104510722e-05, "loss": 0.034, "step": 4746 }, { "epoch": 0.5267302573203194, "grad_norm": 0.7104130983352661, "learning_rate": 1.6370470791225043e-05, "loss": 0.0536, "step": 4749 }, { "epoch": 0.5270629991126885, "grad_norm": 0.9500771760940552, "learning_rate": 1.6366773477939367e-05, "loss": 0.1244, "step": 4752 }, { "epoch": 0.5273957409050577, "grad_norm": 0.4134586453437805, "learning_rate": 1.6363076164653685e-05, "loss": 0.0554, "step": 4755 }, { "epoch": 0.5277284826974268, "grad_norm": 0.8058443069458008, "learning_rate": 1.6359378851368005e-05, "loss": 0.0478, "step": 4758 }, { "epoch": 0.5280612244897959, "grad_norm": 0.5784439444541931, "learning_rate": 1.635568153808233e-05, "loss": 0.0578, "step": 4761 }, { "epoch": 0.5283939662821651, "grad_norm": 0.7656039595603943, "learning_rate": 1.6351984224796647e-05, "loss": 0.0618, "step": 4764 }, { "epoch": 0.5287267080745341, "grad_norm": 0.7759724855422974, "learning_rate": 1.634828691151097e-05, "loss": 0.0795, "step": 4767 }, { "epoch": 0.5290594498669032, "grad_norm": 1.4008173942565918, "learning_rate": 1.6344589598225292e-05, "loss": 0.0733, "step": 4770 }, { "epoch": 0.5293921916592724, "grad_norm": 0.7402316927909851, "learning_rate": 1.6340892284939613e-05, "loss": 0.0691, "step": 4773 }, { "epoch": 0.5297249334516415, "grad_norm": 0.401731014251709, "learning_rate": 1.6337194971653934e-05, "loss": 0.0407, "step": 4776 }, { "epoch": 0.5300576752440106, "grad_norm": 0.26191291213035583, "learning_rate": 1.6333497658368255e-05, "loss": 0.0244, "step": 4779 }, { "epoch": 0.5303904170363798, "grad_norm": 0.5628681778907776, "learning_rate": 1.6329800345082575e-05, "loss": 0.0305, "step": 4782 }, { "epoch": 0.5307231588287489, "grad_norm": 0.5555793046951294, "learning_rate": 1.6326103031796896e-05, "loss": 0.0818, "step": 4785 }, { "epoch": 0.531055900621118, "grad_norm": 1.3264458179473877, "learning_rate": 1.6322405718511217e-05, "loss": 0.0823, "step": 4788 }, { "epoch": 0.5313886424134872, "grad_norm": 0.5435878038406372, "learning_rate": 1.6318708405225538e-05, "loss": 0.0818, "step": 4791 }, { "epoch": 0.5317213842058562, "grad_norm": 0.5758127570152283, "learning_rate": 1.631501109193986e-05, "loss": 0.0596, "step": 4794 }, { "epoch": 0.5320541259982253, "grad_norm": 0.7821409106254578, "learning_rate": 1.631131377865418e-05, "loss": 0.0546, "step": 4797 }, { "epoch": 0.5323868677905945, "grad_norm": 0.5309866666793823, "learning_rate": 1.63076164653685e-05, "loss": 0.045, "step": 4800 }, { "epoch": 0.5327196095829636, "grad_norm": 0.5126593708992004, "learning_rate": 1.630391915208282e-05, "loss": 0.0507, "step": 4803 }, { "epoch": 0.5330523513753327, "grad_norm": 0.3532179892063141, "learning_rate": 1.6300221838797142e-05, "loss": 0.0555, "step": 4806 }, { "epoch": 0.5333850931677019, "grad_norm": 0.6672441959381104, "learning_rate": 1.6296524525511463e-05, "loss": 0.0782, "step": 4809 }, { "epoch": 0.533717834960071, "grad_norm": 0.3911055028438568, "learning_rate": 1.6292827212225784e-05, "loss": 0.0305, "step": 4812 }, { "epoch": 0.53405057675244, "grad_norm": 1.2656508684158325, "learning_rate": 1.6289129898940104e-05, "loss": 0.0629, "step": 4815 }, { "epoch": 0.5343833185448092, "grad_norm": 0.765901505947113, "learning_rate": 1.6285432585654425e-05, "loss": 0.0477, "step": 4818 }, { "epoch": 0.5347160603371783, "grad_norm": 1.0163873434066772, "learning_rate": 1.6281735272368746e-05, "loss": 0.0417, "step": 4821 }, { "epoch": 0.5350488021295474, "grad_norm": 0.6670400500297546, "learning_rate": 1.6278037959083067e-05, "loss": 0.0322, "step": 4824 }, { "epoch": 0.5353815439219166, "grad_norm": 1.1574643850326538, "learning_rate": 1.6274340645797388e-05, "loss": 0.069, "step": 4827 }, { "epoch": 0.5357142857142857, "grad_norm": 0.512347936630249, "learning_rate": 1.627064333251171e-05, "loss": 0.0606, "step": 4830 }, { "epoch": 0.5360470275066548, "grad_norm": 0.5061585307121277, "learning_rate": 1.626694601922603e-05, "loss": 0.0685, "step": 4833 }, { "epoch": 0.536379769299024, "grad_norm": 1.9586361646652222, "learning_rate": 1.626324870594035e-05, "loss": 0.0915, "step": 4836 }, { "epoch": 0.5367125110913931, "grad_norm": 0.26165178418159485, "learning_rate": 1.6259551392654674e-05, "loss": 0.0585, "step": 4839 }, { "epoch": 0.5370452528837621, "grad_norm": 0.8740687370300293, "learning_rate": 1.6255854079368992e-05, "loss": 0.0643, "step": 4842 }, { "epoch": 0.5373779946761313, "grad_norm": 0.6125316619873047, "learning_rate": 1.6252156766083313e-05, "loss": 0.051, "step": 4845 }, { "epoch": 0.5377107364685004, "grad_norm": 0.684438169002533, "learning_rate": 1.6248459452797637e-05, "loss": 0.0523, "step": 4848 }, { "epoch": 0.5380434782608695, "grad_norm": 0.9820858240127563, "learning_rate": 1.6244762139511958e-05, "loss": 0.0696, "step": 4851 }, { "epoch": 0.5383762200532387, "grad_norm": 0.2496250867843628, "learning_rate": 1.6241064826226275e-05, "loss": 0.038, "step": 4854 }, { "epoch": 0.5387089618456078, "grad_norm": 0.5023927688598633, "learning_rate": 1.62373675129406e-05, "loss": 0.0673, "step": 4857 }, { "epoch": 0.5390417036379769, "grad_norm": 0.700604259967804, "learning_rate": 1.623367019965492e-05, "loss": 0.0746, "step": 4860 }, { "epoch": 0.5393744454303461, "grad_norm": 0.7832975387573242, "learning_rate": 1.622997288636924e-05, "loss": 0.055, "step": 4863 }, { "epoch": 0.5397071872227152, "grad_norm": 0.5651887655258179, "learning_rate": 1.622627557308356e-05, "loss": 0.0742, "step": 4866 }, { "epoch": 0.5400399290150842, "grad_norm": 0.388367623090744, "learning_rate": 1.6222578259797883e-05, "loss": 0.0226, "step": 4869 }, { "epoch": 0.5403726708074534, "grad_norm": 0.3604065477848053, "learning_rate": 1.6218880946512203e-05, "loss": 0.0466, "step": 4872 }, { "epoch": 0.5407054125998225, "grad_norm": 0.6742284893989563, "learning_rate": 1.6215183633226524e-05, "loss": 0.0473, "step": 4875 }, { "epoch": 0.5410381543921916, "grad_norm": 0.2869873642921448, "learning_rate": 1.6211486319940845e-05, "loss": 0.0507, "step": 4878 }, { "epoch": 0.5413708961845608, "grad_norm": 0.2961215078830719, "learning_rate": 1.6207789006655166e-05, "loss": 0.0319, "step": 4881 }, { "epoch": 0.5417036379769299, "grad_norm": 0.3176356554031372, "learning_rate": 1.6204091693369487e-05, "loss": 0.0411, "step": 4884 }, { "epoch": 0.542036379769299, "grad_norm": 0.7113035321235657, "learning_rate": 1.6200394380083807e-05, "loss": 0.0951, "step": 4887 }, { "epoch": 0.5423691215616682, "grad_norm": 0.6850292086601257, "learning_rate": 1.6196697066798128e-05, "loss": 0.042, "step": 4890 }, { "epoch": 0.5427018633540373, "grad_norm": 0.4315359592437744, "learning_rate": 1.619299975351245e-05, "loss": 0.0402, "step": 4893 }, { "epoch": 0.5430346051464063, "grad_norm": 0.5258512496948242, "learning_rate": 1.618930244022677e-05, "loss": 0.062, "step": 4896 }, { "epoch": 0.5433673469387755, "grad_norm": 0.6225194334983826, "learning_rate": 1.618560512694109e-05, "loss": 0.0463, "step": 4899 }, { "epoch": 0.5437000887311446, "grad_norm": 0.34595754742622375, "learning_rate": 1.618190781365541e-05, "loss": 0.04, "step": 4902 }, { "epoch": 0.5440328305235137, "grad_norm": 0.1278652548789978, "learning_rate": 1.6178210500369732e-05, "loss": 0.0438, "step": 4905 }, { "epoch": 0.5443655723158829, "grad_norm": 0.4569694697856903, "learning_rate": 1.6174513187084053e-05, "loss": 0.071, "step": 4908 }, { "epoch": 0.544698314108252, "grad_norm": 0.40289801359176636, "learning_rate": 1.6170815873798374e-05, "loss": 0.0395, "step": 4911 }, { "epoch": 0.5450310559006211, "grad_norm": 2.128030300140381, "learning_rate": 1.6167118560512695e-05, "loss": 0.047, "step": 4914 }, { "epoch": 0.5453637976929903, "grad_norm": 1.447357177734375, "learning_rate": 1.6163421247227016e-05, "loss": 0.0715, "step": 4917 }, { "epoch": 0.5456965394853593, "grad_norm": 0.7226765751838684, "learning_rate": 1.6159723933941336e-05, "loss": 0.0429, "step": 4920 }, { "epoch": 0.5460292812777284, "grad_norm": 0.7456640601158142, "learning_rate": 1.6156026620655657e-05, "loss": 0.0667, "step": 4923 }, { "epoch": 0.5463620230700976, "grad_norm": 0.32164984941482544, "learning_rate": 1.6152329307369978e-05, "loss": 0.0325, "step": 4926 }, { "epoch": 0.5466947648624667, "grad_norm": 0.6369009613990784, "learning_rate": 1.61486319940843e-05, "loss": 0.0409, "step": 4929 }, { "epoch": 0.5470275066548358, "grad_norm": 0.36427557468414307, "learning_rate": 1.614493468079862e-05, "loss": 0.0457, "step": 4932 }, { "epoch": 0.547360248447205, "grad_norm": 0.4992024004459381, "learning_rate": 1.6141237367512944e-05, "loss": 0.1112, "step": 4935 }, { "epoch": 0.5476929902395741, "grad_norm": 0.7561984658241272, "learning_rate": 1.6137540054227265e-05, "loss": 0.0684, "step": 4938 }, { "epoch": 0.5480257320319432, "grad_norm": 0.5635733604431152, "learning_rate": 1.6133842740941582e-05, "loss": 0.0509, "step": 4941 }, { "epoch": 0.5483584738243124, "grad_norm": 0.809857964515686, "learning_rate": 1.6130145427655906e-05, "loss": 0.0371, "step": 4944 }, { "epoch": 0.5486912156166814, "grad_norm": 0.4593777656555176, "learning_rate": 1.6126448114370227e-05, "loss": 0.0426, "step": 4947 }, { "epoch": 0.5490239574090505, "grad_norm": 0.5917204022407532, "learning_rate": 1.6122750801084545e-05, "loss": 0.0556, "step": 4950 }, { "epoch": 0.5493566992014197, "grad_norm": 0.450695276260376, "learning_rate": 1.611905348779887e-05, "loss": 0.0883, "step": 4953 }, { "epoch": 0.5496894409937888, "grad_norm": 0.6929962038993835, "learning_rate": 1.611535617451319e-05, "loss": 0.0414, "step": 4956 }, { "epoch": 0.5500221827861579, "grad_norm": 0.5394061207771301, "learning_rate": 1.611165886122751e-05, "loss": 0.0429, "step": 4959 }, { "epoch": 0.5503549245785271, "grad_norm": 0.8818528056144714, "learning_rate": 1.610796154794183e-05, "loss": 0.0697, "step": 4962 }, { "epoch": 0.5506876663708962, "grad_norm": 0.4484509825706482, "learning_rate": 1.6104264234656152e-05, "loss": 0.0492, "step": 4965 }, { "epoch": 0.5510204081632653, "grad_norm": 0.8746159672737122, "learning_rate": 1.6100566921370473e-05, "loss": 0.0805, "step": 4968 }, { "epoch": 0.5513531499556344, "grad_norm": 0.7191565036773682, "learning_rate": 1.6096869608084794e-05, "loss": 0.0641, "step": 4971 }, { "epoch": 0.5516858917480035, "grad_norm": 0.508062481880188, "learning_rate": 1.6093172294799115e-05, "loss": 0.0648, "step": 4974 }, { "epoch": 0.5520186335403726, "grad_norm": 0.7924707531929016, "learning_rate": 1.6089474981513435e-05, "loss": 0.0941, "step": 4977 }, { "epoch": 0.5523513753327418, "grad_norm": 0.5493044257164001, "learning_rate": 1.6085777668227756e-05, "loss": 0.062, "step": 4980 }, { "epoch": 0.5526841171251109, "grad_norm": 0.7076014876365662, "learning_rate": 1.6082080354942077e-05, "loss": 0.0605, "step": 4983 }, { "epoch": 0.55301685891748, "grad_norm": 0.44455569982528687, "learning_rate": 1.6078383041656398e-05, "loss": 0.0296, "step": 4986 }, { "epoch": 0.5533496007098492, "grad_norm": 0.6161890029907227, "learning_rate": 1.607468572837072e-05, "loss": 0.0512, "step": 4989 }, { "epoch": 0.5536823425022183, "grad_norm": 0.7101277112960815, "learning_rate": 1.607098841508504e-05, "loss": 0.0641, "step": 4992 }, { "epoch": 0.5540150842945873, "grad_norm": 0.3453422784805298, "learning_rate": 1.606729110179936e-05, "loss": 0.0728, "step": 4995 }, { "epoch": 0.5543478260869565, "grad_norm": 0.6668671369552612, "learning_rate": 1.606359378851368e-05, "loss": 0.0512, "step": 4998 }, { "epoch": 0.5546805678793256, "grad_norm": 0.8219621181488037, "learning_rate": 1.6059896475228002e-05, "loss": 0.0428, "step": 5001 }, { "epoch": 0.5550133096716947, "grad_norm": 0.45358189940452576, "learning_rate": 1.6056199161942323e-05, "loss": 0.0467, "step": 5004 }, { "epoch": 0.5553460514640639, "grad_norm": 0.6637992262840271, "learning_rate": 1.6052501848656644e-05, "loss": 0.0426, "step": 5007 }, { "epoch": 0.555678793256433, "grad_norm": 0.32020580768585205, "learning_rate": 1.6048804535370964e-05, "loss": 0.0415, "step": 5010 }, { "epoch": 0.5560115350488021, "grad_norm": 1.4639726877212524, "learning_rate": 1.6045107222085285e-05, "loss": 0.0662, "step": 5013 }, { "epoch": 0.5563442768411713, "grad_norm": 0.4240921139717102, "learning_rate": 1.604140990879961e-05, "loss": 0.0708, "step": 5016 }, { "epoch": 0.5566770186335404, "grad_norm": 0.49903425574302673, "learning_rate": 1.6037712595513927e-05, "loss": 0.0432, "step": 5019 }, { "epoch": 0.5570097604259094, "grad_norm": 0.3559170067310333, "learning_rate": 1.6034015282228248e-05, "loss": 0.0272, "step": 5022 }, { "epoch": 0.5573425022182786, "grad_norm": 0.6159944534301758, "learning_rate": 1.6030317968942572e-05, "loss": 0.042, "step": 5025 }, { "epoch": 0.5576752440106477, "grad_norm": 0.3796788156032562, "learning_rate": 1.602662065565689e-05, "loss": 0.0558, "step": 5028 }, { "epoch": 0.5580079858030168, "grad_norm": 0.8426181077957153, "learning_rate": 1.602292334237121e-05, "loss": 0.0262, "step": 5031 }, { "epoch": 0.558340727595386, "grad_norm": 0.4741966128349304, "learning_rate": 1.6019226029085534e-05, "loss": 0.0399, "step": 5034 }, { "epoch": 0.5586734693877551, "grad_norm": 0.29703524708747864, "learning_rate": 1.6015528715799852e-05, "loss": 0.0318, "step": 5037 }, { "epoch": 0.5590062111801242, "grad_norm": 0.541343092918396, "learning_rate": 1.6011831402514176e-05, "loss": 0.0636, "step": 5040 }, { "epoch": 0.5593389529724934, "grad_norm": 1.204793095588684, "learning_rate": 1.6008134089228497e-05, "loss": 0.0721, "step": 5043 }, { "epoch": 0.5596716947648624, "grad_norm": 0.931162416934967, "learning_rate": 1.6004436775942814e-05, "loss": 0.0678, "step": 5046 }, { "epoch": 0.5600044365572315, "grad_norm": 0.36315762996673584, "learning_rate": 1.600073946265714e-05, "loss": 0.0631, "step": 5049 }, { "epoch": 0.5603371783496007, "grad_norm": 0.8383373022079468, "learning_rate": 1.599704214937146e-05, "loss": 0.0462, "step": 5052 }, { "epoch": 0.5606699201419698, "grad_norm": 0.4856332540512085, "learning_rate": 1.5993344836085777e-05, "loss": 0.0432, "step": 5055 }, { "epoch": 0.5610026619343389, "grad_norm": 0.25565043091773987, "learning_rate": 1.59896475228001e-05, "loss": 0.0375, "step": 5058 }, { "epoch": 0.5613354037267081, "grad_norm": 0.46933043003082275, "learning_rate": 1.5985950209514422e-05, "loss": 0.0372, "step": 5061 }, { "epoch": 0.5616681455190772, "grad_norm": 0.3383731245994568, "learning_rate": 1.5982252896228743e-05, "loss": 0.0457, "step": 5064 }, { "epoch": 0.5620008873114463, "grad_norm": 0.5122916102409363, "learning_rate": 1.5978555582943063e-05, "loss": 0.0525, "step": 5067 }, { "epoch": 0.5623336291038155, "grad_norm": 0.8524137139320374, "learning_rate": 1.5974858269657384e-05, "loss": 0.0173, "step": 5070 }, { "epoch": 0.5626663708961845, "grad_norm": 0.44270771741867065, "learning_rate": 1.5971160956371705e-05, "loss": 0.0379, "step": 5073 }, { "epoch": 0.5629991126885537, "grad_norm": 0.6381003856658936, "learning_rate": 1.5967463643086026e-05, "loss": 0.0325, "step": 5076 }, { "epoch": 0.5633318544809228, "grad_norm": 0.4175693690776825, "learning_rate": 1.5963766329800347e-05, "loss": 0.0604, "step": 5079 }, { "epoch": 0.5636645962732919, "grad_norm": 0.40478187799453735, "learning_rate": 1.5960069016514668e-05, "loss": 0.025, "step": 5082 }, { "epoch": 0.5639973380656611, "grad_norm": 1.2201155424118042, "learning_rate": 1.595637170322899e-05, "loss": 0.0567, "step": 5085 }, { "epoch": 0.5643300798580302, "grad_norm": 0.6114994287490845, "learning_rate": 1.595267438994331e-05, "loss": 0.0224, "step": 5088 }, { "epoch": 0.5646628216503993, "grad_norm": 0.9024442434310913, "learning_rate": 1.594897707665763e-05, "loss": 0.1051, "step": 5091 }, { "epoch": 0.5649955634427685, "grad_norm": 0.7893527150154114, "learning_rate": 1.594527976337195e-05, "loss": 0.0691, "step": 5094 }, { "epoch": 0.5653283052351376, "grad_norm": 0.7287545800209045, "learning_rate": 1.594158245008627e-05, "loss": 0.0445, "step": 5097 }, { "epoch": 0.5656610470275066, "grad_norm": 0.5691441297531128, "learning_rate": 1.5937885136800592e-05, "loss": 0.0374, "step": 5100 }, { "epoch": 0.5659937888198758, "grad_norm": 0.6280270218849182, "learning_rate": 1.5934187823514913e-05, "loss": 0.0339, "step": 5103 }, { "epoch": 0.5663265306122449, "grad_norm": 0.7062181234359741, "learning_rate": 1.5930490510229234e-05, "loss": 0.0312, "step": 5106 }, { "epoch": 0.566659272404614, "grad_norm": 1.0022716522216797, "learning_rate": 1.5926793196943555e-05, "loss": 0.0337, "step": 5109 }, { "epoch": 0.5669920141969832, "grad_norm": 1.6287060976028442, "learning_rate": 1.592309588365788e-05, "loss": 0.1251, "step": 5112 }, { "epoch": 0.5673247559893523, "grad_norm": 0.5149232149124146, "learning_rate": 1.5919398570372197e-05, "loss": 0.0769, "step": 5115 }, { "epoch": 0.5676574977817214, "grad_norm": 0.27455419301986694, "learning_rate": 1.5915701257086517e-05, "loss": 0.0617, "step": 5118 }, { "epoch": 0.5679902395740906, "grad_norm": 0.47746315598487854, "learning_rate": 1.591200394380084e-05, "loss": 0.0671, "step": 5121 }, { "epoch": 0.5683229813664596, "grad_norm": 1.2510346174240112, "learning_rate": 1.590830663051516e-05, "loss": 0.0526, "step": 5124 }, { "epoch": 0.5686557231588287, "grad_norm": 0.6269752979278564, "learning_rate": 1.590460931722948e-05, "loss": 0.054, "step": 5127 }, { "epoch": 0.5689884649511979, "grad_norm": 1.0413000583648682, "learning_rate": 1.5900912003943804e-05, "loss": 0.049, "step": 5130 }, { "epoch": 0.569321206743567, "grad_norm": 0.6871169805526733, "learning_rate": 1.589721469065812e-05, "loss": 0.0887, "step": 5133 }, { "epoch": 0.5696539485359361, "grad_norm": 1.4046785831451416, "learning_rate": 1.5893517377372446e-05, "loss": 0.0411, "step": 5136 }, { "epoch": 0.5699866903283053, "grad_norm": 1.6940693855285645, "learning_rate": 1.5889820064086766e-05, "loss": 0.0709, "step": 5139 }, { "epoch": 0.5703194321206744, "grad_norm": 0.5264779925346375, "learning_rate": 1.5886122750801084e-05, "loss": 0.021, "step": 5142 }, { "epoch": 0.5706521739130435, "grad_norm": 0.6489671468734741, "learning_rate": 1.5882425437515408e-05, "loss": 0.0338, "step": 5145 }, { "epoch": 0.5709849157054127, "grad_norm": 0.5367608070373535, "learning_rate": 1.587872812422973e-05, "loss": 0.0684, "step": 5148 }, { "epoch": 0.5713176574977817, "grad_norm": 0.9099798798561096, "learning_rate": 1.5875030810944046e-05, "loss": 0.0662, "step": 5151 }, { "epoch": 0.5716503992901508, "grad_norm": 0.6430858969688416, "learning_rate": 1.587133349765837e-05, "loss": 0.0307, "step": 5154 }, { "epoch": 0.57198314108252, "grad_norm": 0.8332350254058838, "learning_rate": 1.586763618437269e-05, "loss": 0.0683, "step": 5157 }, { "epoch": 0.5723158828748891, "grad_norm": 0.4816548526287079, "learning_rate": 1.5863938871087012e-05, "loss": 0.0627, "step": 5160 }, { "epoch": 0.5726486246672582, "grad_norm": 0.5717704892158508, "learning_rate": 1.5860241557801333e-05, "loss": 0.0636, "step": 5163 }, { "epoch": 0.5729813664596274, "grad_norm": 0.3523533046245575, "learning_rate": 1.5856544244515654e-05, "loss": 0.0362, "step": 5166 }, { "epoch": 0.5733141082519965, "grad_norm": 1.2271071672439575, "learning_rate": 1.5852846931229975e-05, "loss": 0.0458, "step": 5169 }, { "epoch": 0.5736468500443656, "grad_norm": 1.6487252712249756, "learning_rate": 1.5849149617944296e-05, "loss": 0.0595, "step": 5172 }, { "epoch": 0.5739795918367347, "grad_norm": 0.9081434607505798, "learning_rate": 1.5845452304658616e-05, "loss": 0.0576, "step": 5175 }, { "epoch": 0.5743123336291038, "grad_norm": 0.5217030048370361, "learning_rate": 1.5841754991372937e-05, "loss": 0.0586, "step": 5178 }, { "epoch": 0.5746450754214729, "grad_norm": 0.9181904196739197, "learning_rate": 1.5838057678087258e-05, "loss": 0.0569, "step": 5181 }, { "epoch": 0.5749778172138421, "grad_norm": 0.6314923167228699, "learning_rate": 1.583436036480158e-05, "loss": 0.0712, "step": 5184 }, { "epoch": 0.5753105590062112, "grad_norm": 0.44546571373939514, "learning_rate": 1.58306630515159e-05, "loss": 0.084, "step": 5187 }, { "epoch": 0.5756433007985803, "grad_norm": 0.520753800868988, "learning_rate": 1.582696573823022e-05, "loss": 0.0536, "step": 5190 }, { "epoch": 0.5759760425909495, "grad_norm": 0.3313755691051483, "learning_rate": 1.582326842494454e-05, "loss": 0.0437, "step": 5193 }, { "epoch": 0.5763087843833186, "grad_norm": 0.44318869709968567, "learning_rate": 1.5819571111658862e-05, "loss": 0.0347, "step": 5196 }, { "epoch": 0.5766415261756876, "grad_norm": 0.964013934135437, "learning_rate": 1.5815873798373183e-05, "loss": 0.0619, "step": 5199 }, { "epoch": 0.5769742679680568, "grad_norm": 0.4444999396800995, "learning_rate": 1.5812176485087504e-05, "loss": 0.0532, "step": 5202 }, { "epoch": 0.5773070097604259, "grad_norm": 1.646679401397705, "learning_rate": 1.5808479171801825e-05, "loss": 0.0919, "step": 5205 }, { "epoch": 0.577639751552795, "grad_norm": 1.162111759185791, "learning_rate": 1.580478185851615e-05, "loss": 0.0495, "step": 5208 }, { "epoch": 0.5779724933451642, "grad_norm": 0.263092041015625, "learning_rate": 1.5801084545230466e-05, "loss": 0.0608, "step": 5211 }, { "epoch": 0.5783052351375333, "grad_norm": 0.48242098093032837, "learning_rate": 1.5797387231944787e-05, "loss": 0.042, "step": 5214 }, { "epoch": 0.5786379769299024, "grad_norm": 1.500932216644287, "learning_rate": 1.579368991865911e-05, "loss": 0.0907, "step": 5217 }, { "epoch": 0.5789707187222716, "grad_norm": 0.7270702719688416, "learning_rate": 1.578999260537343e-05, "loss": 0.0494, "step": 5220 }, { "epoch": 0.5793034605146407, "grad_norm": 0.3922547996044159, "learning_rate": 1.578629529208775e-05, "loss": 0.0351, "step": 5223 }, { "epoch": 0.5796362023070097, "grad_norm": 0.6423364281654358, "learning_rate": 1.5782597978802074e-05, "loss": 0.0416, "step": 5226 }, { "epoch": 0.5799689440993789, "grad_norm": 0.5710586309432983, "learning_rate": 1.577890066551639e-05, "loss": 0.0764, "step": 5229 }, { "epoch": 0.580301685891748, "grad_norm": 0.6238887906074524, "learning_rate": 1.5775203352230715e-05, "loss": 0.0472, "step": 5232 }, { "epoch": 0.5806344276841171, "grad_norm": 0.6200597286224365, "learning_rate": 1.5771506038945036e-05, "loss": 0.0533, "step": 5235 }, { "epoch": 0.5809671694764863, "grad_norm": 0.6137021780014038, "learning_rate": 1.5767808725659354e-05, "loss": 0.0941, "step": 5238 }, { "epoch": 0.5812999112688554, "grad_norm": 0.8226279616355896, "learning_rate": 1.5764111412373678e-05, "loss": 0.036, "step": 5241 }, { "epoch": 0.5816326530612245, "grad_norm": 0.48905429244041443, "learning_rate": 1.5760414099088e-05, "loss": 0.0444, "step": 5244 }, { "epoch": 0.5819653948535937, "grad_norm": 0.5253121256828308, "learning_rate": 1.5756716785802316e-05, "loss": 0.0505, "step": 5247 }, { "epoch": 0.5822981366459627, "grad_norm": 1.561289668083191, "learning_rate": 1.575301947251664e-05, "loss": 0.0373, "step": 5250 }, { "epoch": 0.5826308784383318, "grad_norm": 0.20907069742679596, "learning_rate": 1.574932215923096e-05, "loss": 0.054, "step": 5253 }, { "epoch": 0.582963620230701, "grad_norm": 0.584528923034668, "learning_rate": 1.5745624845945282e-05, "loss": 0.0482, "step": 5256 }, { "epoch": 0.5832963620230701, "grad_norm": 0.6811756491661072, "learning_rate": 1.5741927532659603e-05, "loss": 0.0636, "step": 5259 }, { "epoch": 0.5836291038154392, "grad_norm": 0.8787506222724915, "learning_rate": 1.5738230219373924e-05, "loss": 0.0713, "step": 5262 }, { "epoch": 0.5839618456078084, "grad_norm": 1.1560895442962646, "learning_rate": 1.5734532906088244e-05, "loss": 0.0586, "step": 5265 }, { "epoch": 0.5842945874001775, "grad_norm": 0.5716573596000671, "learning_rate": 1.5730835592802565e-05, "loss": 0.0927, "step": 5268 }, { "epoch": 0.5846273291925466, "grad_norm": 0.7650666832923889, "learning_rate": 1.5727138279516886e-05, "loss": 0.0733, "step": 5271 }, { "epoch": 0.5849600709849158, "grad_norm": 0.5080830454826355, "learning_rate": 1.5723440966231207e-05, "loss": 0.0375, "step": 5274 }, { "epoch": 0.5852928127772848, "grad_norm": 0.5037235617637634, "learning_rate": 1.5719743652945528e-05, "loss": 0.064, "step": 5277 }, { "epoch": 0.5856255545696539, "grad_norm": 0.4388017952442169, "learning_rate": 1.571604633965985e-05, "loss": 0.0519, "step": 5280 }, { "epoch": 0.5859582963620231, "grad_norm": 0.5902581810951233, "learning_rate": 1.571234902637417e-05, "loss": 0.0429, "step": 5283 }, { "epoch": 0.5862910381543922, "grad_norm": 0.5639892220497131, "learning_rate": 1.570865171308849e-05, "loss": 0.0533, "step": 5286 }, { "epoch": 0.5866237799467613, "grad_norm": 1.498172402381897, "learning_rate": 1.570495439980281e-05, "loss": 0.0525, "step": 5289 }, { "epoch": 0.5869565217391305, "grad_norm": 1.1813150644302368, "learning_rate": 1.5701257086517132e-05, "loss": 0.0557, "step": 5292 }, { "epoch": 0.5872892635314996, "grad_norm": 0.7021898031234741, "learning_rate": 1.5697559773231453e-05, "loss": 0.0873, "step": 5295 }, { "epoch": 0.5876220053238687, "grad_norm": 1.2364012002944946, "learning_rate": 1.5693862459945773e-05, "loss": 0.099, "step": 5298 }, { "epoch": 0.5879547471162379, "grad_norm": 1.0121334791183472, "learning_rate": 1.5690165146660094e-05, "loss": 0.0672, "step": 5301 }, { "epoch": 0.5882874889086069, "grad_norm": 0.676474392414093, "learning_rate": 1.568646783337442e-05, "loss": 0.078, "step": 5304 }, { "epoch": 0.588620230700976, "grad_norm": 0.46478793025016785, "learning_rate": 1.5682770520088736e-05, "loss": 0.033, "step": 5307 }, { "epoch": 0.5889529724933452, "grad_norm": 0.990711510181427, "learning_rate": 1.5679073206803057e-05, "loss": 0.0807, "step": 5310 }, { "epoch": 0.5892857142857143, "grad_norm": 0.7593509554862976, "learning_rate": 1.567537589351738e-05, "loss": 0.0578, "step": 5313 }, { "epoch": 0.5896184560780834, "grad_norm": 1.2795480489730835, "learning_rate": 1.5671678580231698e-05, "loss": 0.0859, "step": 5316 }, { "epoch": 0.5899511978704526, "grad_norm": 0.755778431892395, "learning_rate": 1.566798126694602e-05, "loss": 0.0584, "step": 5319 }, { "epoch": 0.5902839396628217, "grad_norm": 0.5701360702514648, "learning_rate": 1.5664283953660343e-05, "loss": 0.0686, "step": 5322 }, { "epoch": 0.5906166814551908, "grad_norm": 0.5858903527259827, "learning_rate": 1.566058664037466e-05, "loss": 0.0344, "step": 5325 }, { "epoch": 0.59094942324756, "grad_norm": 0.6029195189476013, "learning_rate": 1.5656889327088985e-05, "loss": 0.0707, "step": 5328 }, { "epoch": 0.591282165039929, "grad_norm": 0.7853928804397583, "learning_rate": 1.5653192013803306e-05, "loss": 0.0577, "step": 5331 }, { "epoch": 0.5916149068322981, "grad_norm": 0.49017682671546936, "learning_rate": 1.5649494700517623e-05, "loss": 0.0445, "step": 5334 }, { "epoch": 0.5919476486246673, "grad_norm": 0.5385863184928894, "learning_rate": 1.5645797387231947e-05, "loss": 0.0379, "step": 5337 }, { "epoch": 0.5922803904170364, "grad_norm": 0.8719037175178528, "learning_rate": 1.5642100073946268e-05, "loss": 0.0515, "step": 5340 }, { "epoch": 0.5926131322094055, "grad_norm": 0.5738103985786438, "learning_rate": 1.5638402760660586e-05, "loss": 0.048, "step": 5343 }, { "epoch": 0.5929458740017747, "grad_norm": 0.37445178627967834, "learning_rate": 1.563470544737491e-05, "loss": 0.0351, "step": 5346 }, { "epoch": 0.5932786157941438, "grad_norm": 1.648461103439331, "learning_rate": 1.563100813408923e-05, "loss": 0.0923, "step": 5349 }, { "epoch": 0.5936113575865128, "grad_norm": 0.4302060008049011, "learning_rate": 1.562731082080355e-05, "loss": 0.0845, "step": 5352 }, { "epoch": 0.593944099378882, "grad_norm": 0.7728216648101807, "learning_rate": 1.5623613507517872e-05, "loss": 0.0738, "step": 5355 }, { "epoch": 0.5942768411712511, "grad_norm": 0.5113272070884705, "learning_rate": 1.5619916194232193e-05, "loss": 0.0412, "step": 5358 }, { "epoch": 0.5946095829636202, "grad_norm": 0.3138580620288849, "learning_rate": 1.5616218880946514e-05, "loss": 0.0376, "step": 5361 }, { "epoch": 0.5949423247559894, "grad_norm": 0.6077708005905151, "learning_rate": 1.5612521567660835e-05, "loss": 0.0416, "step": 5364 }, { "epoch": 0.5952750665483585, "grad_norm": 0.5558707118034363, "learning_rate": 1.5608824254375156e-05, "loss": 0.0631, "step": 5367 }, { "epoch": 0.5956078083407276, "grad_norm": 0.437857449054718, "learning_rate": 1.5605126941089476e-05, "loss": 0.0747, "step": 5370 }, { "epoch": 0.5959405501330968, "grad_norm": 0.6092281341552734, "learning_rate": 1.5601429627803797e-05, "loss": 0.0476, "step": 5373 }, { "epoch": 0.5962732919254659, "grad_norm": 0.8192142248153687, "learning_rate": 1.5597732314518118e-05, "loss": 0.0603, "step": 5376 }, { "epoch": 0.5966060337178349, "grad_norm": 0.3455490469932556, "learning_rate": 1.559403500123244e-05, "loss": 0.071, "step": 5379 }, { "epoch": 0.5969387755102041, "grad_norm": 0.43971574306488037, "learning_rate": 1.559033768794676e-05, "loss": 0.0482, "step": 5382 }, { "epoch": 0.5972715173025732, "grad_norm": 1.062625765800476, "learning_rate": 1.558664037466108e-05, "loss": 0.055, "step": 5385 }, { "epoch": 0.5976042590949423, "grad_norm": 0.8330872654914856, "learning_rate": 1.55829430613754e-05, "loss": 0.0606, "step": 5388 }, { "epoch": 0.5979370008873115, "grad_norm": 0.5166788697242737, "learning_rate": 1.5579245748089722e-05, "loss": 0.0471, "step": 5391 }, { "epoch": 0.5982697426796806, "grad_norm": 0.6772616505622864, "learning_rate": 1.5575548434804043e-05, "loss": 0.0331, "step": 5394 }, { "epoch": 0.5986024844720497, "grad_norm": 0.802371621131897, "learning_rate": 1.5571851121518364e-05, "loss": 0.0569, "step": 5397 }, { "epoch": 0.5989352262644189, "grad_norm": 0.4987129271030426, "learning_rate": 1.5568153808232688e-05, "loss": 0.0365, "step": 5400 }, { "epoch": 0.599267968056788, "grad_norm": 0.5143981575965881, "learning_rate": 1.5564456494947005e-05, "loss": 0.0724, "step": 5403 }, { "epoch": 0.599600709849157, "grad_norm": 1.0109604597091675, "learning_rate": 1.5560759181661326e-05, "loss": 0.0663, "step": 5406 }, { "epoch": 0.5999334516415262, "grad_norm": 0.37450993061065674, "learning_rate": 1.555706186837565e-05, "loss": 0.0652, "step": 5409 }, { "epoch": 0.6002661934338953, "grad_norm": 1.1278702020645142, "learning_rate": 1.5553364555089968e-05, "loss": 0.0896, "step": 5412 }, { "epoch": 0.6005989352262644, "grad_norm": 0.7448617815971375, "learning_rate": 1.554966724180429e-05, "loss": 0.0447, "step": 5415 }, { "epoch": 0.6009316770186336, "grad_norm": 0.5178096890449524, "learning_rate": 1.5545969928518613e-05, "loss": 0.0488, "step": 5418 }, { "epoch": 0.6012644188110027, "grad_norm": 0.8272609710693359, "learning_rate": 1.554227261523293e-05, "loss": 0.0758, "step": 5421 }, { "epoch": 0.6015971606033718, "grad_norm": 0.3943493664264679, "learning_rate": 1.553857530194725e-05, "loss": 0.0322, "step": 5424 }, { "epoch": 0.601929902395741, "grad_norm": 1.140074610710144, "learning_rate": 1.5534877988661575e-05, "loss": 0.0751, "step": 5427 }, { "epoch": 0.60226264418811, "grad_norm": 1.0712058544158936, "learning_rate": 1.5531180675375893e-05, "loss": 0.0987, "step": 5430 }, { "epoch": 0.6025953859804791, "grad_norm": 0.5757758021354675, "learning_rate": 1.5527483362090217e-05, "loss": 0.0671, "step": 5433 }, { "epoch": 0.6029281277728483, "grad_norm": 0.6574673056602478, "learning_rate": 1.5523786048804538e-05, "loss": 0.0748, "step": 5436 }, { "epoch": 0.6032608695652174, "grad_norm": 0.44896066188812256, "learning_rate": 1.5520088735518855e-05, "loss": 0.047, "step": 5439 }, { "epoch": 0.6035936113575865, "grad_norm": 0.3518737852573395, "learning_rate": 1.551639142223318e-05, "loss": 0.0379, "step": 5442 }, { "epoch": 0.6039263531499557, "grad_norm": 0.681968629360199, "learning_rate": 1.55126941089475e-05, "loss": 0.0586, "step": 5445 }, { "epoch": 0.6042590949423248, "grad_norm": 0.346698522567749, "learning_rate": 1.5508996795661818e-05, "loss": 0.0378, "step": 5448 }, { "epoch": 0.6045918367346939, "grad_norm": 0.558957040309906, "learning_rate": 1.5505299482376142e-05, "loss": 0.0377, "step": 5451 }, { "epoch": 0.604924578527063, "grad_norm": 0.7215071320533752, "learning_rate": 1.5501602169090463e-05, "loss": 0.0726, "step": 5454 }, { "epoch": 0.6052573203194321, "grad_norm": 0.1469893455505371, "learning_rate": 1.5497904855804784e-05, "loss": 0.0216, "step": 5457 }, { "epoch": 0.6055900621118012, "grad_norm": 0.6045068502426147, "learning_rate": 1.5494207542519104e-05, "loss": 0.0498, "step": 5460 }, { "epoch": 0.6059228039041704, "grad_norm": 0.6000756621360779, "learning_rate": 1.5490510229233425e-05, "loss": 0.0428, "step": 5463 }, { "epoch": 0.6062555456965395, "grad_norm": 0.5550869703292847, "learning_rate": 1.5486812915947746e-05, "loss": 0.0834, "step": 5466 }, { "epoch": 0.6065882874889086, "grad_norm": 0.5663019418716431, "learning_rate": 1.5483115602662067e-05, "loss": 0.0521, "step": 5469 }, { "epoch": 0.6069210292812778, "grad_norm": 0.9321568608283997, "learning_rate": 1.5479418289376388e-05, "loss": 0.0833, "step": 5472 }, { "epoch": 0.6072537710736469, "grad_norm": 0.6349860429763794, "learning_rate": 1.547572097609071e-05, "loss": 0.0574, "step": 5475 }, { "epoch": 0.607586512866016, "grad_norm": 0.431191086769104, "learning_rate": 1.547202366280503e-05, "loss": 0.0529, "step": 5478 }, { "epoch": 0.6079192546583851, "grad_norm": 1.762280821800232, "learning_rate": 1.546832634951935e-05, "loss": 0.0948, "step": 5481 }, { "epoch": 0.6082519964507542, "grad_norm": 0.4424886107444763, "learning_rate": 1.546462903623367e-05, "loss": 0.0329, "step": 5484 }, { "epoch": 0.6085847382431233, "grad_norm": 0.5407235622406006, "learning_rate": 1.5460931722947992e-05, "loss": 0.0717, "step": 5487 }, { "epoch": 0.6089174800354925, "grad_norm": 0.4046662747859955, "learning_rate": 1.5457234409662313e-05, "loss": 0.0655, "step": 5490 }, { "epoch": 0.6092502218278616, "grad_norm": 0.6358171105384827, "learning_rate": 1.5453537096376633e-05, "loss": 0.0596, "step": 5493 }, { "epoch": 0.6095829636202307, "grad_norm": 0.7251865267753601, "learning_rate": 1.5449839783090954e-05, "loss": 0.0627, "step": 5496 }, { "epoch": 0.6099157054125999, "grad_norm": 0.7346992492675781, "learning_rate": 1.5446142469805275e-05, "loss": 0.0752, "step": 5499 }, { "epoch": 0.610248447204969, "grad_norm": 0.5575667023658752, "learning_rate": 1.5442445156519596e-05, "loss": 0.0552, "step": 5502 }, { "epoch": 0.610581188997338, "grad_norm": 0.320455938577652, "learning_rate": 1.543874784323392e-05, "loss": 0.0539, "step": 5505 }, { "epoch": 0.6109139307897072, "grad_norm": 1.6345449686050415, "learning_rate": 1.5435050529948238e-05, "loss": 0.0681, "step": 5508 }, { "epoch": 0.6112466725820763, "grad_norm": 0.6561957001686096, "learning_rate": 1.543135321666256e-05, "loss": 0.0519, "step": 5511 }, { "epoch": 0.6115794143744454, "grad_norm": 0.5577664375305176, "learning_rate": 1.5427655903376883e-05, "loss": 0.0479, "step": 5514 }, { "epoch": 0.6119121561668146, "grad_norm": 1.0197263956069946, "learning_rate": 1.54239585900912e-05, "loss": 0.0459, "step": 5517 }, { "epoch": 0.6122448979591837, "grad_norm": 0.3891264498233795, "learning_rate": 1.542026127680552e-05, "loss": 0.0416, "step": 5520 }, { "epoch": 0.6125776397515528, "grad_norm": 1.0009613037109375, "learning_rate": 1.5416563963519845e-05, "loss": 0.0575, "step": 5523 }, { "epoch": 0.612910381543922, "grad_norm": 0.5528519153594971, "learning_rate": 1.5412866650234162e-05, "loss": 0.08, "step": 5526 }, { "epoch": 0.613243123336291, "grad_norm": 0.5142378211021423, "learning_rate": 1.5409169336948487e-05, "loss": 0.0856, "step": 5529 }, { "epoch": 0.6135758651286601, "grad_norm": 0.3576796054840088, "learning_rate": 1.5405472023662807e-05, "loss": 0.0666, "step": 5532 }, { "epoch": 0.6139086069210293, "grad_norm": 0.22339695692062378, "learning_rate": 1.5401774710377125e-05, "loss": 0.0255, "step": 5535 }, { "epoch": 0.6142413487133984, "grad_norm": 0.3618619441986084, "learning_rate": 1.539807739709145e-05, "loss": 0.0401, "step": 5538 }, { "epoch": 0.6145740905057675, "grad_norm": 0.41217660903930664, "learning_rate": 1.539438008380577e-05, "loss": 0.0275, "step": 5541 }, { "epoch": 0.6149068322981367, "grad_norm": 0.7101370692253113, "learning_rate": 1.5390682770520087e-05, "loss": 0.0474, "step": 5544 }, { "epoch": 0.6152395740905058, "grad_norm": 0.3902337849140167, "learning_rate": 1.538698545723441e-05, "loss": 0.0578, "step": 5547 }, { "epoch": 0.6155723158828749, "grad_norm": 2.565805673599243, "learning_rate": 1.5383288143948732e-05, "loss": 0.0468, "step": 5550 }, { "epoch": 0.6159050576752441, "grad_norm": 0.5912244319915771, "learning_rate": 1.5379590830663053e-05, "loss": 0.05, "step": 5553 }, { "epoch": 0.6162377994676131, "grad_norm": 0.520232081413269, "learning_rate": 1.5375893517377374e-05, "loss": 0.0439, "step": 5556 }, { "epoch": 0.6165705412599822, "grad_norm": 0.6711508631706238, "learning_rate": 1.5372196204091695e-05, "loss": 0.0799, "step": 5559 }, { "epoch": 0.6169032830523514, "grad_norm": 0.6015632152557373, "learning_rate": 1.5368498890806016e-05, "loss": 0.0429, "step": 5562 }, { "epoch": 0.6172360248447205, "grad_norm": 0.5392657518386841, "learning_rate": 1.5364801577520337e-05, "loss": 0.0614, "step": 5565 }, { "epoch": 0.6175687666370896, "grad_norm": 0.5435337424278259, "learning_rate": 1.5361104264234657e-05, "loss": 0.0446, "step": 5568 }, { "epoch": 0.6179015084294588, "grad_norm": 0.7468942403793335, "learning_rate": 1.5357406950948978e-05, "loss": 0.0363, "step": 5571 }, { "epoch": 0.6182342502218279, "grad_norm": 0.8347851634025574, "learning_rate": 1.53537096376633e-05, "loss": 0.0558, "step": 5574 }, { "epoch": 0.618566992014197, "grad_norm": 0.5219293236732483, "learning_rate": 1.535001232437762e-05, "loss": 0.0735, "step": 5577 }, { "epoch": 0.6188997338065662, "grad_norm": 0.14276482164859772, "learning_rate": 1.534631501109194e-05, "loss": 0.0383, "step": 5580 }, { "epoch": 0.6192324755989352, "grad_norm": 1.5365883111953735, "learning_rate": 1.534261769780626e-05, "loss": 0.088, "step": 5583 }, { "epoch": 0.6195652173913043, "grad_norm": 0.34857091307640076, "learning_rate": 1.5338920384520582e-05, "loss": 0.029, "step": 5586 }, { "epoch": 0.6198979591836735, "grad_norm": 1.2580466270446777, "learning_rate": 1.5335223071234903e-05, "loss": 0.0749, "step": 5589 }, { "epoch": 0.6202307009760426, "grad_norm": 0.6368199586868286, "learning_rate": 1.5331525757949224e-05, "loss": 0.0766, "step": 5592 }, { "epoch": 0.6205634427684117, "grad_norm": 0.7787484526634216, "learning_rate": 1.5327828444663545e-05, "loss": 0.0535, "step": 5595 }, { "epoch": 0.6208961845607809, "grad_norm": 0.3768588602542877, "learning_rate": 1.5324131131377866e-05, "loss": 0.0844, "step": 5598 }, { "epoch": 0.62122892635315, "grad_norm": 0.8858184218406677, "learning_rate": 1.532043381809219e-05, "loss": 0.0754, "step": 5601 }, { "epoch": 0.621561668145519, "grad_norm": 0.7264569401741028, "learning_rate": 1.5316736504806507e-05, "loss": 0.0711, "step": 5604 }, { "epoch": 0.6218944099378882, "grad_norm": 0.3435293436050415, "learning_rate": 1.5313039191520828e-05, "loss": 0.0685, "step": 5607 }, { "epoch": 0.6222271517302573, "grad_norm": 0.5652928352355957, "learning_rate": 1.5309341878235152e-05, "loss": 0.0569, "step": 5610 }, { "epoch": 0.6225598935226264, "grad_norm": 0.27600011229515076, "learning_rate": 1.530564456494947e-05, "loss": 0.05, "step": 5613 }, { "epoch": 0.6228926353149956, "grad_norm": 0.307182639837265, "learning_rate": 1.530194725166379e-05, "loss": 0.0569, "step": 5616 }, { "epoch": 0.6232253771073647, "grad_norm": 0.6979321837425232, "learning_rate": 1.5298249938378115e-05, "loss": 0.056, "step": 5619 }, { "epoch": 0.6235581188997338, "grad_norm": 0.5493857264518738, "learning_rate": 1.5294552625092432e-05, "loss": 0.0831, "step": 5622 }, { "epoch": 0.623890860692103, "grad_norm": 0.6713234782218933, "learning_rate": 1.5290855311806756e-05, "loss": 0.0346, "step": 5625 }, { "epoch": 0.6242236024844721, "grad_norm": 0.40779978036880493, "learning_rate": 1.5287157998521077e-05, "loss": 0.0326, "step": 5628 }, { "epoch": 0.6245563442768411, "grad_norm": 0.6841585040092468, "learning_rate": 1.5283460685235395e-05, "loss": 0.0594, "step": 5631 }, { "epoch": 0.6248890860692103, "grad_norm": 0.8941486477851868, "learning_rate": 1.527976337194972e-05, "loss": 0.0228, "step": 5634 }, { "epoch": 0.6252218278615794, "grad_norm": 0.6711507439613342, "learning_rate": 1.527606605866404e-05, "loss": 0.0549, "step": 5637 }, { "epoch": 0.6255545696539485, "grad_norm": 0.2887111008167267, "learning_rate": 1.5272368745378357e-05, "loss": 0.0511, "step": 5640 }, { "epoch": 0.6258873114463177, "grad_norm": 1.2934881448745728, "learning_rate": 1.526867143209268e-05, "loss": 0.0703, "step": 5643 }, { "epoch": 0.6262200532386868, "grad_norm": 0.23530730605125427, "learning_rate": 1.5264974118807002e-05, "loss": 0.0489, "step": 5646 }, { "epoch": 0.6265527950310559, "grad_norm": 0.42859548330307007, "learning_rate": 1.5261276805521323e-05, "loss": 0.0537, "step": 5649 }, { "epoch": 0.6268855368234251, "grad_norm": 0.4393932521343231, "learning_rate": 1.5257579492235644e-05, "loss": 0.0515, "step": 5652 }, { "epoch": 0.6272182786157942, "grad_norm": 0.9024003148078918, "learning_rate": 1.5253882178949963e-05, "loss": 0.0554, "step": 5655 }, { "epoch": 0.6275510204081632, "grad_norm": 0.6823457479476929, "learning_rate": 1.5250184865664285e-05, "loss": 0.0911, "step": 5658 }, { "epoch": 0.6278837622005324, "grad_norm": 0.9697958827018738, "learning_rate": 1.5246487552378606e-05, "loss": 0.0905, "step": 5661 }, { "epoch": 0.6282165039929015, "grad_norm": 0.6234971284866333, "learning_rate": 1.5242790239092925e-05, "loss": 0.0771, "step": 5664 }, { "epoch": 0.6285492457852706, "grad_norm": 0.4469732642173767, "learning_rate": 1.5239092925807248e-05, "loss": 0.0467, "step": 5667 }, { "epoch": 0.6288819875776398, "grad_norm": 0.4108036458492279, "learning_rate": 1.5235395612521569e-05, "loss": 0.0499, "step": 5670 }, { "epoch": 0.6292147293700089, "grad_norm": 0.6050655841827393, "learning_rate": 1.5231698299235891e-05, "loss": 0.0729, "step": 5673 }, { "epoch": 0.629547471162378, "grad_norm": 0.4556558132171631, "learning_rate": 1.522800098595021e-05, "loss": 0.0365, "step": 5676 }, { "epoch": 0.6298802129547472, "grad_norm": 0.43856173753738403, "learning_rate": 1.5224303672664531e-05, "loss": 0.0597, "step": 5679 }, { "epoch": 0.6302129547471162, "grad_norm": 0.48732835054397583, "learning_rate": 1.5220606359378854e-05, "loss": 0.0542, "step": 5682 }, { "epoch": 0.6305456965394853, "grad_norm": 1.266591191291809, "learning_rate": 1.5216909046093173e-05, "loss": 0.0629, "step": 5685 }, { "epoch": 0.6308784383318545, "grad_norm": 0.4922550916671753, "learning_rate": 1.5213211732807494e-05, "loss": 0.0427, "step": 5688 }, { "epoch": 0.6312111801242236, "grad_norm": 1.9315332174301147, "learning_rate": 1.5209514419521816e-05, "loss": 0.0671, "step": 5691 }, { "epoch": 0.6315439219165927, "grad_norm": 0.40946364402770996, "learning_rate": 1.5205817106236135e-05, "loss": 0.0425, "step": 5694 }, { "epoch": 0.6318766637089619, "grad_norm": 0.34411731362342834, "learning_rate": 1.5202119792950458e-05, "loss": 0.067, "step": 5697 }, { "epoch": 0.632209405501331, "grad_norm": 0.42241528630256653, "learning_rate": 1.5198422479664779e-05, "loss": 0.0628, "step": 5700 }, { "epoch": 0.6325421472937001, "grad_norm": 0.5584643483161926, "learning_rate": 1.5194725166379098e-05, "loss": 0.1071, "step": 5703 }, { "epoch": 0.6328748890860693, "grad_norm": 0.6555048227310181, "learning_rate": 1.519102785309342e-05, "loss": 0.0421, "step": 5706 }, { "epoch": 0.6332076308784383, "grad_norm": 0.5068521499633789, "learning_rate": 1.5187330539807741e-05, "loss": 0.0484, "step": 5709 }, { "epoch": 0.6335403726708074, "grad_norm": 0.5786513686180115, "learning_rate": 1.5183633226522062e-05, "loss": 0.0528, "step": 5712 }, { "epoch": 0.6338731144631766, "grad_norm": 0.8114227056503296, "learning_rate": 1.5179935913236383e-05, "loss": 0.0574, "step": 5715 }, { "epoch": 0.6342058562555457, "grad_norm": 0.4603212773799896, "learning_rate": 1.5176238599950703e-05, "loss": 0.0623, "step": 5718 }, { "epoch": 0.6345385980479148, "grad_norm": 0.3901427388191223, "learning_rate": 1.5172541286665026e-05, "loss": 0.0363, "step": 5721 }, { "epoch": 0.634871339840284, "grad_norm": 0.6083022952079773, "learning_rate": 1.5168843973379345e-05, "loss": 0.0598, "step": 5724 }, { "epoch": 0.6352040816326531, "grad_norm": 0.5021528601646423, "learning_rate": 1.5165146660093666e-05, "loss": 0.0742, "step": 5727 }, { "epoch": 0.6355368234250222, "grad_norm": 0.3462506830692291, "learning_rate": 1.5161449346807988e-05, "loss": 0.0605, "step": 5730 }, { "epoch": 0.6358695652173914, "grad_norm": 0.5761926770210266, "learning_rate": 1.5157752033522308e-05, "loss": 0.0556, "step": 5733 }, { "epoch": 0.6362023070097604, "grad_norm": 0.4621409773826599, "learning_rate": 1.5154054720236628e-05, "loss": 0.0449, "step": 5736 }, { "epoch": 0.6365350488021295, "grad_norm": 1.8472747802734375, "learning_rate": 1.5150357406950951e-05, "loss": 0.0549, "step": 5739 }, { "epoch": 0.6368677905944987, "grad_norm": 0.5662969946861267, "learning_rate": 1.514666009366527e-05, "loss": 0.0531, "step": 5742 }, { "epoch": 0.6372005323868678, "grad_norm": 0.7071343660354614, "learning_rate": 1.5142962780379593e-05, "loss": 0.0581, "step": 5745 }, { "epoch": 0.6375332741792369, "grad_norm": 0.9787854552268982, "learning_rate": 1.5139265467093913e-05, "loss": 0.0719, "step": 5748 }, { "epoch": 0.6378660159716061, "grad_norm": 0.9098105430603027, "learning_rate": 1.5135568153808232e-05, "loss": 0.0543, "step": 5751 }, { "epoch": 0.6381987577639752, "grad_norm": 0.33673104643821716, "learning_rate": 1.5131870840522555e-05, "loss": 0.0559, "step": 5754 }, { "epoch": 0.6385314995563443, "grad_norm": 0.32291412353515625, "learning_rate": 1.5128173527236876e-05, "loss": 0.0489, "step": 5757 }, { "epoch": 0.6388642413487134, "grad_norm": 0.7695102691650391, "learning_rate": 1.5124476213951197e-05, "loss": 0.0458, "step": 5760 }, { "epoch": 0.6391969831410825, "grad_norm": 1.1324846744537354, "learning_rate": 1.5120778900665517e-05, "loss": 0.0603, "step": 5763 }, { "epoch": 0.6395297249334516, "grad_norm": 0.45041102170944214, "learning_rate": 1.5117081587379838e-05, "loss": 0.0736, "step": 5766 }, { "epoch": 0.6398624667258208, "grad_norm": 1.0094285011291504, "learning_rate": 1.511338427409416e-05, "loss": 0.0423, "step": 5769 }, { "epoch": 0.6401952085181899, "grad_norm": 0.676537036895752, "learning_rate": 1.510968696080848e-05, "loss": 0.0595, "step": 5772 }, { "epoch": 0.640527950310559, "grad_norm": 1.0063574314117432, "learning_rate": 1.51059896475228e-05, "loss": 0.0573, "step": 5775 }, { "epoch": 0.6408606921029282, "grad_norm": 0.32041484117507935, "learning_rate": 1.5102292334237123e-05, "loss": 0.0489, "step": 5778 }, { "epoch": 0.6411934338952973, "grad_norm": 0.5484521985054016, "learning_rate": 1.5098595020951442e-05, "loss": 0.0438, "step": 5781 }, { "epoch": 0.6415261756876663, "grad_norm": 0.7966428399085999, "learning_rate": 1.5094897707665763e-05, "loss": 0.0551, "step": 5784 }, { "epoch": 0.6418589174800355, "grad_norm": 0.7512060403823853, "learning_rate": 1.5091200394380086e-05, "loss": 0.033, "step": 5787 }, { "epoch": 0.6421916592724046, "grad_norm": 0.8070841431617737, "learning_rate": 1.5087503081094405e-05, "loss": 0.0596, "step": 5790 }, { "epoch": 0.6425244010647737, "grad_norm": 0.5024553537368774, "learning_rate": 1.5083805767808727e-05, "loss": 0.0526, "step": 5793 }, { "epoch": 0.6428571428571429, "grad_norm": 0.7105781435966492, "learning_rate": 1.5080108454523048e-05, "loss": 0.0668, "step": 5796 }, { "epoch": 0.643189884649512, "grad_norm": 0.2728571593761444, "learning_rate": 1.5076411141237369e-05, "loss": 0.0366, "step": 5799 }, { "epoch": 0.6435226264418811, "grad_norm": 0.7069293856620789, "learning_rate": 1.507271382795169e-05, "loss": 0.0619, "step": 5802 }, { "epoch": 0.6438553682342503, "grad_norm": 0.7180178761482239, "learning_rate": 1.506901651466601e-05, "loss": 0.0555, "step": 5805 }, { "epoch": 0.6441881100266194, "grad_norm": 0.2745313048362732, "learning_rate": 1.5065319201380331e-05, "loss": 0.0443, "step": 5808 }, { "epoch": 0.6445208518189884, "grad_norm": 0.7152108550071716, "learning_rate": 1.5061621888094652e-05, "loss": 0.0447, "step": 5811 }, { "epoch": 0.6448535936113576, "grad_norm": 0.21226032078266144, "learning_rate": 1.5057924574808973e-05, "loss": 0.06, "step": 5814 }, { "epoch": 0.6451863354037267, "grad_norm": 0.5187946557998657, "learning_rate": 1.5054227261523296e-05, "loss": 0.0534, "step": 5817 }, { "epoch": 0.6455190771960958, "grad_norm": 0.9161660671234131, "learning_rate": 1.5050529948237615e-05, "loss": 0.056, "step": 5820 }, { "epoch": 0.645851818988465, "grad_norm": 0.34803155064582825, "learning_rate": 1.5046832634951936e-05, "loss": 0.035, "step": 5823 }, { "epoch": 0.6461845607808341, "grad_norm": 1.8718054294586182, "learning_rate": 1.5043135321666258e-05, "loss": 0.0708, "step": 5826 }, { "epoch": 0.6465173025732032, "grad_norm": 0.3217279613018036, "learning_rate": 1.5039438008380577e-05, "loss": 0.0401, "step": 5829 }, { "epoch": 0.6468500443655724, "grad_norm": 0.41442787647247314, "learning_rate": 1.5035740695094898e-05, "loss": 0.0373, "step": 5832 }, { "epoch": 0.6471827861579414, "grad_norm": 0.5268890261650085, "learning_rate": 1.503204338180922e-05, "loss": 0.0367, "step": 5835 }, { "epoch": 0.6475155279503105, "grad_norm": 0.3362460136413574, "learning_rate": 1.5028346068523541e-05, "loss": 0.044, "step": 5838 }, { "epoch": 0.6478482697426797, "grad_norm": 0.35155999660491943, "learning_rate": 1.5024648755237862e-05, "loss": 0.0362, "step": 5841 }, { "epoch": 0.6481810115350488, "grad_norm": 0.8014340996742249, "learning_rate": 1.5020951441952183e-05, "loss": 0.0532, "step": 5844 }, { "epoch": 0.6485137533274179, "grad_norm": 0.33491018414497375, "learning_rate": 1.5017254128666504e-05, "loss": 0.0286, "step": 5847 }, { "epoch": 0.6488464951197871, "grad_norm": 0.8719918727874756, "learning_rate": 1.5013556815380825e-05, "loss": 0.057, "step": 5850 }, { "epoch": 0.6491792369121562, "grad_norm": 0.6280909776687622, "learning_rate": 1.5009859502095145e-05, "loss": 0.0381, "step": 5853 }, { "epoch": 0.6495119787045253, "grad_norm": 0.6809992790222168, "learning_rate": 1.5006162188809466e-05, "loss": 0.0838, "step": 5856 }, { "epoch": 0.6498447204968945, "grad_norm": 0.40513041615486145, "learning_rate": 1.5002464875523787e-05, "loss": 0.0436, "step": 5859 }, { "epoch": 0.6501774622892635, "grad_norm": 0.5042964816093445, "learning_rate": 1.4998767562238108e-05, "loss": 0.0537, "step": 5862 }, { "epoch": 0.6505102040816326, "grad_norm": 0.2937583029270172, "learning_rate": 1.4995070248952429e-05, "loss": 0.0319, "step": 5865 }, { "epoch": 0.6508429458740018, "grad_norm": 0.9344322085380554, "learning_rate": 1.499137293566675e-05, "loss": 0.0874, "step": 5868 }, { "epoch": 0.6511756876663709, "grad_norm": 0.16388213634490967, "learning_rate": 1.498767562238107e-05, "loss": 0.0472, "step": 5871 }, { "epoch": 0.65150842945874, "grad_norm": 0.6442260146141052, "learning_rate": 1.4983978309095393e-05, "loss": 0.0791, "step": 5874 }, { "epoch": 0.6518411712511092, "grad_norm": 0.5291545987129211, "learning_rate": 1.4980280995809714e-05, "loss": 0.0958, "step": 5877 }, { "epoch": 0.6521739130434783, "grad_norm": 0.45379728078842163, "learning_rate": 1.4976583682524033e-05, "loss": 0.0475, "step": 5880 }, { "epoch": 0.6525066548358474, "grad_norm": 0.549034833908081, "learning_rate": 1.4972886369238355e-05, "loss": 0.0566, "step": 5883 }, { "epoch": 0.6528393966282166, "grad_norm": 0.35400134325027466, "learning_rate": 1.4969189055952676e-05, "loss": 0.0189, "step": 5886 }, { "epoch": 0.6531721384205856, "grad_norm": 0.2556561231613159, "learning_rate": 1.4965491742666995e-05, "loss": 0.0515, "step": 5889 }, { "epoch": 0.6535048802129547, "grad_norm": 0.44385191798210144, "learning_rate": 1.4961794429381318e-05, "loss": 0.0265, "step": 5892 }, { "epoch": 0.6538376220053239, "grad_norm": 0.9356870651245117, "learning_rate": 1.4958097116095639e-05, "loss": 0.071, "step": 5895 }, { "epoch": 0.654170363797693, "grad_norm": 0.6099126935005188, "learning_rate": 1.495439980280996e-05, "loss": 0.0507, "step": 5898 }, { "epoch": 0.6545031055900621, "grad_norm": 0.5185577869415283, "learning_rate": 1.495070248952428e-05, "loss": 0.0391, "step": 5901 }, { "epoch": 0.6548358473824313, "grad_norm": 0.9065160155296326, "learning_rate": 1.4947005176238601e-05, "loss": 0.0552, "step": 5904 }, { "epoch": 0.6551685891748004, "grad_norm": 0.5548466444015503, "learning_rate": 1.4943307862952922e-05, "loss": 0.0332, "step": 5907 }, { "epoch": 0.6555013309671694, "grad_norm": 0.8244514465332031, "learning_rate": 1.4939610549667243e-05, "loss": 0.0884, "step": 5910 }, { "epoch": 0.6558340727595386, "grad_norm": 0.3945707082748413, "learning_rate": 1.4935913236381564e-05, "loss": 0.0808, "step": 5913 }, { "epoch": 0.6561668145519077, "grad_norm": 0.5651019215583801, "learning_rate": 1.4932215923095886e-05, "loss": 0.029, "step": 5916 }, { "epoch": 0.6564995563442768, "grad_norm": 0.42604753375053406, "learning_rate": 1.4928518609810205e-05, "loss": 0.033, "step": 5919 }, { "epoch": 0.656832298136646, "grad_norm": 0.5040234327316284, "learning_rate": 1.4924821296524528e-05, "loss": 0.0515, "step": 5922 }, { "epoch": 0.6571650399290151, "grad_norm": 1.0493823289871216, "learning_rate": 1.4921123983238848e-05, "loss": 0.0915, "step": 5925 }, { "epoch": 0.6574977817213842, "grad_norm": 0.6251220703125, "learning_rate": 1.4917426669953168e-05, "loss": 0.0831, "step": 5928 }, { "epoch": 0.6578305235137534, "grad_norm": 0.6688065528869629, "learning_rate": 1.491372935666749e-05, "loss": 0.0641, "step": 5931 }, { "epoch": 0.6581632653061225, "grad_norm": 0.5384476780891418, "learning_rate": 1.4910032043381811e-05, "loss": 0.0481, "step": 5934 }, { "epoch": 0.6584960070984915, "grad_norm": 0.31618693470954895, "learning_rate": 1.490633473009613e-05, "loss": 0.0635, "step": 5937 }, { "epoch": 0.6588287488908607, "grad_norm": 0.5667364597320557, "learning_rate": 1.4902637416810453e-05, "loss": 0.0514, "step": 5940 }, { "epoch": 0.6591614906832298, "grad_norm": 0.48742079734802246, "learning_rate": 1.4898940103524773e-05, "loss": 0.0491, "step": 5943 }, { "epoch": 0.6594942324755989, "grad_norm": 0.5917084217071533, "learning_rate": 1.4895242790239094e-05, "loss": 0.0268, "step": 5946 }, { "epoch": 0.6598269742679681, "grad_norm": 0.7647333145141602, "learning_rate": 1.4891545476953415e-05, "loss": 0.0524, "step": 5949 }, { "epoch": 0.6601597160603372, "grad_norm": 0.397356241941452, "learning_rate": 1.4887848163667736e-05, "loss": 0.0562, "step": 5952 }, { "epoch": 0.6604924578527063, "grad_norm": 0.48074427247047424, "learning_rate": 1.4884150850382057e-05, "loss": 0.0333, "step": 5955 }, { "epoch": 0.6608251996450755, "grad_norm": 0.4923575222492218, "learning_rate": 1.4880453537096378e-05, "loss": 0.0743, "step": 5958 }, { "epoch": 0.6611579414374446, "grad_norm": 1.6932307481765747, "learning_rate": 1.4876756223810698e-05, "loss": 0.1131, "step": 5961 }, { "epoch": 0.6614906832298136, "grad_norm": 0.4714964032173157, "learning_rate": 1.4873058910525021e-05, "loss": 0.0463, "step": 5964 }, { "epoch": 0.6618234250221828, "grad_norm": 0.6484069228172302, "learning_rate": 1.486936159723934e-05, "loss": 0.0669, "step": 5967 }, { "epoch": 0.6621561668145519, "grad_norm": 0.5912299156188965, "learning_rate": 1.4865664283953662e-05, "loss": 0.0327, "step": 5970 }, { "epoch": 0.662488908606921, "grad_norm": 0.3524036705493927, "learning_rate": 1.4861966970667983e-05, "loss": 0.0274, "step": 5973 }, { "epoch": 0.6628216503992902, "grad_norm": 0.4564480185508728, "learning_rate": 1.4858269657382302e-05, "loss": 0.0795, "step": 5976 }, { "epoch": 0.6631543921916593, "grad_norm": 0.6489741206169128, "learning_rate": 1.4854572344096625e-05, "loss": 0.0435, "step": 5979 }, { "epoch": 0.6634871339840284, "grad_norm": 0.7145840525627136, "learning_rate": 1.4850875030810946e-05, "loss": 0.0387, "step": 5982 }, { "epoch": 0.6638198757763976, "grad_norm": 0.593610405921936, "learning_rate": 1.4847177717525265e-05, "loss": 0.054, "step": 5985 }, { "epoch": 0.6641526175687666, "grad_norm": 0.2860753536224365, "learning_rate": 1.4843480404239587e-05, "loss": 0.042, "step": 5988 }, { "epoch": 0.6644853593611357, "grad_norm": 0.39021357893943787, "learning_rate": 1.4839783090953908e-05, "loss": 0.0219, "step": 5991 }, { "epoch": 0.6648181011535049, "grad_norm": 0.13939164578914642, "learning_rate": 1.4836085777668229e-05, "loss": 0.0377, "step": 5994 }, { "epoch": 0.665150842945874, "grad_norm": 1.1132392883300781, "learning_rate": 1.483238846438255e-05, "loss": 0.0483, "step": 5997 }, { "epoch": 0.6654835847382431, "grad_norm": 1.4069913625717163, "learning_rate": 1.482869115109687e-05, "loss": 0.0775, "step": 6000 }, { "epoch": 0.6658163265306123, "grad_norm": 0.42879703640937805, "learning_rate": 1.4824993837811193e-05, "loss": 0.0429, "step": 6003 }, { "epoch": 0.6661490683229814, "grad_norm": 0.4553404748439789, "learning_rate": 1.4821296524525512e-05, "loss": 0.0402, "step": 6006 }, { "epoch": 0.6664818101153505, "grad_norm": 0.4625157415866852, "learning_rate": 1.4817599211239833e-05, "loss": 0.0274, "step": 6009 }, { "epoch": 0.6668145519077197, "grad_norm": 0.452280730009079, "learning_rate": 1.4813901897954156e-05, "loss": 0.0468, "step": 6012 }, { "epoch": 0.6671472937000887, "grad_norm": 0.6088408827781677, "learning_rate": 1.4810204584668475e-05, "loss": 0.0479, "step": 6015 }, { "epoch": 0.6674800354924578, "grad_norm": 3.14906907081604, "learning_rate": 1.4806507271382797e-05, "loss": 0.0535, "step": 6018 }, { "epoch": 0.667812777284827, "grad_norm": 0.3605135977268219, "learning_rate": 1.4802809958097118e-05, "loss": 0.0209, "step": 6021 }, { "epoch": 0.6681455190771961, "grad_norm": 1.2294272184371948, "learning_rate": 1.4799112644811437e-05, "loss": 0.0763, "step": 6024 }, { "epoch": 0.6684782608695652, "grad_norm": 0.3915591239929199, "learning_rate": 1.479541533152576e-05, "loss": 0.0493, "step": 6027 }, { "epoch": 0.6688110026619344, "grad_norm": 0.5705204606056213, "learning_rate": 1.479171801824008e-05, "loss": 0.0254, "step": 6030 }, { "epoch": 0.6691437444543035, "grad_norm": 0.3664964735507965, "learning_rate": 1.47880207049544e-05, "loss": 0.0382, "step": 6033 }, { "epoch": 0.6694764862466726, "grad_norm": 0.7337558269500732, "learning_rate": 1.4784323391668722e-05, "loss": 0.0476, "step": 6036 }, { "epoch": 0.6698092280390417, "grad_norm": 0.8393928408622742, "learning_rate": 1.4780626078383043e-05, "loss": 0.0671, "step": 6039 }, { "epoch": 0.6701419698314108, "grad_norm": 0.7555551528930664, "learning_rate": 1.4776928765097366e-05, "loss": 0.0671, "step": 6042 }, { "epoch": 0.6704747116237799, "grad_norm": 0.7045422196388245, "learning_rate": 1.4773231451811685e-05, "loss": 0.0543, "step": 6045 }, { "epoch": 0.6708074534161491, "grad_norm": 0.880768358707428, "learning_rate": 1.4769534138526006e-05, "loss": 0.0778, "step": 6048 }, { "epoch": 0.6711401952085182, "grad_norm": 0.5218469500541687, "learning_rate": 1.4765836825240328e-05, "loss": 0.0362, "step": 6051 }, { "epoch": 0.6714729370008873, "grad_norm": 0.447319895029068, "learning_rate": 1.4762139511954647e-05, "loss": 0.0595, "step": 6054 }, { "epoch": 0.6718056787932565, "grad_norm": 0.45957908034324646, "learning_rate": 1.4758442198668968e-05, "loss": 0.0438, "step": 6057 }, { "epoch": 0.6721384205856256, "grad_norm": 0.5304993987083435, "learning_rate": 1.475474488538329e-05, "loss": 0.056, "step": 6060 }, { "epoch": 0.6724711623779946, "grad_norm": 0.47096019983291626, "learning_rate": 1.475104757209761e-05, "loss": 0.0375, "step": 6063 }, { "epoch": 0.6728039041703638, "grad_norm": 0.5774662494659424, "learning_rate": 1.4747350258811932e-05, "loss": 0.0431, "step": 6066 }, { "epoch": 0.6731366459627329, "grad_norm": 0.43163102865219116, "learning_rate": 1.4743652945526253e-05, "loss": 0.049, "step": 6069 }, { "epoch": 0.673469387755102, "grad_norm": 0.41588449478149414, "learning_rate": 1.4739955632240572e-05, "loss": 0.0728, "step": 6072 }, { "epoch": 0.6738021295474712, "grad_norm": 0.6348492503166199, "learning_rate": 1.4736258318954895e-05, "loss": 0.0382, "step": 6075 }, { "epoch": 0.6741348713398403, "grad_norm": 0.5990058183670044, "learning_rate": 1.4732561005669215e-05, "loss": 0.0376, "step": 6078 }, { "epoch": 0.6744676131322094, "grad_norm": 0.8687829971313477, "learning_rate": 1.4728863692383535e-05, "loss": 0.0611, "step": 6081 }, { "epoch": 0.6748003549245786, "grad_norm": 0.48825132846832275, "learning_rate": 1.4725166379097857e-05, "loss": 0.0443, "step": 6084 }, { "epoch": 0.6751330967169477, "grad_norm": 0.49192696809768677, "learning_rate": 1.4721469065812178e-05, "loss": 0.036, "step": 6087 }, { "epoch": 0.6754658385093167, "grad_norm": 0.5208090543746948, "learning_rate": 1.47177717525265e-05, "loss": 0.0326, "step": 6090 }, { "epoch": 0.6757985803016859, "grad_norm": 0.9256842136383057, "learning_rate": 1.471407443924082e-05, "loss": 0.0869, "step": 6093 }, { "epoch": 0.676131322094055, "grad_norm": 0.6051145195960999, "learning_rate": 1.471037712595514e-05, "loss": 0.0536, "step": 6096 }, { "epoch": 0.6764640638864241, "grad_norm": 0.7308712601661682, "learning_rate": 1.4706679812669463e-05, "loss": 0.0628, "step": 6099 }, { "epoch": 0.6767968056787933, "grad_norm": 0.5613245368003845, "learning_rate": 1.4702982499383782e-05, "loss": 0.0576, "step": 6102 }, { "epoch": 0.6771295474711624, "grad_norm": 0.41418930888175964, "learning_rate": 1.4699285186098103e-05, "loss": 0.0297, "step": 6105 }, { "epoch": 0.6774622892635315, "grad_norm": 1.1205085515975952, "learning_rate": 1.4695587872812425e-05, "loss": 0.0754, "step": 6108 }, { "epoch": 0.6777950310559007, "grad_norm": 0.7522783875465393, "learning_rate": 1.4691890559526744e-05, "loss": 0.0774, "step": 6111 }, { "epoch": 0.6781277728482697, "grad_norm": 0.7787558436393738, "learning_rate": 1.4688193246241067e-05, "loss": 0.0549, "step": 6114 }, { "epoch": 0.6784605146406388, "grad_norm": 0.518629252910614, "learning_rate": 1.4684495932955388e-05, "loss": 0.0479, "step": 6117 }, { "epoch": 0.678793256433008, "grad_norm": 0.7878998517990112, "learning_rate": 1.4680798619669707e-05, "loss": 0.0653, "step": 6120 }, { "epoch": 0.6791259982253771, "grad_norm": 1.0726779699325562, "learning_rate": 1.467710130638403e-05, "loss": 0.0598, "step": 6123 }, { "epoch": 0.6794587400177462, "grad_norm": 0.4636858105659485, "learning_rate": 1.467340399309835e-05, "loss": 0.0263, "step": 6126 }, { "epoch": 0.6797914818101154, "grad_norm": 0.17134876549243927, "learning_rate": 1.466970667981267e-05, "loss": 0.0436, "step": 6129 }, { "epoch": 0.6801242236024845, "grad_norm": 0.4875248670578003, "learning_rate": 1.4666009366526992e-05, "loss": 0.0354, "step": 6132 }, { "epoch": 0.6804569653948536, "grad_norm": 0.5492991805076599, "learning_rate": 1.4662312053241313e-05, "loss": 0.0441, "step": 6135 }, { "epoch": 0.6807897071872228, "grad_norm": 0.6504498720169067, "learning_rate": 1.4658614739955635e-05, "loss": 0.0766, "step": 6138 }, { "epoch": 0.6811224489795918, "grad_norm": 0.27757492661476135, "learning_rate": 1.4654917426669954e-05, "loss": 0.0394, "step": 6141 }, { "epoch": 0.6814551907719609, "grad_norm": 0.5311102271080017, "learning_rate": 1.4651220113384275e-05, "loss": 0.0777, "step": 6144 }, { "epoch": 0.6817879325643301, "grad_norm": 0.5879144072532654, "learning_rate": 1.4647522800098598e-05, "loss": 0.0358, "step": 6147 }, { "epoch": 0.6821206743566992, "grad_norm": 0.4747384190559387, "learning_rate": 1.4643825486812917e-05, "loss": 0.0429, "step": 6150 }, { "epoch": 0.6824534161490683, "grad_norm": 0.7691537737846375, "learning_rate": 1.4640128173527238e-05, "loss": 0.0412, "step": 6153 }, { "epoch": 0.6827861579414375, "grad_norm": 0.6953061819076538, "learning_rate": 1.463643086024156e-05, "loss": 0.0562, "step": 6156 }, { "epoch": 0.6831188997338066, "grad_norm": 0.2903642952442169, "learning_rate": 1.463273354695588e-05, "loss": 0.0542, "step": 6159 }, { "epoch": 0.6834516415261757, "grad_norm": 0.5245875716209412, "learning_rate": 1.4629036233670202e-05, "loss": 0.0228, "step": 6162 }, { "epoch": 0.6837843833185449, "grad_norm": 0.4838315546512604, "learning_rate": 1.4625338920384523e-05, "loss": 0.0678, "step": 6165 }, { "epoch": 0.6841171251109139, "grad_norm": 0.30312952399253845, "learning_rate": 1.4621641607098842e-05, "loss": 0.0334, "step": 6168 }, { "epoch": 0.684449866903283, "grad_norm": 1.4627763032913208, "learning_rate": 1.4617944293813164e-05, "loss": 0.0747, "step": 6171 }, { "epoch": 0.6847826086956522, "grad_norm": 0.5423263907432556, "learning_rate": 1.4614246980527485e-05, "loss": 0.0377, "step": 6174 }, { "epoch": 0.6851153504880213, "grad_norm": 0.7571638226509094, "learning_rate": 1.4610549667241804e-05, "loss": 0.0469, "step": 6177 }, { "epoch": 0.6854480922803904, "grad_norm": 0.49032601714134216, "learning_rate": 1.4606852353956127e-05, "loss": 0.0442, "step": 6180 }, { "epoch": 0.6857808340727596, "grad_norm": 0.5958486199378967, "learning_rate": 1.4603155040670448e-05, "loss": 0.1019, "step": 6183 }, { "epoch": 0.6861135758651287, "grad_norm": 0.5009533166885376, "learning_rate": 1.459945772738477e-05, "loss": 0.0462, "step": 6186 }, { "epoch": 0.6864463176574978, "grad_norm": 0.5406983494758606, "learning_rate": 1.4595760414099089e-05, "loss": 0.0641, "step": 6189 }, { "epoch": 0.686779059449867, "grad_norm": 0.3684712052345276, "learning_rate": 1.459206310081341e-05, "loss": 0.0296, "step": 6192 }, { "epoch": 0.687111801242236, "grad_norm": 0.5289161205291748, "learning_rate": 1.4588365787527732e-05, "loss": 0.0386, "step": 6195 }, { "epoch": 0.6874445430346051, "grad_norm": 0.6481142044067383, "learning_rate": 1.4584668474242052e-05, "loss": 0.078, "step": 6198 }, { "epoch": 0.6877772848269743, "grad_norm": 0.32597512006759644, "learning_rate": 1.4580971160956372e-05, "loss": 0.0307, "step": 6201 }, { "epoch": 0.6881100266193434, "grad_norm": 1.6639093160629272, "learning_rate": 1.4577273847670695e-05, "loss": 0.0614, "step": 6204 }, { "epoch": 0.6884427684117125, "grad_norm": 0.5118216276168823, "learning_rate": 1.4573576534385014e-05, "loss": 0.0381, "step": 6207 }, { "epoch": 0.6887755102040817, "grad_norm": 0.9851461052894592, "learning_rate": 1.4569879221099337e-05, "loss": 0.0401, "step": 6210 }, { "epoch": 0.6891082519964508, "grad_norm": 0.1875012069940567, "learning_rate": 1.4566181907813657e-05, "loss": 0.0486, "step": 6213 }, { "epoch": 0.6894409937888198, "grad_norm": 1.3228521347045898, "learning_rate": 1.4562484594527977e-05, "loss": 0.0405, "step": 6216 }, { "epoch": 0.689773735581189, "grad_norm": 0.8884420394897461, "learning_rate": 1.4558787281242299e-05, "loss": 0.062, "step": 6219 }, { "epoch": 0.6901064773735581, "grad_norm": 0.28651484847068787, "learning_rate": 1.455508996795662e-05, "loss": 0.0325, "step": 6222 }, { "epoch": 0.6904392191659272, "grad_norm": 0.8137440085411072, "learning_rate": 1.4551392654670939e-05, "loss": 0.0449, "step": 6225 }, { "epoch": 0.6907719609582964, "grad_norm": 0.20196762681007385, "learning_rate": 1.4547695341385262e-05, "loss": 0.0196, "step": 6228 }, { "epoch": 0.6911047027506655, "grad_norm": 0.22070206701755524, "learning_rate": 1.4543998028099582e-05, "loss": 0.0194, "step": 6231 }, { "epoch": 0.6914374445430346, "grad_norm": 0.5024257898330688, "learning_rate": 1.4540300714813905e-05, "loss": 0.0428, "step": 6234 }, { "epoch": 0.6917701863354038, "grad_norm": 0.3355485200881958, "learning_rate": 1.4536603401528224e-05, "loss": 0.0487, "step": 6237 }, { "epoch": 0.6921029281277729, "grad_norm": 1.2977166175842285, "learning_rate": 1.4532906088242545e-05, "loss": 0.0862, "step": 6240 }, { "epoch": 0.6924356699201419, "grad_norm": 1.0258370637893677, "learning_rate": 1.4529208774956867e-05, "loss": 0.0775, "step": 6243 }, { "epoch": 0.6927684117125111, "grad_norm": 0.4845483899116516, "learning_rate": 1.4525511461671186e-05, "loss": 0.0466, "step": 6246 }, { "epoch": 0.6931011535048802, "grad_norm": 1.1195645332336426, "learning_rate": 1.4521814148385507e-05, "loss": 0.0438, "step": 6249 }, { "epoch": 0.6934338952972493, "grad_norm": 0.35815295577049255, "learning_rate": 1.451811683509983e-05, "loss": 0.0655, "step": 6252 }, { "epoch": 0.6937666370896185, "grad_norm": 0.9638643264770508, "learning_rate": 1.4514419521814149e-05, "loss": 0.0386, "step": 6255 }, { "epoch": 0.6940993788819876, "grad_norm": 0.9370717406272888, "learning_rate": 1.4510722208528471e-05, "loss": 0.0657, "step": 6258 }, { "epoch": 0.6944321206743567, "grad_norm": 0.47911372780799866, "learning_rate": 1.4507024895242792e-05, "loss": 0.046, "step": 6261 }, { "epoch": 0.6947648624667259, "grad_norm": 0.6717323064804077, "learning_rate": 1.4503327581957111e-05, "loss": 0.065, "step": 6264 }, { "epoch": 0.695097604259095, "grad_norm": 0.5177185535430908, "learning_rate": 1.4499630268671434e-05, "loss": 0.0486, "step": 6267 }, { "epoch": 0.695430346051464, "grad_norm": 0.4957672953605652, "learning_rate": 1.4495932955385755e-05, "loss": 0.058, "step": 6270 }, { "epoch": 0.6957630878438332, "grad_norm": 0.44960877299308777, "learning_rate": 1.4492235642100074e-05, "loss": 0.0352, "step": 6273 }, { "epoch": 0.6960958296362023, "grad_norm": 0.6411247849464417, "learning_rate": 1.4488538328814396e-05, "loss": 0.0739, "step": 6276 }, { "epoch": 0.6964285714285714, "grad_norm": 0.6084186434745789, "learning_rate": 1.4484841015528717e-05, "loss": 0.0532, "step": 6279 }, { "epoch": 0.6967613132209406, "grad_norm": 0.7602421641349792, "learning_rate": 1.448114370224304e-05, "loss": 0.0581, "step": 6282 }, { "epoch": 0.6970940550133097, "grad_norm": 0.6773678064346313, "learning_rate": 1.4477446388957359e-05, "loss": 0.042, "step": 6285 }, { "epoch": 0.6974267968056788, "grad_norm": 0.4681881070137024, "learning_rate": 1.447374907567168e-05, "loss": 0.0527, "step": 6288 }, { "epoch": 0.697759538598048, "grad_norm": 1.993117094039917, "learning_rate": 1.4470051762386002e-05, "loss": 0.1102, "step": 6291 }, { "epoch": 0.698092280390417, "grad_norm": 1.203800916671753, "learning_rate": 1.4466354449100321e-05, "loss": 0.0667, "step": 6294 }, { "epoch": 0.6984250221827861, "grad_norm": 0.2016172558069229, "learning_rate": 1.4462657135814642e-05, "loss": 0.0366, "step": 6297 }, { "epoch": 0.6987577639751553, "grad_norm": 0.6158326864242554, "learning_rate": 1.4458959822528965e-05, "loss": 0.0913, "step": 6300 }, { "epoch": 0.6990905057675244, "grad_norm": 0.48099854588508606, "learning_rate": 1.4455262509243284e-05, "loss": 0.0613, "step": 6303 }, { "epoch": 0.6994232475598935, "grad_norm": 0.6004952192306519, "learning_rate": 1.4451565195957606e-05, "loss": 0.0567, "step": 6306 }, { "epoch": 0.6997559893522627, "grad_norm": 0.2923017740249634, "learning_rate": 1.4447867882671927e-05, "loss": 0.0476, "step": 6309 }, { "epoch": 0.7000887311446318, "grad_norm": 0.4653570055961609, "learning_rate": 1.4444170569386246e-05, "loss": 0.0448, "step": 6312 }, { "epoch": 0.7004214729370009, "grad_norm": 0.6264351606369019, "learning_rate": 1.4440473256100569e-05, "loss": 0.0324, "step": 6315 }, { "epoch": 0.70075421472937, "grad_norm": 0.356584370136261, "learning_rate": 1.443677594281489e-05, "loss": 0.0518, "step": 6318 }, { "epoch": 0.7010869565217391, "grad_norm": 0.6480504870414734, "learning_rate": 1.4433078629529209e-05, "loss": 0.0691, "step": 6321 }, { "epoch": 0.7014196983141082, "grad_norm": 0.3287508189678192, "learning_rate": 1.4429381316243531e-05, "loss": 0.0212, "step": 6324 }, { "epoch": 0.7017524401064774, "grad_norm": 0.6435609459877014, "learning_rate": 1.4425684002957852e-05, "loss": 0.0271, "step": 6327 }, { "epoch": 0.7020851818988465, "grad_norm": 0.6550247669219971, "learning_rate": 1.4421986689672171e-05, "loss": 0.09, "step": 6330 }, { "epoch": 0.7024179236912156, "grad_norm": 0.4242513179779053, "learning_rate": 1.4418289376386494e-05, "loss": 0.0289, "step": 6333 }, { "epoch": 0.7027506654835848, "grad_norm": 0.20472963154315948, "learning_rate": 1.4414592063100814e-05, "loss": 0.0548, "step": 6336 }, { "epoch": 0.7030834072759539, "grad_norm": 0.5278670191764832, "learning_rate": 1.4410894749815137e-05, "loss": 0.0463, "step": 6339 }, { "epoch": 0.703416149068323, "grad_norm": 0.5833300352096558, "learning_rate": 1.4407197436529456e-05, "loss": 0.0739, "step": 6342 }, { "epoch": 0.7037488908606921, "grad_norm": 0.4163578748703003, "learning_rate": 1.4403500123243777e-05, "loss": 0.0676, "step": 6345 }, { "epoch": 0.7040816326530612, "grad_norm": 0.3682399392127991, "learning_rate": 1.43998028099581e-05, "loss": 0.0664, "step": 6348 }, { "epoch": 0.7044143744454303, "grad_norm": 0.45766186714172363, "learning_rate": 1.4396105496672419e-05, "loss": 0.0466, "step": 6351 }, { "epoch": 0.7047471162377995, "grad_norm": 0.5733155608177185, "learning_rate": 1.439240818338674e-05, "loss": 0.0573, "step": 6354 }, { "epoch": 0.7050798580301686, "grad_norm": 0.2438609004020691, "learning_rate": 1.4388710870101062e-05, "loss": 0.0354, "step": 6357 }, { "epoch": 0.7054125998225377, "grad_norm": 0.21869207918643951, "learning_rate": 1.4385013556815381e-05, "loss": 0.027, "step": 6360 }, { "epoch": 0.7057453416149069, "grad_norm": 0.1295529007911682, "learning_rate": 1.4381316243529703e-05, "loss": 0.0341, "step": 6363 }, { "epoch": 0.706078083407276, "grad_norm": 1.1845983266830444, "learning_rate": 1.4377618930244024e-05, "loss": 0.0617, "step": 6366 }, { "epoch": 0.706410825199645, "grad_norm": 0.5546205639839172, "learning_rate": 1.4373921616958343e-05, "loss": 0.0511, "step": 6369 }, { "epoch": 0.7067435669920142, "grad_norm": 0.2360113263130188, "learning_rate": 1.4370224303672666e-05, "loss": 0.043, "step": 6372 }, { "epoch": 0.7070763087843833, "grad_norm": 0.23704466223716736, "learning_rate": 1.4366526990386987e-05, "loss": 0.0593, "step": 6375 }, { "epoch": 0.7074090505767524, "grad_norm": 0.32379791140556335, "learning_rate": 1.4362829677101306e-05, "loss": 0.0527, "step": 6378 }, { "epoch": 0.7077417923691216, "grad_norm": 0.3085450530052185, "learning_rate": 1.4359132363815628e-05, "loss": 0.0552, "step": 6381 }, { "epoch": 0.7080745341614907, "grad_norm": 0.4888991415500641, "learning_rate": 1.435543505052995e-05, "loss": 0.0613, "step": 6384 }, { "epoch": 0.7084072759538598, "grad_norm": 0.4441985785961151, "learning_rate": 1.4351737737244272e-05, "loss": 0.0371, "step": 6387 }, { "epoch": 0.708740017746229, "grad_norm": 0.46483951807022095, "learning_rate": 1.4348040423958591e-05, "loss": 0.0589, "step": 6390 }, { "epoch": 0.709072759538598, "grad_norm": 1.4393824338912964, "learning_rate": 1.4344343110672912e-05, "loss": 0.0623, "step": 6393 }, { "epoch": 0.7094055013309671, "grad_norm": 0.30279740691185, "learning_rate": 1.4340645797387234e-05, "loss": 0.0374, "step": 6396 }, { "epoch": 0.7097382431233363, "grad_norm": 0.4781794846057892, "learning_rate": 1.4336948484101553e-05, "loss": 0.0642, "step": 6399 }, { "epoch": 0.7100709849157054, "grad_norm": 0.41941338777542114, "learning_rate": 1.4333251170815874e-05, "loss": 0.0505, "step": 6402 }, { "epoch": 0.7104037267080745, "grad_norm": 0.5545704960823059, "learning_rate": 1.4329553857530197e-05, "loss": 0.0765, "step": 6405 }, { "epoch": 0.7107364685004437, "grad_norm": 0.32651811838150024, "learning_rate": 1.4325856544244516e-05, "loss": 0.0465, "step": 6408 }, { "epoch": 0.7110692102928128, "grad_norm": 0.2678914964199066, "learning_rate": 1.4322159230958838e-05, "loss": 0.0527, "step": 6411 }, { "epoch": 0.7114019520851819, "grad_norm": 0.8685445189476013, "learning_rate": 1.4318461917673159e-05, "loss": 0.0444, "step": 6414 }, { "epoch": 0.7117346938775511, "grad_norm": 0.5865405201911926, "learning_rate": 1.4314764604387478e-05, "loss": 0.0422, "step": 6417 }, { "epoch": 0.7120674356699201, "grad_norm": 0.5008291602134705, "learning_rate": 1.43110672911018e-05, "loss": 0.0921, "step": 6420 }, { "epoch": 0.7124001774622892, "grad_norm": 0.4972597062587738, "learning_rate": 1.4307369977816122e-05, "loss": 0.0605, "step": 6423 }, { "epoch": 0.7127329192546584, "grad_norm": 0.48066216707229614, "learning_rate": 1.430367266453044e-05, "loss": 0.0797, "step": 6426 }, { "epoch": 0.7130656610470275, "grad_norm": 1.4030958414077759, "learning_rate": 1.4299975351244763e-05, "loss": 0.1055, "step": 6429 }, { "epoch": 0.7133984028393966, "grad_norm": 0.3440294563770294, "learning_rate": 1.4296278037959084e-05, "loss": 0.027, "step": 6432 }, { "epoch": 0.7137311446317658, "grad_norm": 0.3528490662574768, "learning_rate": 1.4292580724673407e-05, "loss": 0.064, "step": 6435 }, { "epoch": 0.7140638864241349, "grad_norm": 0.3049275875091553, "learning_rate": 1.4288883411387726e-05, "loss": 0.0598, "step": 6438 }, { "epoch": 0.714396628216504, "grad_norm": 0.4318962097167969, "learning_rate": 1.4285186098102047e-05, "loss": 0.0541, "step": 6441 }, { "epoch": 0.7147293700088732, "grad_norm": 0.30269530415534973, "learning_rate": 1.4281488784816369e-05, "loss": 0.068, "step": 6444 }, { "epoch": 0.7150621118012422, "grad_norm": 0.6877302527427673, "learning_rate": 1.4277791471530688e-05, "loss": 0.0525, "step": 6447 }, { "epoch": 0.7153948535936113, "grad_norm": 0.4328120946884155, "learning_rate": 1.4274094158245009e-05, "loss": 0.0474, "step": 6450 }, { "epoch": 0.7157275953859805, "grad_norm": 0.464939147233963, "learning_rate": 1.4270396844959331e-05, "loss": 0.027, "step": 6453 }, { "epoch": 0.7160603371783496, "grad_norm": 0.7360124588012695, "learning_rate": 1.426669953167365e-05, "loss": 0.0408, "step": 6456 }, { "epoch": 0.7163930789707187, "grad_norm": 0.3876354396343231, "learning_rate": 1.4263002218387973e-05, "loss": 0.0223, "step": 6459 }, { "epoch": 0.7167258207630879, "grad_norm": 0.41397997736930847, "learning_rate": 1.4259304905102294e-05, "loss": 0.0443, "step": 6462 }, { "epoch": 0.717058562555457, "grad_norm": 0.5053120255470276, "learning_rate": 1.4255607591816613e-05, "loss": 0.049, "step": 6465 }, { "epoch": 0.717391304347826, "grad_norm": 0.4330223798751831, "learning_rate": 1.4251910278530936e-05, "loss": 0.0875, "step": 6468 }, { "epoch": 0.7177240461401952, "grad_norm": 0.4156365990638733, "learning_rate": 1.4248212965245256e-05, "loss": 0.0328, "step": 6471 }, { "epoch": 0.7180567879325643, "grad_norm": 0.9195415377616882, "learning_rate": 1.4244515651959576e-05, "loss": 0.0789, "step": 6474 }, { "epoch": 0.7183895297249334, "grad_norm": 0.4561370015144348, "learning_rate": 1.4240818338673898e-05, "loss": 0.0273, "step": 6477 }, { "epoch": 0.7187222715173026, "grad_norm": 0.5904334187507629, "learning_rate": 1.4237121025388219e-05, "loss": 0.0497, "step": 6480 }, { "epoch": 0.7190550133096717, "grad_norm": 0.7440276741981506, "learning_rate": 1.4233423712102541e-05, "loss": 0.0619, "step": 6483 }, { "epoch": 0.7193877551020408, "grad_norm": 1.2863119840621948, "learning_rate": 1.422972639881686e-05, "loss": 0.058, "step": 6486 }, { "epoch": 0.71972049689441, "grad_norm": 0.9274805784225464, "learning_rate": 1.4226029085531181e-05, "loss": 0.0618, "step": 6489 }, { "epoch": 0.7200532386867791, "grad_norm": 0.7027255296707153, "learning_rate": 1.4222331772245504e-05, "loss": 0.0773, "step": 6492 }, { "epoch": 0.7203859804791481, "grad_norm": 0.6506423950195312, "learning_rate": 1.4218634458959823e-05, "loss": 0.0665, "step": 6495 }, { "epoch": 0.7207187222715173, "grad_norm": 0.5361448526382446, "learning_rate": 1.4214937145674144e-05, "loss": 0.0284, "step": 6498 }, { "epoch": 0.7210514640638864, "grad_norm": 0.6452819108963013, "learning_rate": 1.4211239832388466e-05, "loss": 0.0365, "step": 6501 }, { "epoch": 0.7213842058562555, "grad_norm": 0.7907208800315857, "learning_rate": 1.4207542519102785e-05, "loss": 0.0432, "step": 6504 }, { "epoch": 0.7217169476486247, "grad_norm": 0.2956153154373169, "learning_rate": 1.4203845205817108e-05, "loss": 0.0389, "step": 6507 }, { "epoch": 0.7220496894409938, "grad_norm": 0.3212461471557617, "learning_rate": 1.4200147892531429e-05, "loss": 0.0442, "step": 6510 }, { "epoch": 0.7223824312333629, "grad_norm": 0.5057607889175415, "learning_rate": 1.4196450579245748e-05, "loss": 0.06, "step": 6513 }, { "epoch": 0.7227151730257321, "grad_norm": 1.1160998344421387, "learning_rate": 1.419275326596007e-05, "loss": 0.0631, "step": 6516 }, { "epoch": 0.7230479148181012, "grad_norm": 0.5593299269676208, "learning_rate": 1.4189055952674391e-05, "loss": 0.0396, "step": 6519 }, { "epoch": 0.7233806566104702, "grad_norm": 0.49484533071517944, "learning_rate": 1.418535863938871e-05, "loss": 0.0494, "step": 6522 }, { "epoch": 0.7237133984028394, "grad_norm": 1.0435597896575928, "learning_rate": 1.4181661326103033e-05, "loss": 0.0748, "step": 6525 }, { "epoch": 0.7240461401952085, "grad_norm": 0.6793468594551086, "learning_rate": 1.4177964012817354e-05, "loss": 0.0423, "step": 6528 }, { "epoch": 0.7243788819875776, "grad_norm": 0.3945784866809845, "learning_rate": 1.4174266699531676e-05, "loss": 0.0694, "step": 6531 }, { "epoch": 0.7247116237799468, "grad_norm": 0.4256824254989624, "learning_rate": 1.4170569386245995e-05, "loss": 0.0573, "step": 6534 }, { "epoch": 0.7250443655723159, "grad_norm": 0.7216191291809082, "learning_rate": 1.4166872072960316e-05, "loss": 0.0441, "step": 6537 }, { "epoch": 0.725377107364685, "grad_norm": 0.6538639068603516, "learning_rate": 1.4163174759674639e-05, "loss": 0.0721, "step": 6540 }, { "epoch": 0.7257098491570542, "grad_norm": 0.44412562251091003, "learning_rate": 1.4159477446388958e-05, "loss": 0.052, "step": 6543 }, { "epoch": 0.7260425909494232, "grad_norm": 0.5659828186035156, "learning_rate": 1.4155780133103279e-05, "loss": 0.0508, "step": 6546 }, { "epoch": 0.7263753327417923, "grad_norm": 0.3470565974712372, "learning_rate": 1.4152082819817601e-05, "loss": 0.0471, "step": 6549 }, { "epoch": 0.7267080745341615, "grad_norm": 1.554084062576294, "learning_rate": 1.414838550653192e-05, "loss": 0.0457, "step": 6552 }, { "epoch": 0.7270408163265306, "grad_norm": 1.0497864484786987, "learning_rate": 1.4144688193246243e-05, "loss": 0.0617, "step": 6555 }, { "epoch": 0.7273735581188997, "grad_norm": 0.6932433843612671, "learning_rate": 1.4140990879960564e-05, "loss": 0.0624, "step": 6558 }, { "epoch": 0.7277062999112689, "grad_norm": 0.4763273000717163, "learning_rate": 1.4137293566674883e-05, "loss": 0.0455, "step": 6561 }, { "epoch": 0.728039041703638, "grad_norm": 0.5588045120239258, "learning_rate": 1.4133596253389205e-05, "loss": 0.0427, "step": 6564 }, { "epoch": 0.7283717834960071, "grad_norm": 0.8978041410446167, "learning_rate": 1.4129898940103526e-05, "loss": 0.0656, "step": 6567 }, { "epoch": 0.7287045252883763, "grad_norm": 0.4981473684310913, "learning_rate": 1.4126201626817845e-05, "loss": 0.0666, "step": 6570 }, { "epoch": 0.7290372670807453, "grad_norm": 1.1696701049804688, "learning_rate": 1.4122504313532168e-05, "loss": 0.0484, "step": 6573 }, { "epoch": 0.7293700088731144, "grad_norm": 1.050846815109253, "learning_rate": 1.4118807000246489e-05, "loss": 0.072, "step": 6576 }, { "epoch": 0.7297027506654836, "grad_norm": 0.5594915151596069, "learning_rate": 1.4115109686960811e-05, "loss": 0.0681, "step": 6579 }, { "epoch": 0.7300354924578527, "grad_norm": 0.5786823630332947, "learning_rate": 1.411141237367513e-05, "loss": 0.088, "step": 6582 }, { "epoch": 0.7303682342502218, "grad_norm": 0.7378315329551697, "learning_rate": 1.4107715060389451e-05, "loss": 0.0634, "step": 6585 }, { "epoch": 0.730700976042591, "grad_norm": 0.7134234309196472, "learning_rate": 1.4104017747103773e-05, "loss": 0.0661, "step": 6588 }, { "epoch": 0.7310337178349601, "grad_norm": 0.21689875423908234, "learning_rate": 1.4100320433818093e-05, "loss": 0.0306, "step": 6591 }, { "epoch": 0.7313664596273292, "grad_norm": 0.3955785930156708, "learning_rate": 1.4096623120532413e-05, "loss": 0.033, "step": 6594 }, { "epoch": 0.7316992014196984, "grad_norm": 0.9538146257400513, "learning_rate": 1.4092925807246736e-05, "loss": 0.0617, "step": 6597 }, { "epoch": 0.7320319432120674, "grad_norm": 0.9423672556877136, "learning_rate": 1.4089228493961055e-05, "loss": 0.0647, "step": 6600 }, { "epoch": 0.7323646850044365, "grad_norm": 0.46205633878707886, "learning_rate": 1.4085531180675378e-05, "loss": 0.0392, "step": 6603 }, { "epoch": 0.7326974267968057, "grad_norm": 0.6328001618385315, "learning_rate": 1.4081833867389698e-05, "loss": 0.039, "step": 6606 }, { "epoch": 0.7330301685891748, "grad_norm": 0.39155328273773193, "learning_rate": 1.4078136554104018e-05, "loss": 0.0243, "step": 6609 }, { "epoch": 0.7333629103815439, "grad_norm": 0.3773397207260132, "learning_rate": 1.407443924081834e-05, "loss": 0.0481, "step": 6612 }, { "epoch": 0.7336956521739131, "grad_norm": 0.6344343423843384, "learning_rate": 1.4070741927532661e-05, "loss": 0.0626, "step": 6615 }, { "epoch": 0.7340283939662822, "grad_norm": 0.4884355366230011, "learning_rate": 1.406704461424698e-05, "loss": 0.0694, "step": 6618 }, { "epoch": 0.7343611357586513, "grad_norm": 1.8466455936431885, "learning_rate": 1.4063347300961303e-05, "loss": 0.0841, "step": 6621 }, { "epoch": 0.7346938775510204, "grad_norm": 0.5894522666931152, "learning_rate": 1.4059649987675623e-05, "loss": 0.0677, "step": 6624 }, { "epoch": 0.7350266193433895, "grad_norm": 0.5846576690673828, "learning_rate": 1.4055952674389946e-05, "loss": 0.069, "step": 6627 }, { "epoch": 0.7353593611357586, "grad_norm": 0.30423006415367126, "learning_rate": 1.4052255361104265e-05, "loss": 0.0528, "step": 6630 }, { "epoch": 0.7356921029281278, "grad_norm": 0.5161929130554199, "learning_rate": 1.4048558047818586e-05, "loss": 0.0564, "step": 6633 }, { "epoch": 0.7360248447204969, "grad_norm": 0.3231588900089264, "learning_rate": 1.4044860734532908e-05, "loss": 0.0536, "step": 6636 }, { "epoch": 0.736357586512866, "grad_norm": 0.7217681407928467, "learning_rate": 1.4041163421247227e-05, "loss": 0.0499, "step": 6639 }, { "epoch": 0.7366903283052352, "grad_norm": 0.6767754554748535, "learning_rate": 1.4037466107961548e-05, "loss": 0.0617, "step": 6642 }, { "epoch": 0.7370230700976043, "grad_norm": 0.4114813208580017, "learning_rate": 1.403376879467587e-05, "loss": 0.0457, "step": 6645 }, { "epoch": 0.7373558118899733, "grad_norm": 0.4135752320289612, "learning_rate": 1.403007148139019e-05, "loss": 0.0559, "step": 6648 }, { "epoch": 0.7376885536823425, "grad_norm": 0.7812623381614685, "learning_rate": 1.4026374168104512e-05, "loss": 0.0435, "step": 6651 }, { "epoch": 0.7380212954747116, "grad_norm": 1.0523589849472046, "learning_rate": 1.4022676854818833e-05, "loss": 0.0461, "step": 6654 }, { "epoch": 0.7383540372670807, "grad_norm": 0.500382125377655, "learning_rate": 1.4018979541533152e-05, "loss": 0.0602, "step": 6657 }, { "epoch": 0.7386867790594499, "grad_norm": 0.8043946623802185, "learning_rate": 1.4015282228247475e-05, "loss": 0.0625, "step": 6660 }, { "epoch": 0.739019520851819, "grad_norm": 0.24053683876991272, "learning_rate": 1.4011584914961796e-05, "loss": 0.0299, "step": 6663 }, { "epoch": 0.7393522626441881, "grad_norm": 0.5439703464508057, "learning_rate": 1.4007887601676115e-05, "loss": 0.0359, "step": 6666 }, { "epoch": 0.7396850044365573, "grad_norm": 0.16062846779823303, "learning_rate": 1.4004190288390437e-05, "loss": 0.0363, "step": 6669 }, { "epoch": 0.7400177462289264, "grad_norm": 0.347856730222702, "learning_rate": 1.4000492975104758e-05, "loss": 0.0569, "step": 6672 }, { "epoch": 0.7403504880212954, "grad_norm": 0.4920308589935303, "learning_rate": 1.399679566181908e-05, "loss": 0.0462, "step": 6675 }, { "epoch": 0.7406832298136646, "grad_norm": 1.2276172637939453, "learning_rate": 1.39930983485334e-05, "loss": 0.0685, "step": 6678 }, { "epoch": 0.7410159716060337, "grad_norm": 0.5126725435256958, "learning_rate": 1.398940103524772e-05, "loss": 0.0492, "step": 6681 }, { "epoch": 0.7413487133984028, "grad_norm": 0.548940122127533, "learning_rate": 1.3985703721962043e-05, "loss": 0.0655, "step": 6684 }, { "epoch": 0.741681455190772, "grad_norm": 0.2595776915550232, "learning_rate": 1.3982006408676362e-05, "loss": 0.0281, "step": 6687 }, { "epoch": 0.7420141969831411, "grad_norm": 0.5559605360031128, "learning_rate": 1.3978309095390683e-05, "loss": 0.0649, "step": 6690 }, { "epoch": 0.7423469387755102, "grad_norm": 0.4231014549732208, "learning_rate": 1.3974611782105006e-05, "loss": 0.0538, "step": 6693 }, { "epoch": 0.7426796805678794, "grad_norm": 0.5049533247947693, "learning_rate": 1.3970914468819325e-05, "loss": 0.059, "step": 6696 }, { "epoch": 0.7430124223602484, "grad_norm": 0.5667850375175476, "learning_rate": 1.3967217155533647e-05, "loss": 0.0525, "step": 6699 }, { "epoch": 0.7433451641526175, "grad_norm": 0.5581734776496887, "learning_rate": 1.3963519842247968e-05, "loss": 0.054, "step": 6702 }, { "epoch": 0.7436779059449867, "grad_norm": 0.3696436285972595, "learning_rate": 1.3959822528962287e-05, "loss": 0.0463, "step": 6705 }, { "epoch": 0.7440106477373558, "grad_norm": 0.8279126286506653, "learning_rate": 1.395612521567661e-05, "loss": 0.0446, "step": 6708 }, { "epoch": 0.7443433895297249, "grad_norm": 0.9437918066978455, "learning_rate": 1.395242790239093e-05, "loss": 0.0698, "step": 6711 }, { "epoch": 0.7446761313220941, "grad_norm": 0.48279955983161926, "learning_rate": 1.394873058910525e-05, "loss": 0.0464, "step": 6714 }, { "epoch": 0.7450088731144632, "grad_norm": 0.49451884627342224, "learning_rate": 1.3945033275819572e-05, "loss": 0.0994, "step": 6717 }, { "epoch": 0.7453416149068323, "grad_norm": 0.4616346061229706, "learning_rate": 1.3941335962533893e-05, "loss": 0.0429, "step": 6720 }, { "epoch": 0.7456743566992015, "grad_norm": 0.3866720497608185, "learning_rate": 1.3937638649248215e-05, "loss": 0.0662, "step": 6723 }, { "epoch": 0.7460070984915705, "grad_norm": 0.7075809240341187, "learning_rate": 1.3933941335962535e-05, "loss": 0.0738, "step": 6726 }, { "epoch": 0.7463398402839396, "grad_norm": 0.445466011762619, "learning_rate": 1.3930244022676855e-05, "loss": 0.1108, "step": 6729 }, { "epoch": 0.7466725820763088, "grad_norm": 0.926679790019989, "learning_rate": 1.3926546709391178e-05, "loss": 0.0549, "step": 6732 }, { "epoch": 0.7470053238686779, "grad_norm": 0.687928318977356, "learning_rate": 1.3922849396105497e-05, "loss": 0.0546, "step": 6735 }, { "epoch": 0.747338065661047, "grad_norm": 0.9307888746261597, "learning_rate": 1.3919152082819818e-05, "loss": 0.0802, "step": 6738 }, { "epoch": 0.7476708074534162, "grad_norm": 0.5699781179428101, "learning_rate": 1.391545476953414e-05, "loss": 0.0633, "step": 6741 }, { "epoch": 0.7480035492457853, "grad_norm": 0.52405846118927, "learning_rate": 1.391175745624846e-05, "loss": 0.0421, "step": 6744 }, { "epoch": 0.7483362910381544, "grad_norm": 0.18965473771095276, "learning_rate": 1.3908060142962782e-05, "loss": 0.0213, "step": 6747 }, { "epoch": 0.7486690328305236, "grad_norm": 0.4320389926433563, "learning_rate": 1.3904362829677103e-05, "loss": 0.0512, "step": 6750 }, { "epoch": 0.7490017746228926, "grad_norm": 0.718959391117096, "learning_rate": 1.3900665516391422e-05, "loss": 0.0699, "step": 6753 }, { "epoch": 0.7493345164152617, "grad_norm": 0.4784375727176666, "learning_rate": 1.3896968203105744e-05, "loss": 0.0462, "step": 6756 }, { "epoch": 0.7496672582076309, "grad_norm": 0.4587520658969879, "learning_rate": 1.3893270889820065e-05, "loss": 0.0454, "step": 6759 }, { "epoch": 0.75, "grad_norm": 1.5522009134292603, "learning_rate": 1.3889573576534386e-05, "loss": 0.0342, "step": 6762 }, { "epoch": 0.7503327417923691, "grad_norm": 0.868460476398468, "learning_rate": 1.3885876263248707e-05, "loss": 0.0543, "step": 6765 }, { "epoch": 0.7506654835847383, "grad_norm": 0.523127555847168, "learning_rate": 1.3882178949963028e-05, "loss": 0.0409, "step": 6768 }, { "epoch": 0.7509982253771074, "grad_norm": 1.4940086603164673, "learning_rate": 1.3878481636677349e-05, "loss": 0.0637, "step": 6771 }, { "epoch": 0.7513309671694764, "grad_norm": 0.5837235450744629, "learning_rate": 1.387478432339167e-05, "loss": 0.0399, "step": 6774 }, { "epoch": 0.7516637089618456, "grad_norm": 0.6547781229019165, "learning_rate": 1.387108701010599e-05, "loss": 0.0587, "step": 6777 }, { "epoch": 0.7519964507542147, "grad_norm": 0.4301832318305969, "learning_rate": 1.3867389696820313e-05, "loss": 0.042, "step": 6780 }, { "epoch": 0.7523291925465838, "grad_norm": 0.8317931890487671, "learning_rate": 1.3863692383534632e-05, "loss": 0.0653, "step": 6783 }, { "epoch": 0.752661934338953, "grad_norm": 0.6048564314842224, "learning_rate": 1.3859995070248953e-05, "loss": 0.0478, "step": 6786 }, { "epoch": 0.7529946761313221, "grad_norm": 0.5645517706871033, "learning_rate": 1.3856297756963275e-05, "loss": 0.0538, "step": 6789 }, { "epoch": 0.7533274179236912, "grad_norm": 0.3707904815673828, "learning_rate": 1.3852600443677594e-05, "loss": 0.0637, "step": 6792 }, { "epoch": 0.7536601597160604, "grad_norm": 0.9661321640014648, "learning_rate": 1.3848903130391915e-05, "loss": 0.0671, "step": 6795 }, { "epoch": 0.7539929015084295, "grad_norm": 0.5984059572219849, "learning_rate": 1.3845205817106238e-05, "loss": 0.0471, "step": 6798 }, { "epoch": 0.7543256433007985, "grad_norm": 0.13956381380558014, "learning_rate": 1.3841508503820558e-05, "loss": 0.0274, "step": 6801 }, { "epoch": 0.7546583850931677, "grad_norm": 0.5798934698104858, "learning_rate": 1.383781119053488e-05, "loss": 0.0435, "step": 6804 }, { "epoch": 0.7549911268855368, "grad_norm": 0.3124759793281555, "learning_rate": 1.38341138772492e-05, "loss": 0.0325, "step": 6807 }, { "epoch": 0.7553238686779059, "grad_norm": 0.6195421814918518, "learning_rate": 1.3830416563963521e-05, "loss": 0.0558, "step": 6810 }, { "epoch": 0.7556566104702751, "grad_norm": 0.3510340452194214, "learning_rate": 1.3826719250677842e-05, "loss": 0.0363, "step": 6813 }, { "epoch": 0.7559893522626442, "grad_norm": 0.6829861402511597, "learning_rate": 1.3823021937392163e-05, "loss": 0.0486, "step": 6816 }, { "epoch": 0.7563220940550133, "grad_norm": 0.8042024970054626, "learning_rate": 1.3819324624106483e-05, "loss": 0.0596, "step": 6819 }, { "epoch": 0.7566548358473825, "grad_norm": 1.4254924058914185, "learning_rate": 1.3815627310820804e-05, "loss": 0.0317, "step": 6822 }, { "epoch": 0.7569875776397516, "grad_norm": 0.7231554388999939, "learning_rate": 1.3811929997535125e-05, "loss": 0.0815, "step": 6825 }, { "epoch": 0.7573203194321206, "grad_norm": 0.3845732808113098, "learning_rate": 1.3808232684249448e-05, "loss": 0.0337, "step": 6828 }, { "epoch": 0.7576530612244898, "grad_norm": 1.7195631265640259, "learning_rate": 1.3804535370963767e-05, "loss": 0.0615, "step": 6831 }, { "epoch": 0.7579858030168589, "grad_norm": 0.5530279278755188, "learning_rate": 1.3800838057678088e-05, "loss": 0.0436, "step": 6834 }, { "epoch": 0.758318544809228, "grad_norm": 0.4595615267753601, "learning_rate": 1.379714074439241e-05, "loss": 0.0622, "step": 6837 }, { "epoch": 0.7586512866015972, "grad_norm": 0.49644315242767334, "learning_rate": 1.3793443431106731e-05, "loss": 0.0797, "step": 6840 }, { "epoch": 0.7589840283939663, "grad_norm": 0.2371433526277542, "learning_rate": 1.378974611782105e-05, "loss": 0.047, "step": 6843 }, { "epoch": 0.7593167701863354, "grad_norm": 0.6215728521347046, "learning_rate": 1.3786048804535372e-05, "loss": 0.0459, "step": 6846 }, { "epoch": 0.7596495119787046, "grad_norm": 0.4700585603713989, "learning_rate": 1.3782351491249693e-05, "loss": 0.0548, "step": 6849 }, { "epoch": 0.7599822537710736, "grad_norm": 0.5729385018348694, "learning_rate": 1.3778654177964014e-05, "loss": 0.0389, "step": 6852 }, { "epoch": 0.7603149955634427, "grad_norm": 0.5168904662132263, "learning_rate": 1.3774956864678335e-05, "loss": 0.0728, "step": 6855 }, { "epoch": 0.7606477373558119, "grad_norm": 0.6729869842529297, "learning_rate": 1.3771259551392656e-05, "loss": 0.0514, "step": 6858 }, { "epoch": 0.760980479148181, "grad_norm": 0.5139790773391724, "learning_rate": 1.3767562238106977e-05, "loss": 0.0489, "step": 6861 }, { "epoch": 0.7613132209405501, "grad_norm": 0.5026693344116211, "learning_rate": 1.3763864924821297e-05, "loss": 0.0435, "step": 6864 }, { "epoch": 0.7616459627329193, "grad_norm": 0.38265034556388855, "learning_rate": 1.3760167611535618e-05, "loss": 0.039, "step": 6867 }, { "epoch": 0.7619787045252884, "grad_norm": 0.5481712818145752, "learning_rate": 1.3756470298249939e-05, "loss": 0.0469, "step": 6870 }, { "epoch": 0.7623114463176575, "grad_norm": 0.5272092819213867, "learning_rate": 1.375277298496426e-05, "loss": 0.0506, "step": 6873 }, { "epoch": 0.7626441881100267, "grad_norm": 0.8337377309799194, "learning_rate": 1.3749075671678582e-05, "loss": 0.026, "step": 6876 }, { "epoch": 0.7629769299023957, "grad_norm": 0.8123607635498047, "learning_rate": 1.3745378358392902e-05, "loss": 0.0392, "step": 6879 }, { "epoch": 0.7633096716947648, "grad_norm": 0.40675488114356995, "learning_rate": 1.3741681045107222e-05, "loss": 0.0524, "step": 6882 }, { "epoch": 0.763642413487134, "grad_norm": 0.46824100613594055, "learning_rate": 1.3737983731821545e-05, "loss": 0.0497, "step": 6885 }, { "epoch": 0.7639751552795031, "grad_norm": 0.3587464392185211, "learning_rate": 1.3734286418535866e-05, "loss": 0.0375, "step": 6888 }, { "epoch": 0.7643078970718722, "grad_norm": 0.32295700907707214, "learning_rate": 1.3730589105250185e-05, "loss": 0.033, "step": 6891 }, { "epoch": 0.7646406388642414, "grad_norm": 0.9467336535453796, "learning_rate": 1.3726891791964507e-05, "loss": 0.0761, "step": 6894 }, { "epoch": 0.7649733806566105, "grad_norm": 0.8183095455169678, "learning_rate": 1.3723194478678828e-05, "loss": 0.0606, "step": 6897 }, { "epoch": 0.7653061224489796, "grad_norm": 0.24208195507526398, "learning_rate": 1.3719497165393149e-05, "loss": 0.0255, "step": 6900 }, { "epoch": 0.7656388642413487, "grad_norm": 0.7251461148262024, "learning_rate": 1.371579985210747e-05, "loss": 0.0467, "step": 6903 }, { "epoch": 0.7659716060337178, "grad_norm": 0.5127456784248352, "learning_rate": 1.371210253882179e-05, "loss": 0.0284, "step": 6906 }, { "epoch": 0.7663043478260869, "grad_norm": 0.33514007925987244, "learning_rate": 1.3708405225536111e-05, "loss": 0.0471, "step": 6909 }, { "epoch": 0.7666370896184561, "grad_norm": 0.7306869029998779, "learning_rate": 1.3704707912250432e-05, "loss": 0.0729, "step": 6912 }, { "epoch": 0.7669698314108252, "grad_norm": 0.38003280758857727, "learning_rate": 1.3701010598964753e-05, "loss": 0.0379, "step": 6915 }, { "epoch": 0.7673025732031943, "grad_norm": 1.298746109008789, "learning_rate": 1.3697313285679074e-05, "loss": 0.0472, "step": 6918 }, { "epoch": 0.7676353149955635, "grad_norm": 0.703133761882782, "learning_rate": 1.3693615972393395e-05, "loss": 0.0685, "step": 6921 }, { "epoch": 0.7679680567879326, "grad_norm": 0.6468784809112549, "learning_rate": 1.3689918659107717e-05, "loss": 0.0624, "step": 6924 }, { "epoch": 0.7683007985803016, "grad_norm": 1.123448133468628, "learning_rate": 1.3686221345822038e-05, "loss": 0.065, "step": 6927 }, { "epoch": 0.7686335403726708, "grad_norm": 0.36742061376571655, "learning_rate": 1.3682524032536357e-05, "loss": 0.0411, "step": 6930 }, { "epoch": 0.7689662821650399, "grad_norm": 0.25502902269363403, "learning_rate": 1.367882671925068e-05, "loss": 0.0451, "step": 6933 }, { "epoch": 0.769299023957409, "grad_norm": 0.4549921452999115, "learning_rate": 1.3675129405965e-05, "loss": 0.0572, "step": 6936 }, { "epoch": 0.7696317657497782, "grad_norm": 0.2187502235174179, "learning_rate": 1.367143209267932e-05, "loss": 0.0539, "step": 6939 }, { "epoch": 0.7699645075421473, "grad_norm": 0.7269256114959717, "learning_rate": 1.3667734779393642e-05, "loss": 0.0633, "step": 6942 }, { "epoch": 0.7702972493345164, "grad_norm": 0.6678751111030579, "learning_rate": 1.3664037466107963e-05, "loss": 0.0622, "step": 6945 }, { "epoch": 0.7706299911268856, "grad_norm": 0.4039294421672821, "learning_rate": 1.3660340152822284e-05, "loss": 0.0411, "step": 6948 }, { "epoch": 0.7709627329192547, "grad_norm": 0.8289210796356201, "learning_rate": 1.3656642839536605e-05, "loss": 0.0717, "step": 6951 }, { "epoch": 0.7712954747116237, "grad_norm": 1.213400959968567, "learning_rate": 1.3652945526250925e-05, "loss": 0.0424, "step": 6954 }, { "epoch": 0.7716282165039929, "grad_norm": 0.9993210434913635, "learning_rate": 1.3649248212965246e-05, "loss": 0.1036, "step": 6957 }, { "epoch": 0.771960958296362, "grad_norm": 0.4614068269729614, "learning_rate": 1.3645550899679567e-05, "loss": 0.025, "step": 6960 }, { "epoch": 0.7722937000887311, "grad_norm": 0.24279731512069702, "learning_rate": 1.3641853586393888e-05, "loss": 0.0381, "step": 6963 }, { "epoch": 0.7726264418811003, "grad_norm": 0.27670592069625854, "learning_rate": 1.363815627310821e-05, "loss": 0.0406, "step": 6966 }, { "epoch": 0.7729591836734694, "grad_norm": 0.9605715274810791, "learning_rate": 1.363445895982253e-05, "loss": 0.0744, "step": 6969 }, { "epoch": 0.7732919254658385, "grad_norm": 0.8104544281959534, "learning_rate": 1.3630761646536852e-05, "loss": 0.0738, "step": 6972 }, { "epoch": 0.7736246672582077, "grad_norm": 0.8367044925689697, "learning_rate": 1.3627064333251173e-05, "loss": 0.0556, "step": 6975 }, { "epoch": 0.7739574090505768, "grad_norm": 0.34105563163757324, "learning_rate": 1.3623367019965492e-05, "loss": 0.0968, "step": 6978 }, { "epoch": 0.7742901508429458, "grad_norm": 0.45277178287506104, "learning_rate": 1.3619669706679814e-05, "loss": 0.0611, "step": 6981 }, { "epoch": 0.774622892635315, "grad_norm": 0.6123768091201782, "learning_rate": 1.3615972393394135e-05, "loss": 0.0383, "step": 6984 }, { "epoch": 0.7749556344276841, "grad_norm": 0.3850551247596741, "learning_rate": 1.3612275080108454e-05, "loss": 0.0486, "step": 6987 }, { "epoch": 0.7752883762200532, "grad_norm": 0.3058047592639923, "learning_rate": 1.3608577766822777e-05, "loss": 0.0403, "step": 6990 }, { "epoch": 0.7756211180124224, "grad_norm": 0.3872954249382019, "learning_rate": 1.3604880453537098e-05, "loss": 0.0523, "step": 6993 }, { "epoch": 0.7759538598047915, "grad_norm": 0.644769549369812, "learning_rate": 1.3601183140251419e-05, "loss": 0.0426, "step": 6996 }, { "epoch": 0.7762866015971606, "grad_norm": 0.3633786141872406, "learning_rate": 1.359748582696574e-05, "loss": 0.0338, "step": 6999 }, { "epoch": 0.7766193433895298, "grad_norm": 0.31931671500205994, "learning_rate": 1.359378851368006e-05, "loss": 0.0342, "step": 7002 }, { "epoch": 0.7769520851818988, "grad_norm": 1.0194590091705322, "learning_rate": 1.3590091200394383e-05, "loss": 0.0773, "step": 7005 }, { "epoch": 0.7772848269742679, "grad_norm": 0.6425788402557373, "learning_rate": 1.3586393887108702e-05, "loss": 0.0708, "step": 7008 }, { "epoch": 0.7776175687666371, "grad_norm": 0.9225112795829773, "learning_rate": 1.3582696573823023e-05, "loss": 0.1075, "step": 7011 }, { "epoch": 0.7779503105590062, "grad_norm": 0.9561117887496948, "learning_rate": 1.3578999260537345e-05, "loss": 0.0652, "step": 7014 }, { "epoch": 0.7782830523513753, "grad_norm": 0.4823874831199646, "learning_rate": 1.3575301947251664e-05, "loss": 0.0406, "step": 7017 }, { "epoch": 0.7786157941437445, "grad_norm": 0.5152601599693298, "learning_rate": 1.3571604633965987e-05, "loss": 0.0552, "step": 7020 }, { "epoch": 0.7789485359361136, "grad_norm": 0.7021452784538269, "learning_rate": 1.3567907320680308e-05, "loss": 0.044, "step": 7023 }, { "epoch": 0.7792812777284827, "grad_norm": 0.4970664083957672, "learning_rate": 1.3564210007394627e-05, "loss": 0.0319, "step": 7026 }, { "epoch": 0.7796140195208519, "grad_norm": 0.48347240686416626, "learning_rate": 1.356051269410895e-05, "loss": 0.0407, "step": 7029 }, { "epoch": 0.7799467613132209, "grad_norm": 0.6243951916694641, "learning_rate": 1.355681538082327e-05, "loss": 0.0299, "step": 7032 }, { "epoch": 0.78027950310559, "grad_norm": 0.35299059748649597, "learning_rate": 1.355311806753759e-05, "loss": 0.0345, "step": 7035 }, { "epoch": 0.7806122448979592, "grad_norm": 2.1070237159729004, "learning_rate": 1.3549420754251912e-05, "loss": 0.0699, "step": 7038 }, { "epoch": 0.7809449866903283, "grad_norm": 0.8083229660987854, "learning_rate": 1.3545723440966233e-05, "loss": 0.0411, "step": 7041 }, { "epoch": 0.7812777284826974, "grad_norm": 0.6642881631851196, "learning_rate": 1.3542026127680553e-05, "loss": 0.0255, "step": 7044 }, { "epoch": 0.7816104702750666, "grad_norm": 0.26391464471817017, "learning_rate": 1.3538328814394874e-05, "loss": 0.0442, "step": 7047 }, { "epoch": 0.7819432120674357, "grad_norm": 0.26167434453964233, "learning_rate": 1.3534631501109195e-05, "loss": 0.06, "step": 7050 }, { "epoch": 0.7822759538598048, "grad_norm": 0.42674165964126587, "learning_rate": 1.3530934187823518e-05, "loss": 0.0682, "step": 7053 }, { "epoch": 0.782608695652174, "grad_norm": 0.43688052892684937, "learning_rate": 1.3527236874537837e-05, "loss": 0.0421, "step": 7056 }, { "epoch": 0.782941437444543, "grad_norm": 0.4086498022079468, "learning_rate": 1.3523539561252158e-05, "loss": 0.0717, "step": 7059 }, { "epoch": 0.7832741792369121, "grad_norm": 0.42865896224975586, "learning_rate": 1.351984224796648e-05, "loss": 0.0698, "step": 7062 }, { "epoch": 0.7836069210292813, "grad_norm": 0.6730954647064209, "learning_rate": 1.3516144934680799e-05, "loss": 0.0462, "step": 7065 }, { "epoch": 0.7839396628216504, "grad_norm": 0.589313268661499, "learning_rate": 1.3512447621395122e-05, "loss": 0.0647, "step": 7068 }, { "epoch": 0.7842724046140195, "grad_norm": 1.34096360206604, "learning_rate": 1.3508750308109442e-05, "loss": 0.0704, "step": 7071 }, { "epoch": 0.7846051464063887, "grad_norm": 0.48069092631340027, "learning_rate": 1.3505052994823762e-05, "loss": 0.0754, "step": 7074 }, { "epoch": 0.7849378881987578, "grad_norm": 0.5827749967575073, "learning_rate": 1.3501355681538084e-05, "loss": 0.043, "step": 7077 }, { "epoch": 0.7852706299911268, "grad_norm": 0.6488829851150513, "learning_rate": 1.3497658368252405e-05, "loss": 0.0445, "step": 7080 }, { "epoch": 0.785603371783496, "grad_norm": 0.5473216772079468, "learning_rate": 1.3493961054966724e-05, "loss": 0.0396, "step": 7083 }, { "epoch": 0.7859361135758651, "grad_norm": 0.5308311581611633, "learning_rate": 1.3490263741681047e-05, "loss": 0.0739, "step": 7086 }, { "epoch": 0.7862688553682342, "grad_norm": 0.609637975692749, "learning_rate": 1.3486566428395367e-05, "loss": 0.0477, "step": 7089 }, { "epoch": 0.7866015971606034, "grad_norm": 0.7394914627075195, "learning_rate": 1.348286911510969e-05, "loss": 0.0542, "step": 7092 }, { "epoch": 0.7869343389529725, "grad_norm": 0.5869066715240479, "learning_rate": 1.3479171801824009e-05, "loss": 0.0383, "step": 7095 }, { "epoch": 0.7872670807453416, "grad_norm": 0.4668184518814087, "learning_rate": 1.347547448853833e-05, "loss": 0.0234, "step": 7098 }, { "epoch": 0.7875998225377108, "grad_norm": 0.8726705312728882, "learning_rate": 1.3471777175252652e-05, "loss": 0.0442, "step": 7101 }, { "epoch": 0.7879325643300799, "grad_norm": 0.4560718834400177, "learning_rate": 1.3468079861966972e-05, "loss": 0.0485, "step": 7104 }, { "epoch": 0.7882653061224489, "grad_norm": 0.5052443146705627, "learning_rate": 1.3464382548681292e-05, "loss": 0.0527, "step": 7107 }, { "epoch": 0.7885980479148181, "grad_norm": 0.5071144700050354, "learning_rate": 1.3460685235395615e-05, "loss": 0.0322, "step": 7110 }, { "epoch": 0.7889307897071872, "grad_norm": 0.37018316984176636, "learning_rate": 1.3456987922109934e-05, "loss": 0.0676, "step": 7113 }, { "epoch": 0.7892635314995563, "grad_norm": 1.0295770168304443, "learning_rate": 1.3453290608824256e-05, "loss": 0.0654, "step": 7116 }, { "epoch": 0.7895962732919255, "grad_norm": 0.38385340571403503, "learning_rate": 1.3449593295538577e-05, "loss": 0.0491, "step": 7119 }, { "epoch": 0.7899290150842946, "grad_norm": 0.3851141333580017, "learning_rate": 1.3445895982252896e-05, "loss": 0.0519, "step": 7122 }, { "epoch": 0.7902617568766637, "grad_norm": 0.5392422080039978, "learning_rate": 1.3442198668967219e-05, "loss": 0.0332, "step": 7125 }, { "epoch": 0.7905944986690329, "grad_norm": 0.866764485836029, "learning_rate": 1.343850135568154e-05, "loss": 0.0457, "step": 7128 }, { "epoch": 0.790927240461402, "grad_norm": 0.8214328289031982, "learning_rate": 1.3434804042395859e-05, "loss": 0.0898, "step": 7131 }, { "epoch": 0.791259982253771, "grad_norm": 0.6251400113105774, "learning_rate": 1.3431106729110181e-05, "loss": 0.0737, "step": 7134 }, { "epoch": 0.7915927240461402, "grad_norm": 0.6308704018592834, "learning_rate": 1.3427409415824502e-05, "loss": 0.0547, "step": 7137 }, { "epoch": 0.7919254658385093, "grad_norm": 0.6739557385444641, "learning_rate": 1.3423712102538825e-05, "loss": 0.053, "step": 7140 }, { "epoch": 0.7922582076308784, "grad_norm": 0.3112190067768097, "learning_rate": 1.3420014789253144e-05, "loss": 0.0558, "step": 7143 }, { "epoch": 0.7925909494232476, "grad_norm": 0.2706547975540161, "learning_rate": 1.3416317475967465e-05, "loss": 0.0305, "step": 7146 }, { "epoch": 0.7929236912156167, "grad_norm": 0.6264970302581787, "learning_rate": 1.3412620162681787e-05, "loss": 0.0531, "step": 7149 }, { "epoch": 0.7932564330079858, "grad_norm": 0.2870643436908722, "learning_rate": 1.3408922849396106e-05, "loss": 0.0359, "step": 7152 }, { "epoch": 0.793589174800355, "grad_norm": 1.0851136445999146, "learning_rate": 1.3405225536110427e-05, "loss": 0.0681, "step": 7155 }, { "epoch": 0.793921916592724, "grad_norm": 0.2915292978286743, "learning_rate": 1.340152822282475e-05, "loss": 0.0265, "step": 7158 }, { "epoch": 0.7942546583850931, "grad_norm": 0.19492606818675995, "learning_rate": 1.3397830909539069e-05, "loss": 0.0322, "step": 7161 }, { "epoch": 0.7945874001774623, "grad_norm": 0.2916894853115082, "learning_rate": 1.3394133596253391e-05, "loss": 0.0381, "step": 7164 }, { "epoch": 0.7949201419698314, "grad_norm": 0.30363988876342773, "learning_rate": 1.3390436282967712e-05, "loss": 0.029, "step": 7167 }, { "epoch": 0.7952528837622005, "grad_norm": 0.48160505294799805, "learning_rate": 1.3386738969682031e-05, "loss": 0.077, "step": 7170 }, { "epoch": 0.7955856255545697, "grad_norm": 0.7897032499313354, "learning_rate": 1.3383041656396354e-05, "loss": 0.07, "step": 7173 }, { "epoch": 0.7959183673469388, "grad_norm": 0.6568636298179626, "learning_rate": 1.3379344343110675e-05, "loss": 0.0669, "step": 7176 }, { "epoch": 0.7962511091393079, "grad_norm": 0.6518925428390503, "learning_rate": 1.3375647029824994e-05, "loss": 0.0492, "step": 7179 }, { "epoch": 0.796583850931677, "grad_norm": 0.3010895252227783, "learning_rate": 1.3371949716539316e-05, "loss": 0.0725, "step": 7182 }, { "epoch": 0.7969165927240461, "grad_norm": 1.2115349769592285, "learning_rate": 1.3368252403253637e-05, "loss": 0.0803, "step": 7185 }, { "epoch": 0.7972493345164152, "grad_norm": 0.40287432074546814, "learning_rate": 1.336455508996796e-05, "loss": 0.034, "step": 7188 }, { "epoch": 0.7975820763087844, "grad_norm": 0.3306931257247925, "learning_rate": 1.3360857776682279e-05, "loss": 0.0307, "step": 7191 }, { "epoch": 0.7979148181011535, "grad_norm": 0.4771062135696411, "learning_rate": 1.33571604633966e-05, "loss": 0.0542, "step": 7194 }, { "epoch": 0.7982475598935226, "grad_norm": 0.5161990523338318, "learning_rate": 1.3353463150110922e-05, "loss": 0.0305, "step": 7197 }, { "epoch": 0.7985803016858918, "grad_norm": 0.5604353547096252, "learning_rate": 1.3349765836825241e-05, "loss": 0.0461, "step": 7200 }, { "epoch": 0.7989130434782609, "grad_norm": 0.41034963726997375, "learning_rate": 1.3346068523539562e-05, "loss": 0.0433, "step": 7203 }, { "epoch": 0.79924578527063, "grad_norm": 0.2582657039165497, "learning_rate": 1.3342371210253884e-05, "loss": 0.0521, "step": 7206 }, { "epoch": 0.7995785270629991, "grad_norm": 1.0298833847045898, "learning_rate": 1.3338673896968204e-05, "loss": 0.0991, "step": 7209 }, { "epoch": 0.7999112688553682, "grad_norm": 0.4682266414165497, "learning_rate": 1.3334976583682526e-05, "loss": 0.0405, "step": 7212 }, { "epoch": 0.8002440106477373, "grad_norm": 0.5854337811470032, "learning_rate": 1.3331279270396847e-05, "loss": 0.0869, "step": 7215 }, { "epoch": 0.8005767524401065, "grad_norm": 0.9837507009506226, "learning_rate": 1.3327581957111166e-05, "loss": 0.051, "step": 7218 }, { "epoch": 0.8009094942324756, "grad_norm": 0.46680304408073425, "learning_rate": 1.3323884643825489e-05, "loss": 0.0329, "step": 7221 }, { "epoch": 0.8012422360248447, "grad_norm": 0.5176241993904114, "learning_rate": 1.332018733053981e-05, "loss": 0.0383, "step": 7224 }, { "epoch": 0.8015749778172139, "grad_norm": 0.8460305333137512, "learning_rate": 1.3316490017254129e-05, "loss": 0.0836, "step": 7227 }, { "epoch": 0.801907719609583, "grad_norm": 1.6985629796981812, "learning_rate": 1.3312792703968451e-05, "loss": 0.046, "step": 7230 }, { "epoch": 0.802240461401952, "grad_norm": 0.25689876079559326, "learning_rate": 1.3309095390682772e-05, "loss": 0.0306, "step": 7233 }, { "epoch": 0.8025732031943212, "grad_norm": 0.6485418081283569, "learning_rate": 1.3305398077397091e-05, "loss": 0.0588, "step": 7236 }, { "epoch": 0.8029059449866903, "grad_norm": 0.45707669854164124, "learning_rate": 1.3301700764111413e-05, "loss": 0.0499, "step": 7239 }, { "epoch": 0.8032386867790594, "grad_norm": 0.6352065205574036, "learning_rate": 1.3298003450825734e-05, "loss": 0.0339, "step": 7242 }, { "epoch": 0.8035714285714286, "grad_norm": 0.7302868962287903, "learning_rate": 1.3294306137540057e-05, "loss": 0.086, "step": 7245 }, { "epoch": 0.8039041703637977, "grad_norm": 0.6422930955886841, "learning_rate": 1.3290608824254376e-05, "loss": 0.0675, "step": 7248 }, { "epoch": 0.8042369121561668, "grad_norm": 0.5489389300346375, "learning_rate": 1.3286911510968697e-05, "loss": 0.0806, "step": 7251 }, { "epoch": 0.804569653948536, "grad_norm": 0.43209078907966614, "learning_rate": 1.328321419768302e-05, "loss": 0.0854, "step": 7254 }, { "epoch": 0.804902395740905, "grad_norm": 0.37219834327697754, "learning_rate": 1.3279516884397338e-05, "loss": 0.0381, "step": 7257 }, { "epoch": 0.8052351375332741, "grad_norm": 0.7948285937309265, "learning_rate": 1.327581957111166e-05, "loss": 0.0349, "step": 7260 }, { "epoch": 0.8055678793256433, "grad_norm": 0.2715185284614563, "learning_rate": 1.3272122257825982e-05, "loss": 0.041, "step": 7263 }, { "epoch": 0.8059006211180124, "grad_norm": 0.7914915680885315, "learning_rate": 1.3268424944540301e-05, "loss": 0.0578, "step": 7266 }, { "epoch": 0.8062333629103815, "grad_norm": 0.49715787172317505, "learning_rate": 1.3264727631254623e-05, "loss": 0.0393, "step": 7269 }, { "epoch": 0.8065661047027507, "grad_norm": 0.6499034762382507, "learning_rate": 1.3261030317968944e-05, "loss": 0.0583, "step": 7272 }, { "epoch": 0.8068988464951198, "grad_norm": 0.26078471541404724, "learning_rate": 1.3257333004683263e-05, "loss": 0.0682, "step": 7275 }, { "epoch": 0.8072315882874889, "grad_norm": 0.8048827052116394, "learning_rate": 1.3253635691397586e-05, "loss": 0.0443, "step": 7278 }, { "epoch": 0.8075643300798581, "grad_norm": 0.32369327545166016, "learning_rate": 1.3249938378111907e-05, "loss": 0.059, "step": 7281 }, { "epoch": 0.8078970718722271, "grad_norm": 0.7105504870414734, "learning_rate": 1.3246241064826226e-05, "loss": 0.0566, "step": 7284 }, { "epoch": 0.8082298136645962, "grad_norm": 1.0010986328125, "learning_rate": 1.3242543751540548e-05, "loss": 0.0622, "step": 7287 }, { "epoch": 0.8085625554569654, "grad_norm": 0.5868343114852905, "learning_rate": 1.3238846438254869e-05, "loss": 0.0816, "step": 7290 }, { "epoch": 0.8088952972493345, "grad_norm": 1.2821122407913208, "learning_rate": 1.3235149124969192e-05, "loss": 0.0736, "step": 7293 }, { "epoch": 0.8092280390417036, "grad_norm": 0.39421650767326355, "learning_rate": 1.323145181168351e-05, "loss": 0.0422, "step": 7296 }, { "epoch": 0.8095607808340728, "grad_norm": 0.44155314564704895, "learning_rate": 1.3227754498397832e-05, "loss": 0.0635, "step": 7299 }, { "epoch": 0.8098935226264419, "grad_norm": 0.9237297177314758, "learning_rate": 1.3224057185112154e-05, "loss": 0.0588, "step": 7302 }, { "epoch": 0.810226264418811, "grad_norm": 0.28550535440444946, "learning_rate": 1.3220359871826473e-05, "loss": 0.0337, "step": 7305 }, { "epoch": 0.8105590062111802, "grad_norm": 1.1827166080474854, "learning_rate": 1.3216662558540794e-05, "loss": 0.0802, "step": 7308 }, { "epoch": 0.8108917480035492, "grad_norm": 0.7034807205200195, "learning_rate": 1.3212965245255117e-05, "loss": 0.0503, "step": 7311 }, { "epoch": 0.8112244897959183, "grad_norm": 0.2672681510448456, "learning_rate": 1.3209267931969436e-05, "loss": 0.0255, "step": 7314 }, { "epoch": 0.8115572315882875, "grad_norm": 0.8695570230484009, "learning_rate": 1.3205570618683758e-05, "loss": 0.0825, "step": 7317 }, { "epoch": 0.8118899733806566, "grad_norm": 0.8400031924247742, "learning_rate": 1.3201873305398079e-05, "loss": 0.0605, "step": 7320 }, { "epoch": 0.8122227151730257, "grad_norm": 0.6383177042007446, "learning_rate": 1.3198175992112398e-05, "loss": 0.049, "step": 7323 }, { "epoch": 0.8125554569653949, "grad_norm": 0.2260986566543579, "learning_rate": 1.319447867882672e-05, "loss": 0.0532, "step": 7326 }, { "epoch": 0.812888198757764, "grad_norm": 0.3341304063796997, "learning_rate": 1.3190781365541041e-05, "loss": 0.0435, "step": 7329 }, { "epoch": 0.813220940550133, "grad_norm": 0.3338419497013092, "learning_rate": 1.318708405225536e-05, "loss": 0.0287, "step": 7332 }, { "epoch": 0.8135536823425022, "grad_norm": 0.6725092530250549, "learning_rate": 1.3183386738969683e-05, "loss": 0.0698, "step": 7335 }, { "epoch": 0.8138864241348713, "grad_norm": 0.6151888370513916, "learning_rate": 1.3179689425684004e-05, "loss": 0.0527, "step": 7338 }, { "epoch": 0.8142191659272404, "grad_norm": 1.330954909324646, "learning_rate": 1.3175992112398326e-05, "loss": 0.0685, "step": 7341 }, { "epoch": 0.8145519077196096, "grad_norm": 0.4839244782924652, "learning_rate": 1.3172294799112646e-05, "loss": 0.0294, "step": 7344 }, { "epoch": 0.8148846495119787, "grad_norm": 0.2098318487405777, "learning_rate": 1.3168597485826966e-05, "loss": 0.0511, "step": 7347 }, { "epoch": 0.8152173913043478, "grad_norm": 0.43979382514953613, "learning_rate": 1.3164900172541289e-05, "loss": 0.056, "step": 7350 }, { "epoch": 0.815550133096717, "grad_norm": 0.5419193506240845, "learning_rate": 1.3161202859255608e-05, "loss": 0.0445, "step": 7353 }, { "epoch": 0.8158828748890861, "grad_norm": 0.32809171080589294, "learning_rate": 1.3157505545969929e-05, "loss": 0.0216, "step": 7356 }, { "epoch": 0.8162156166814551, "grad_norm": 0.46079355478286743, "learning_rate": 1.3153808232684251e-05, "loss": 0.0598, "step": 7359 }, { "epoch": 0.8165483584738243, "grad_norm": 0.46037569642066956, "learning_rate": 1.315011091939857e-05, "loss": 0.0985, "step": 7362 }, { "epoch": 0.8168811002661934, "grad_norm": 0.38519805669784546, "learning_rate": 1.3146413606112893e-05, "loss": 0.0537, "step": 7365 }, { "epoch": 0.8172138420585625, "grad_norm": 0.31656613945961, "learning_rate": 1.3142716292827214e-05, "loss": 0.0421, "step": 7368 }, { "epoch": 0.8175465838509317, "grad_norm": 0.3541574776172638, "learning_rate": 1.3139018979541533e-05, "loss": 0.0465, "step": 7371 }, { "epoch": 0.8178793256433008, "grad_norm": 0.751785159111023, "learning_rate": 1.3135321666255855e-05, "loss": 0.0379, "step": 7374 }, { "epoch": 0.8182120674356699, "grad_norm": 0.19678965210914612, "learning_rate": 1.3131624352970176e-05, "loss": 0.0677, "step": 7377 }, { "epoch": 0.8185448092280391, "grad_norm": 1.1748946905136108, "learning_rate": 1.3127927039684495e-05, "loss": 0.0845, "step": 7380 }, { "epoch": 0.8188775510204082, "grad_norm": 0.5204541683197021, "learning_rate": 1.3124229726398818e-05, "loss": 0.0656, "step": 7383 }, { "epoch": 0.8192102928127772, "grad_norm": 0.5619506239891052, "learning_rate": 1.3120532413113139e-05, "loss": 0.074, "step": 7386 }, { "epoch": 0.8195430346051464, "grad_norm": 0.283080130815506, "learning_rate": 1.3116835099827461e-05, "loss": 0.0409, "step": 7389 }, { "epoch": 0.8198757763975155, "grad_norm": 0.3270953893661499, "learning_rate": 1.311313778654178e-05, "loss": 0.0374, "step": 7392 }, { "epoch": 0.8202085181898846, "grad_norm": 0.6178573369979858, "learning_rate": 1.3109440473256101e-05, "loss": 0.0509, "step": 7395 }, { "epoch": 0.8205412599822538, "grad_norm": 0.6750481724739075, "learning_rate": 1.3105743159970424e-05, "loss": 0.0446, "step": 7398 }, { "epoch": 0.8208740017746229, "grad_norm": 0.30809807777404785, "learning_rate": 1.3102045846684743e-05, "loss": 0.0408, "step": 7401 }, { "epoch": 0.821206743566992, "grad_norm": 0.6466769576072693, "learning_rate": 1.3098348533399064e-05, "loss": 0.0606, "step": 7404 }, { "epoch": 0.8215394853593612, "grad_norm": 0.9239729642868042, "learning_rate": 1.3094651220113386e-05, "loss": 0.0621, "step": 7407 }, { "epoch": 0.8218722271517303, "grad_norm": 0.6958945393562317, "learning_rate": 1.3090953906827705e-05, "loss": 0.0725, "step": 7410 }, { "epoch": 0.8222049689440993, "grad_norm": 0.8560866713523865, "learning_rate": 1.3087256593542028e-05, "loss": 0.0594, "step": 7413 }, { "epoch": 0.8225377107364685, "grad_norm": 0.7524082660675049, "learning_rate": 1.3083559280256349e-05, "loss": 0.0568, "step": 7416 }, { "epoch": 0.8228704525288376, "grad_norm": 0.5793740749359131, "learning_rate": 1.3079861966970668e-05, "loss": 0.0434, "step": 7419 }, { "epoch": 0.8232031943212067, "grad_norm": 0.6920075416564941, "learning_rate": 1.307616465368499e-05, "loss": 0.048, "step": 7422 }, { "epoch": 0.8235359361135759, "grad_norm": 1.1000868082046509, "learning_rate": 1.3072467340399311e-05, "loss": 0.0471, "step": 7425 }, { "epoch": 0.823868677905945, "grad_norm": 0.3021376430988312, "learning_rate": 1.306877002711363e-05, "loss": 0.0383, "step": 7428 }, { "epoch": 0.8242014196983141, "grad_norm": 0.7696059942245483, "learning_rate": 1.3065072713827953e-05, "loss": 0.0451, "step": 7431 }, { "epoch": 0.8245341614906833, "grad_norm": 2.465219020843506, "learning_rate": 1.3061375400542274e-05, "loss": 0.0909, "step": 7434 }, { "epoch": 0.8248669032830523, "grad_norm": 0.3446286916732788, "learning_rate": 1.3057678087256596e-05, "loss": 0.1099, "step": 7437 }, { "epoch": 0.8251996450754214, "grad_norm": 0.48549941182136536, "learning_rate": 1.3053980773970915e-05, "loss": 0.0463, "step": 7440 }, { "epoch": 0.8255323868677906, "grad_norm": 0.8483194708824158, "learning_rate": 1.3050283460685236e-05, "loss": 0.0884, "step": 7443 }, { "epoch": 0.8258651286601597, "grad_norm": 0.566428542137146, "learning_rate": 1.3046586147399559e-05, "loss": 0.0478, "step": 7446 }, { "epoch": 0.8261978704525288, "grad_norm": 0.7313326597213745, "learning_rate": 1.3042888834113878e-05, "loss": 0.0465, "step": 7449 }, { "epoch": 0.826530612244898, "grad_norm": 0.37779733538627625, "learning_rate": 1.3039191520828199e-05, "loss": 0.0376, "step": 7452 }, { "epoch": 0.8268633540372671, "grad_norm": 0.31103476881980896, "learning_rate": 1.3035494207542521e-05, "loss": 0.0625, "step": 7455 }, { "epoch": 0.8271960958296362, "grad_norm": 0.6001980304718018, "learning_rate": 1.303179689425684e-05, "loss": 0.0512, "step": 7458 }, { "epoch": 0.8275288376220054, "grad_norm": 0.41833606362342834, "learning_rate": 1.3028099580971163e-05, "loss": 0.0422, "step": 7461 }, { "epoch": 0.8278615794143744, "grad_norm": 0.7734582424163818, "learning_rate": 1.3024402267685483e-05, "loss": 0.0411, "step": 7464 }, { "epoch": 0.8281943212067435, "grad_norm": 0.9161800146102905, "learning_rate": 1.3020704954399803e-05, "loss": 0.0417, "step": 7467 }, { "epoch": 0.8285270629991127, "grad_norm": 1.7327044010162354, "learning_rate": 1.3017007641114125e-05, "loss": 0.0456, "step": 7470 }, { "epoch": 0.8288598047914818, "grad_norm": 0.5528527498245239, "learning_rate": 1.3013310327828446e-05, "loss": 0.0591, "step": 7473 }, { "epoch": 0.8291925465838509, "grad_norm": 0.5668066740036011, "learning_rate": 1.3009613014542765e-05, "loss": 0.0537, "step": 7476 }, { "epoch": 0.8295252883762201, "grad_norm": 0.7676364183425903, "learning_rate": 1.3005915701257088e-05, "loss": 0.0526, "step": 7479 }, { "epoch": 0.8298580301685892, "grad_norm": 0.5485039949417114, "learning_rate": 1.3002218387971408e-05, "loss": 0.0277, "step": 7482 }, { "epoch": 0.8301907719609583, "grad_norm": 0.9598966240882874, "learning_rate": 1.2998521074685731e-05, "loss": 0.1076, "step": 7485 }, { "epoch": 0.8305235137533274, "grad_norm": 0.32002970576286316, "learning_rate": 1.299482376140005e-05, "loss": 0.0456, "step": 7488 }, { "epoch": 0.8308562555456965, "grad_norm": 0.6821205019950867, "learning_rate": 1.2991126448114371e-05, "loss": 0.0569, "step": 7491 }, { "epoch": 0.8311889973380656, "grad_norm": 0.6021560430526733, "learning_rate": 1.2987429134828693e-05, "loss": 0.0744, "step": 7494 }, { "epoch": 0.8315217391304348, "grad_norm": 0.8638403415679932, "learning_rate": 1.2983731821543013e-05, "loss": 0.0668, "step": 7497 }, { "epoch": 0.8318544809228039, "grad_norm": 0.7883462309837341, "learning_rate": 1.2980034508257333e-05, "loss": 0.0823, "step": 7500 }, { "epoch": 0.832187222715173, "grad_norm": 0.6928893327713013, "learning_rate": 1.2976337194971656e-05, "loss": 0.054, "step": 7503 }, { "epoch": 0.8325199645075422, "grad_norm": 0.6112784743309021, "learning_rate": 1.2972639881685975e-05, "loss": 0.0515, "step": 7506 }, { "epoch": 0.8328527062999113, "grad_norm": 1.8084242343902588, "learning_rate": 1.2968942568400297e-05, "loss": 0.0641, "step": 7509 }, { "epoch": 0.8331854480922803, "grad_norm": 0.6865580081939697, "learning_rate": 1.2965245255114618e-05, "loss": 0.0556, "step": 7512 }, { "epoch": 0.8335181898846495, "grad_norm": 1.205472707748413, "learning_rate": 1.2961547941828937e-05, "loss": 0.0615, "step": 7515 }, { "epoch": 0.8338509316770186, "grad_norm": 0.8076202273368835, "learning_rate": 1.295785062854326e-05, "loss": 0.1152, "step": 7518 }, { "epoch": 0.8341836734693877, "grad_norm": 0.456226646900177, "learning_rate": 1.295415331525758e-05, "loss": 0.0453, "step": 7521 }, { "epoch": 0.8345164152617569, "grad_norm": 0.3561217188835144, "learning_rate": 1.29504560019719e-05, "loss": 0.0378, "step": 7524 }, { "epoch": 0.834849157054126, "grad_norm": 0.77933269739151, "learning_rate": 1.2946758688686222e-05, "loss": 0.0633, "step": 7527 }, { "epoch": 0.8351818988464951, "grad_norm": 1.3405306339263916, "learning_rate": 1.2943061375400543e-05, "loss": 0.0491, "step": 7530 }, { "epoch": 0.8355146406388643, "grad_norm": 0.46247807145118713, "learning_rate": 1.2939364062114866e-05, "loss": 0.0698, "step": 7533 }, { "epoch": 0.8358473824312334, "grad_norm": 1.0123552083969116, "learning_rate": 1.2935666748829185e-05, "loss": 0.0598, "step": 7536 }, { "epoch": 0.8361801242236024, "grad_norm": 0.9487162828445435, "learning_rate": 1.2931969435543506e-05, "loss": 0.0673, "step": 7539 }, { "epoch": 0.8365128660159716, "grad_norm": 0.3524555265903473, "learning_rate": 1.2928272122257828e-05, "loss": 0.0604, "step": 7542 }, { "epoch": 0.8368456078083407, "grad_norm": 1.4302773475646973, "learning_rate": 1.2924574808972147e-05, "loss": 0.0514, "step": 7545 }, { "epoch": 0.8371783496007098, "grad_norm": 0.441597580909729, "learning_rate": 1.2920877495686468e-05, "loss": 0.0503, "step": 7548 }, { "epoch": 0.837511091393079, "grad_norm": 0.49569326639175415, "learning_rate": 1.291718018240079e-05, "loss": 0.0333, "step": 7551 }, { "epoch": 0.8378438331854481, "grad_norm": 1.5445497035980225, "learning_rate": 1.291348286911511e-05, "loss": 0.0344, "step": 7554 }, { "epoch": 0.8381765749778172, "grad_norm": 0.7204262018203735, "learning_rate": 1.2909785555829432e-05, "loss": 0.0589, "step": 7557 }, { "epoch": 0.8385093167701864, "grad_norm": 0.4354321360588074, "learning_rate": 1.2906088242543753e-05, "loss": 0.069, "step": 7560 }, { "epoch": 0.8388420585625554, "grad_norm": 0.39966753125190735, "learning_rate": 1.2902390929258072e-05, "loss": 0.0201, "step": 7563 }, { "epoch": 0.8391748003549245, "grad_norm": 0.2567201852798462, "learning_rate": 1.2898693615972395e-05, "loss": 0.0501, "step": 7566 }, { "epoch": 0.8395075421472937, "grad_norm": 0.7314066290855408, "learning_rate": 1.2894996302686716e-05, "loss": 0.0371, "step": 7569 }, { "epoch": 0.8398402839396628, "grad_norm": 0.5857692360877991, "learning_rate": 1.2891298989401035e-05, "loss": 0.0668, "step": 7572 }, { "epoch": 0.8401730257320319, "grad_norm": 0.7133364081382751, "learning_rate": 1.2887601676115357e-05, "loss": 0.0627, "step": 7575 }, { "epoch": 0.8405057675244011, "grad_norm": 0.4989025592803955, "learning_rate": 1.2883904362829678e-05, "loss": 0.056, "step": 7578 }, { "epoch": 0.8408385093167702, "grad_norm": 0.2834363877773285, "learning_rate": 1.2880207049544e-05, "loss": 0.0249, "step": 7581 }, { "epoch": 0.8411712511091393, "grad_norm": 0.14443466067314148, "learning_rate": 1.287650973625832e-05, "loss": 0.0421, "step": 7584 }, { "epoch": 0.8415039929015085, "grad_norm": 0.4321666657924652, "learning_rate": 1.287281242297264e-05, "loss": 0.0586, "step": 7587 }, { "epoch": 0.8418367346938775, "grad_norm": 1.0148134231567383, "learning_rate": 1.2869115109686963e-05, "loss": 0.0648, "step": 7590 }, { "epoch": 0.8421694764862466, "grad_norm": 0.30559831857681274, "learning_rate": 1.2865417796401282e-05, "loss": 0.0444, "step": 7593 }, { "epoch": 0.8425022182786158, "grad_norm": 0.3337336778640747, "learning_rate": 1.2861720483115603e-05, "loss": 0.05, "step": 7596 }, { "epoch": 0.8428349600709849, "grad_norm": 0.20605742931365967, "learning_rate": 1.2858023169829925e-05, "loss": 0.0534, "step": 7599 }, { "epoch": 0.843167701863354, "grad_norm": 0.4832225739955902, "learning_rate": 1.2854325856544245e-05, "loss": 0.0508, "step": 7602 }, { "epoch": 0.8435004436557232, "grad_norm": 0.42360275983810425, "learning_rate": 1.2850628543258567e-05, "loss": 0.0209, "step": 7605 }, { "epoch": 0.8438331854480923, "grad_norm": 0.5818816423416138, "learning_rate": 1.2846931229972888e-05, "loss": 0.0231, "step": 7608 }, { "epoch": 0.8441659272404614, "grad_norm": 0.7347850799560547, "learning_rate": 1.2843233916687207e-05, "loss": 0.0562, "step": 7611 }, { "epoch": 0.8444986690328306, "grad_norm": 0.44913449883461, "learning_rate": 1.283953660340153e-05, "loss": 0.0406, "step": 7614 }, { "epoch": 0.8448314108251996, "grad_norm": 0.9635486006736755, "learning_rate": 1.283583929011585e-05, "loss": 0.0462, "step": 7617 }, { "epoch": 0.8451641526175687, "grad_norm": 0.7667298316955566, "learning_rate": 1.283214197683017e-05, "loss": 0.036, "step": 7620 }, { "epoch": 0.8454968944099379, "grad_norm": 0.34087875485420227, "learning_rate": 1.2828444663544492e-05, "loss": 0.0441, "step": 7623 }, { "epoch": 0.845829636202307, "grad_norm": 0.7045387029647827, "learning_rate": 1.2824747350258813e-05, "loss": 0.0752, "step": 7626 }, { "epoch": 0.8461623779946761, "grad_norm": 0.5729896426200867, "learning_rate": 1.2821050036973135e-05, "loss": 0.0605, "step": 7629 }, { "epoch": 0.8464951197870453, "grad_norm": 0.6412808895111084, "learning_rate": 1.2817352723687454e-05, "loss": 0.0473, "step": 7632 }, { "epoch": 0.8468278615794144, "grad_norm": 0.52358078956604, "learning_rate": 1.2813655410401775e-05, "loss": 0.0683, "step": 7635 }, { "epoch": 0.8471606033717834, "grad_norm": 0.475503534078598, "learning_rate": 1.2809958097116098e-05, "loss": 0.0436, "step": 7638 }, { "epoch": 0.8474933451641526, "grad_norm": 0.43619441986083984, "learning_rate": 1.2806260783830417e-05, "loss": 0.0562, "step": 7641 }, { "epoch": 0.8478260869565217, "grad_norm": 0.891871988773346, "learning_rate": 1.2802563470544738e-05, "loss": 0.0498, "step": 7644 }, { "epoch": 0.8481588287488908, "grad_norm": 0.32903188467025757, "learning_rate": 1.279886615725906e-05, "loss": 0.0375, "step": 7647 }, { "epoch": 0.84849157054126, "grad_norm": 0.5414978265762329, "learning_rate": 1.279516884397338e-05, "loss": 0.056, "step": 7650 }, { "epoch": 0.8488243123336291, "grad_norm": 0.4450920820236206, "learning_rate": 1.2791471530687702e-05, "loss": 0.031, "step": 7653 }, { "epoch": 0.8491570541259982, "grad_norm": 0.1829187422990799, "learning_rate": 1.2787774217402023e-05, "loss": 0.0568, "step": 7656 }, { "epoch": 0.8494897959183674, "grad_norm": 1.0080453157424927, "learning_rate": 1.2784076904116342e-05, "loss": 0.0566, "step": 7659 }, { "epoch": 0.8498225377107365, "grad_norm": 0.5249780416488647, "learning_rate": 1.2780379590830664e-05, "loss": 0.0653, "step": 7662 }, { "epoch": 0.8501552795031055, "grad_norm": 0.47478652000427246, "learning_rate": 1.2776682277544985e-05, "loss": 0.0457, "step": 7665 }, { "epoch": 0.8504880212954747, "grad_norm": 0.3718480169773102, "learning_rate": 1.2772984964259304e-05, "loss": 0.0285, "step": 7668 }, { "epoch": 0.8508207630878438, "grad_norm": 0.8667476177215576, "learning_rate": 1.2769287650973627e-05, "loss": 0.0693, "step": 7671 }, { "epoch": 0.8511535048802129, "grad_norm": 1.1502543687820435, "learning_rate": 1.2765590337687948e-05, "loss": 0.0457, "step": 7674 }, { "epoch": 0.8514862466725821, "grad_norm": 0.6075298190116882, "learning_rate": 1.2761893024402267e-05, "loss": 0.0458, "step": 7677 }, { "epoch": 0.8518189884649512, "grad_norm": 0.6621043086051941, "learning_rate": 1.275819571111659e-05, "loss": 0.0496, "step": 7680 }, { "epoch": 0.8521517302573203, "grad_norm": 0.5987185835838318, "learning_rate": 1.275449839783091e-05, "loss": 0.0484, "step": 7683 }, { "epoch": 0.8524844720496895, "grad_norm": 1.1421009302139282, "learning_rate": 1.2750801084545233e-05, "loss": 0.0589, "step": 7686 }, { "epoch": 0.8528172138420586, "grad_norm": 0.3434879779815674, "learning_rate": 1.2747103771259552e-05, "loss": 0.0215, "step": 7689 }, { "epoch": 0.8531499556344276, "grad_norm": 0.7913606762886047, "learning_rate": 1.2743406457973873e-05, "loss": 0.0609, "step": 7692 }, { "epoch": 0.8534826974267968, "grad_norm": 0.42842620611190796, "learning_rate": 1.2739709144688195e-05, "loss": 0.0644, "step": 7695 }, { "epoch": 0.8538154392191659, "grad_norm": 0.541172206401825, "learning_rate": 1.2736011831402514e-05, "loss": 0.0477, "step": 7698 }, { "epoch": 0.854148181011535, "grad_norm": 1.0120563507080078, "learning_rate": 1.2732314518116835e-05, "loss": 0.0585, "step": 7701 }, { "epoch": 0.8544809228039042, "grad_norm": 0.3690725266933441, "learning_rate": 1.2728617204831158e-05, "loss": 0.0603, "step": 7704 }, { "epoch": 0.8548136645962733, "grad_norm": 0.7803979516029358, "learning_rate": 1.2724919891545477e-05, "loss": 0.078, "step": 7707 }, { "epoch": 0.8551464063886424, "grad_norm": 0.5937948226928711, "learning_rate": 1.27212225782598e-05, "loss": 0.0526, "step": 7710 }, { "epoch": 0.8554791481810116, "grad_norm": 0.6214296817779541, "learning_rate": 1.271752526497412e-05, "loss": 0.0799, "step": 7713 }, { "epoch": 0.8558118899733806, "grad_norm": 0.3988298177719116, "learning_rate": 1.271382795168844e-05, "loss": 0.0569, "step": 7716 }, { "epoch": 0.8561446317657497, "grad_norm": 0.727608859539032, "learning_rate": 1.2710130638402762e-05, "loss": 0.0722, "step": 7719 }, { "epoch": 0.8564773735581189, "grad_norm": 0.4343746304512024, "learning_rate": 1.2706433325117082e-05, "loss": 0.0659, "step": 7722 }, { "epoch": 0.856810115350488, "grad_norm": 0.5968846678733826, "learning_rate": 1.2702736011831403e-05, "loss": 0.0502, "step": 7725 }, { "epoch": 0.8571428571428571, "grad_norm": 0.522430419921875, "learning_rate": 1.2699038698545724e-05, "loss": 0.0648, "step": 7728 }, { "epoch": 0.8574755989352263, "grad_norm": 0.7417258620262146, "learning_rate": 1.2695341385260045e-05, "loss": 0.0647, "step": 7731 }, { "epoch": 0.8578083407275954, "grad_norm": 0.6392063498497009, "learning_rate": 1.2691644071974367e-05, "loss": 0.0387, "step": 7734 }, { "epoch": 0.8581410825199645, "grad_norm": 0.5037846565246582, "learning_rate": 1.2687946758688687e-05, "loss": 0.0361, "step": 7737 }, { "epoch": 0.8584738243123337, "grad_norm": 0.7957912087440491, "learning_rate": 1.2684249445403007e-05, "loss": 0.0576, "step": 7740 }, { "epoch": 0.8588065661047027, "grad_norm": 0.3617507219314575, "learning_rate": 1.268055213211733e-05, "loss": 0.0422, "step": 7743 }, { "epoch": 0.8591393078970718, "grad_norm": 0.6975365281105042, "learning_rate": 1.2676854818831649e-05, "loss": 0.0509, "step": 7746 }, { "epoch": 0.859472049689441, "grad_norm": 0.4433472156524658, "learning_rate": 1.267315750554597e-05, "loss": 0.0659, "step": 7749 }, { "epoch": 0.8598047914818101, "grad_norm": 0.4558544158935547, "learning_rate": 1.2669460192260292e-05, "loss": 0.0471, "step": 7752 }, { "epoch": 0.8601375332741792, "grad_norm": 0.48671403527259827, "learning_rate": 1.2665762878974612e-05, "loss": 0.0715, "step": 7755 }, { "epoch": 0.8604702750665484, "grad_norm": 0.562803328037262, "learning_rate": 1.2662065565688934e-05, "loss": 0.0417, "step": 7758 }, { "epoch": 0.8608030168589175, "grad_norm": 0.9497453570365906, "learning_rate": 1.2658368252403255e-05, "loss": 0.058, "step": 7761 }, { "epoch": 0.8611357586512866, "grad_norm": 0.7908824682235718, "learning_rate": 1.2654670939117576e-05, "loss": 0.0435, "step": 7764 }, { "epoch": 0.8614685004436557, "grad_norm": 0.3684213161468506, "learning_rate": 1.2650973625831896e-05, "loss": 0.0421, "step": 7767 }, { "epoch": 0.8618012422360248, "grad_norm": 0.46868011355400085, "learning_rate": 1.2647276312546217e-05, "loss": 0.0453, "step": 7770 }, { "epoch": 0.8621339840283939, "grad_norm": 0.4781460464000702, "learning_rate": 1.2643578999260538e-05, "loss": 0.0457, "step": 7773 }, { "epoch": 0.8624667258207631, "grad_norm": 0.5065357685089111, "learning_rate": 1.2639881685974859e-05, "loss": 0.0515, "step": 7776 }, { "epoch": 0.8627994676131322, "grad_norm": 0.3533953130245209, "learning_rate": 1.263618437268918e-05, "loss": 0.0459, "step": 7779 }, { "epoch": 0.8631322094055013, "grad_norm": 0.8925074338912964, "learning_rate": 1.2632487059403502e-05, "loss": 0.0564, "step": 7782 }, { "epoch": 0.8634649511978705, "grad_norm": 0.34452107548713684, "learning_rate": 1.2628789746117821e-05, "loss": 0.0409, "step": 7785 }, { "epoch": 0.8637976929902396, "grad_norm": 0.6587271690368652, "learning_rate": 1.2625092432832142e-05, "loss": 0.0476, "step": 7788 }, { "epoch": 0.8641304347826086, "grad_norm": 0.6241523027420044, "learning_rate": 1.2621395119546465e-05, "loss": 0.0276, "step": 7791 }, { "epoch": 0.8644631765749778, "grad_norm": 0.5111555457115173, "learning_rate": 1.2617697806260784e-05, "loss": 0.0785, "step": 7794 }, { "epoch": 0.8647959183673469, "grad_norm": 0.33080583810806274, "learning_rate": 1.2614000492975105e-05, "loss": 0.0447, "step": 7797 }, { "epoch": 0.865128660159716, "grad_norm": 0.6154320240020752, "learning_rate": 1.2610303179689427e-05, "loss": 0.0292, "step": 7800 }, { "epoch": 0.8654614019520852, "grad_norm": 1.1219552755355835, "learning_rate": 1.2606605866403746e-05, "loss": 0.0946, "step": 7803 }, { "epoch": 0.8657941437444543, "grad_norm": 0.7328761219978333, "learning_rate": 1.2602908553118069e-05, "loss": 0.0661, "step": 7806 }, { "epoch": 0.8661268855368234, "grad_norm": 0.5077900290489197, "learning_rate": 1.259921123983239e-05, "loss": 0.0342, "step": 7809 }, { "epoch": 0.8664596273291926, "grad_norm": 0.8185129761695862, "learning_rate": 1.259551392654671e-05, "loss": 0.0338, "step": 7812 }, { "epoch": 0.8667923691215617, "grad_norm": 0.23956714570522308, "learning_rate": 1.2591816613261031e-05, "loss": 0.0509, "step": 7815 }, { "epoch": 0.8671251109139307, "grad_norm": 0.6616832613945007, "learning_rate": 1.2588119299975352e-05, "loss": 0.0387, "step": 7818 }, { "epoch": 0.8674578527062999, "grad_norm": 0.3278239667415619, "learning_rate": 1.2584421986689673e-05, "loss": 0.0523, "step": 7821 }, { "epoch": 0.867790594498669, "grad_norm": 0.44100144505500793, "learning_rate": 1.2580724673403994e-05, "loss": 0.0448, "step": 7824 }, { "epoch": 0.8681233362910381, "grad_norm": 0.7832165956497192, "learning_rate": 1.2577027360118315e-05, "loss": 0.0285, "step": 7827 }, { "epoch": 0.8684560780834073, "grad_norm": 0.39864689111709595, "learning_rate": 1.2573330046832637e-05, "loss": 0.0537, "step": 7830 }, { "epoch": 0.8687888198757764, "grad_norm": 0.6101077198982239, "learning_rate": 1.2569632733546956e-05, "loss": 0.0702, "step": 7833 }, { "epoch": 0.8691215616681455, "grad_norm": 0.7591128349304199, "learning_rate": 1.2565935420261277e-05, "loss": 0.0605, "step": 7836 }, { "epoch": 0.8694543034605147, "grad_norm": 0.7345403432846069, "learning_rate": 1.25622381069756e-05, "loss": 0.0628, "step": 7839 }, { "epoch": 0.8697870452528838, "grad_norm": 0.6341096758842468, "learning_rate": 1.2558540793689919e-05, "loss": 0.0626, "step": 7842 }, { "epoch": 0.8701197870452528, "grad_norm": 0.7364404201507568, "learning_rate": 1.255484348040424e-05, "loss": 0.074, "step": 7845 }, { "epoch": 0.870452528837622, "grad_norm": 0.3706710934638977, "learning_rate": 1.2551146167118562e-05, "loss": 0.0352, "step": 7848 }, { "epoch": 0.8707852706299911, "grad_norm": 1.358520269393921, "learning_rate": 1.2547448853832883e-05, "loss": 0.0359, "step": 7851 }, { "epoch": 0.8711180124223602, "grad_norm": 0.21366752684116364, "learning_rate": 1.2543751540547204e-05, "loss": 0.0541, "step": 7854 }, { "epoch": 0.8714507542147294, "grad_norm": 0.6452783346176147, "learning_rate": 1.2540054227261524e-05, "loss": 0.055, "step": 7857 }, { "epoch": 0.8717834960070985, "grad_norm": 0.328064501285553, "learning_rate": 1.2536356913975845e-05, "loss": 0.059, "step": 7860 }, { "epoch": 0.8721162377994676, "grad_norm": 0.5891025066375732, "learning_rate": 1.2532659600690166e-05, "loss": 0.0741, "step": 7863 }, { "epoch": 0.8724489795918368, "grad_norm": 0.35679006576538086, "learning_rate": 1.2528962287404487e-05, "loss": 0.0406, "step": 7866 }, { "epoch": 0.8727817213842058, "grad_norm": 0.19496145844459534, "learning_rate": 1.2525264974118808e-05, "loss": 0.049, "step": 7869 }, { "epoch": 0.8731144631765749, "grad_norm": 0.4196733832359314, "learning_rate": 1.2521567660833129e-05, "loss": 0.0318, "step": 7872 }, { "epoch": 0.8734472049689441, "grad_norm": 0.4948839247226715, "learning_rate": 1.251787034754745e-05, "loss": 0.0377, "step": 7875 }, { "epoch": 0.8737799467613132, "grad_norm": 0.8529870510101318, "learning_rate": 1.2514173034261772e-05, "loss": 0.0882, "step": 7878 }, { "epoch": 0.8741126885536823, "grad_norm": 0.20677830278873444, "learning_rate": 1.2510475720976091e-05, "loss": 0.0402, "step": 7881 }, { "epoch": 0.8744454303460515, "grad_norm": 0.26061496138572693, "learning_rate": 1.2506778407690412e-05, "loss": 0.0523, "step": 7884 }, { "epoch": 0.8747781721384206, "grad_norm": 0.9495267868041992, "learning_rate": 1.2503081094404734e-05, "loss": 0.0629, "step": 7887 }, { "epoch": 0.8751109139307897, "grad_norm": 0.6823452115058899, "learning_rate": 1.2499383781119055e-05, "loss": 0.0627, "step": 7890 }, { "epoch": 0.8754436557231589, "grad_norm": 0.8460421562194824, "learning_rate": 1.2495686467833374e-05, "loss": 0.0685, "step": 7893 }, { "epoch": 0.8757763975155279, "grad_norm": 0.3861057460308075, "learning_rate": 1.2491989154547697e-05, "loss": 0.0436, "step": 7896 }, { "epoch": 0.876109139307897, "grad_norm": 1.0978728532791138, "learning_rate": 1.2488291841262018e-05, "loss": 0.0702, "step": 7899 }, { "epoch": 0.8764418811002662, "grad_norm": 0.5093382000923157, "learning_rate": 1.2484594527976338e-05, "loss": 0.0293, "step": 7902 }, { "epoch": 0.8767746228926353, "grad_norm": 0.7078436017036438, "learning_rate": 1.248089721469066e-05, "loss": 0.0454, "step": 7905 }, { "epoch": 0.8771073646850044, "grad_norm": 0.4556216299533844, "learning_rate": 1.247719990140498e-05, "loss": 0.0356, "step": 7908 }, { "epoch": 0.8774401064773736, "grad_norm": 0.4774017035961151, "learning_rate": 1.2473502588119301e-05, "loss": 0.0384, "step": 7911 }, { "epoch": 0.8777728482697427, "grad_norm": 0.42073532938957214, "learning_rate": 1.2469805274833622e-05, "loss": 0.06, "step": 7914 }, { "epoch": 0.8781055900621118, "grad_norm": 0.41442716121673584, "learning_rate": 1.2466107961547943e-05, "loss": 0.0236, "step": 7917 }, { "epoch": 0.878438331854481, "grad_norm": 0.625836968421936, "learning_rate": 1.2462410648262263e-05, "loss": 0.0591, "step": 7920 }, { "epoch": 0.87877107364685, "grad_norm": 0.5491658449172974, "learning_rate": 1.2458713334976584e-05, "loss": 0.0705, "step": 7923 }, { "epoch": 0.8791038154392191, "grad_norm": 0.36761680245399475, "learning_rate": 1.2455016021690907e-05, "loss": 0.0758, "step": 7926 }, { "epoch": 0.8794365572315883, "grad_norm": 0.6091301441192627, "learning_rate": 1.2451318708405228e-05, "loss": 0.042, "step": 7929 }, { "epoch": 0.8797692990239574, "grad_norm": 0.31299662590026855, "learning_rate": 1.2447621395119547e-05, "loss": 0.0404, "step": 7932 }, { "epoch": 0.8801020408163265, "grad_norm": 0.6645473837852478, "learning_rate": 1.244392408183387e-05, "loss": 0.0611, "step": 7935 }, { "epoch": 0.8804347826086957, "grad_norm": 0.48847904801368713, "learning_rate": 1.244022676854819e-05, "loss": 0.0457, "step": 7938 }, { "epoch": 0.8807675244010648, "grad_norm": 0.4157494902610779, "learning_rate": 1.2436529455262509e-05, "loss": 0.0284, "step": 7941 }, { "epoch": 0.8811002661934338, "grad_norm": 0.7567192316055298, "learning_rate": 1.2432832141976832e-05, "loss": 0.0385, "step": 7944 }, { "epoch": 0.881433007985803, "grad_norm": 0.3110061585903168, "learning_rate": 1.2429134828691152e-05, "loss": 0.0708, "step": 7947 }, { "epoch": 0.8817657497781721, "grad_norm": 0.6832455992698669, "learning_rate": 1.2425437515405473e-05, "loss": 0.0219, "step": 7950 }, { "epoch": 0.8820984915705412, "grad_norm": 0.3486057221889496, "learning_rate": 1.2421740202119794e-05, "loss": 0.0282, "step": 7953 }, { "epoch": 0.8824312333629104, "grad_norm": 0.757685124874115, "learning_rate": 1.2418042888834115e-05, "loss": 0.0604, "step": 7956 }, { "epoch": 0.8827639751552795, "grad_norm": 0.9962036609649658, "learning_rate": 1.2414345575548436e-05, "loss": 0.0905, "step": 7959 }, { "epoch": 0.8830967169476486, "grad_norm": 0.8406308889389038, "learning_rate": 1.2410648262262757e-05, "loss": 0.0617, "step": 7962 }, { "epoch": 0.8834294587400178, "grad_norm": 0.35052600502967834, "learning_rate": 1.2406950948977077e-05, "loss": 0.0405, "step": 7965 }, { "epoch": 0.8837622005323869, "grad_norm": 0.6545705795288086, "learning_rate": 1.2403253635691398e-05, "loss": 0.0512, "step": 7968 }, { "epoch": 0.8840949423247559, "grad_norm": 0.27982020378112793, "learning_rate": 1.2399556322405719e-05, "loss": 0.0606, "step": 7971 }, { "epoch": 0.8844276841171251, "grad_norm": 0.22846142947673798, "learning_rate": 1.2395859009120042e-05, "loss": 0.0604, "step": 7974 }, { "epoch": 0.8847604259094942, "grad_norm": 0.4169063866138458, "learning_rate": 1.2392161695834362e-05, "loss": 0.0489, "step": 7977 }, { "epoch": 0.8850931677018633, "grad_norm": 0.935775101184845, "learning_rate": 1.2388464382548681e-05, "loss": 0.0809, "step": 7980 }, { "epoch": 0.8854259094942325, "grad_norm": 0.7642662525177002, "learning_rate": 1.2384767069263004e-05, "loss": 0.0429, "step": 7983 }, { "epoch": 0.8857586512866016, "grad_norm": 0.4463235139846802, "learning_rate": 1.2381069755977325e-05, "loss": 0.0573, "step": 7986 }, { "epoch": 0.8860913930789707, "grad_norm": 0.6088737845420837, "learning_rate": 1.2377372442691644e-05, "loss": 0.0469, "step": 7989 }, { "epoch": 0.8864241348713399, "grad_norm": 1.0122240781784058, "learning_rate": 1.2373675129405966e-05, "loss": 0.0506, "step": 7992 }, { "epoch": 0.886756876663709, "grad_norm": 0.48188844323158264, "learning_rate": 1.2369977816120287e-05, "loss": 0.0417, "step": 7995 }, { "epoch": 0.887089618456078, "grad_norm": 0.446555495262146, "learning_rate": 1.2366280502834608e-05, "loss": 0.0699, "step": 7998 }, { "epoch": 0.8874223602484472, "grad_norm": 0.41115742921829224, "learning_rate": 1.2362583189548929e-05, "loss": 0.049, "step": 8001 }, { "epoch": 0.8877551020408163, "grad_norm": 0.5750285387039185, "learning_rate": 1.235888587626325e-05, "loss": 0.0353, "step": 8004 }, { "epoch": 0.8880878438331854, "grad_norm": 0.3349131941795349, "learning_rate": 1.235518856297757e-05, "loss": 0.0574, "step": 8007 }, { "epoch": 0.8884205856255546, "grad_norm": 0.6187319159507751, "learning_rate": 1.2351491249691891e-05, "loss": 0.0686, "step": 8010 }, { "epoch": 0.8887533274179237, "grad_norm": 0.5889426469802856, "learning_rate": 1.2347793936406212e-05, "loss": 0.0361, "step": 8013 }, { "epoch": 0.8890860692102928, "grad_norm": 0.5451267957687378, "learning_rate": 1.2344096623120535e-05, "loss": 0.0706, "step": 8016 }, { "epoch": 0.889418811002662, "grad_norm": 0.5967592597007751, "learning_rate": 1.2340399309834854e-05, "loss": 0.0356, "step": 8019 }, { "epoch": 0.889751552795031, "grad_norm": 0.34671586751937866, "learning_rate": 1.2336701996549176e-05, "loss": 0.0668, "step": 8022 }, { "epoch": 0.8900842945874001, "grad_norm": 0.6447118520736694, "learning_rate": 1.2333004683263497e-05, "loss": 0.0689, "step": 8025 }, { "epoch": 0.8904170363797693, "grad_norm": 0.4724440574645996, "learning_rate": 1.2329307369977816e-05, "loss": 0.0404, "step": 8028 }, { "epoch": 0.8907497781721384, "grad_norm": 0.6874486207962036, "learning_rate": 1.2325610056692139e-05, "loss": 0.0433, "step": 8031 }, { "epoch": 0.8910825199645075, "grad_norm": 0.9315608739852905, "learning_rate": 1.232191274340646e-05, "loss": 0.0939, "step": 8034 }, { "epoch": 0.8914152617568767, "grad_norm": 0.5204129219055176, "learning_rate": 1.2318215430120779e-05, "loss": 0.0417, "step": 8037 }, { "epoch": 0.8917480035492458, "grad_norm": 0.48837822675704956, "learning_rate": 1.2314518116835101e-05, "loss": 0.0591, "step": 8040 }, { "epoch": 0.8920807453416149, "grad_norm": 0.755622923374176, "learning_rate": 1.2310820803549422e-05, "loss": 0.0375, "step": 8043 }, { "epoch": 0.892413487133984, "grad_norm": 0.5617904663085938, "learning_rate": 1.2307123490263743e-05, "loss": 0.0507, "step": 8046 }, { "epoch": 0.8927462289263531, "grad_norm": 0.5224210619926453, "learning_rate": 1.2303426176978064e-05, "loss": 0.0671, "step": 8049 }, { "epoch": 0.8930789707187222, "grad_norm": 0.6684830784797668, "learning_rate": 1.2299728863692385e-05, "loss": 0.0623, "step": 8052 }, { "epoch": 0.8934117125110914, "grad_norm": 0.12021299451589584, "learning_rate": 1.2296031550406707e-05, "loss": 0.0277, "step": 8055 }, { "epoch": 0.8937444543034605, "grad_norm": 0.35132908821105957, "learning_rate": 1.2292334237121026e-05, "loss": 0.0489, "step": 8058 }, { "epoch": 0.8940771960958296, "grad_norm": 0.20670370757579803, "learning_rate": 1.2288636923835347e-05, "loss": 0.0704, "step": 8061 }, { "epoch": 0.8944099378881988, "grad_norm": 0.5440886616706848, "learning_rate": 1.228493961054967e-05, "loss": 0.0401, "step": 8064 }, { "epoch": 0.8947426796805679, "grad_norm": 0.8803424835205078, "learning_rate": 1.2281242297263989e-05, "loss": 0.0429, "step": 8067 }, { "epoch": 0.895075421472937, "grad_norm": 0.4431580603122711, "learning_rate": 1.2277544983978311e-05, "loss": 0.0683, "step": 8070 }, { "epoch": 0.8954081632653061, "grad_norm": 0.25200363993644714, "learning_rate": 1.2273847670692632e-05, "loss": 0.0256, "step": 8073 }, { "epoch": 0.8957409050576752, "grad_norm": 0.5720739960670471, "learning_rate": 1.2270150357406951e-05, "loss": 0.057, "step": 8076 }, { "epoch": 0.8960736468500443, "grad_norm": 0.5777389407157898, "learning_rate": 1.2266453044121274e-05, "loss": 0.0597, "step": 8079 }, { "epoch": 0.8964063886424135, "grad_norm": 0.289869099855423, "learning_rate": 1.2262755730835594e-05, "loss": 0.0452, "step": 8082 }, { "epoch": 0.8967391304347826, "grad_norm": 0.6418533325195312, "learning_rate": 1.2259058417549914e-05, "loss": 0.0359, "step": 8085 }, { "epoch": 0.8970718722271517, "grad_norm": 0.8481020331382751, "learning_rate": 1.2255361104264236e-05, "loss": 0.0769, "step": 8088 }, { "epoch": 0.8974046140195209, "grad_norm": 0.5481016635894775, "learning_rate": 1.2251663790978557e-05, "loss": 0.0485, "step": 8091 }, { "epoch": 0.89773735581189, "grad_norm": 0.37868669629096985, "learning_rate": 1.224796647769288e-05, "loss": 0.0211, "step": 8094 }, { "epoch": 0.898070097604259, "grad_norm": 0.07777074724435806, "learning_rate": 1.2244269164407199e-05, "loss": 0.0476, "step": 8097 }, { "epoch": 0.8984028393966282, "grad_norm": 0.1662084311246872, "learning_rate": 1.224057185112152e-05, "loss": 0.0437, "step": 8100 }, { "epoch": 0.8987355811889973, "grad_norm": 0.5999681949615479, "learning_rate": 1.2236874537835842e-05, "loss": 0.0867, "step": 8103 }, { "epoch": 0.8990683229813664, "grad_norm": 0.4055352210998535, "learning_rate": 1.2233177224550161e-05, "loss": 0.0481, "step": 8106 }, { "epoch": 0.8994010647737356, "grad_norm": 0.7253134250640869, "learning_rate": 1.2229479911264482e-05, "loss": 0.0448, "step": 8109 }, { "epoch": 0.8997338065661047, "grad_norm": 0.346949964761734, "learning_rate": 1.2225782597978804e-05, "loss": 0.037, "step": 8112 }, { "epoch": 0.9000665483584738, "grad_norm": 0.20739799737930298, "learning_rate": 1.2222085284693123e-05, "loss": 0.0526, "step": 8115 }, { "epoch": 0.900399290150843, "grad_norm": 0.2521004378795624, "learning_rate": 1.2218387971407444e-05, "loss": 0.0344, "step": 8118 }, { "epoch": 0.900732031943212, "grad_norm": 0.6057805418968201, "learning_rate": 1.2214690658121767e-05, "loss": 0.0645, "step": 8121 }, { "epoch": 0.9010647737355811, "grad_norm": 0.4590505063533783, "learning_rate": 1.2210993344836086e-05, "loss": 0.0613, "step": 8124 }, { "epoch": 0.9013975155279503, "grad_norm": 0.3552038371562958, "learning_rate": 1.2207296031550408e-05, "loss": 0.0364, "step": 8127 }, { "epoch": 0.9017302573203194, "grad_norm": 0.7429446578025818, "learning_rate": 1.220359871826473e-05, "loss": 0.0706, "step": 8130 }, { "epoch": 0.9020629991126885, "grad_norm": 0.3581428825855255, "learning_rate": 1.2199901404979048e-05, "loss": 0.0765, "step": 8133 }, { "epoch": 0.9023957409050577, "grad_norm": 1.0063191652297974, "learning_rate": 1.2196204091693371e-05, "loss": 0.0652, "step": 8136 }, { "epoch": 0.9027284826974268, "grad_norm": 0.5285840034484863, "learning_rate": 1.2192506778407692e-05, "loss": 0.0473, "step": 8139 }, { "epoch": 0.9030612244897959, "grad_norm": 0.6088361740112305, "learning_rate": 1.2188809465122011e-05, "loss": 0.0963, "step": 8142 }, { "epoch": 0.9033939662821651, "grad_norm": 0.3821619749069214, "learning_rate": 1.2185112151836333e-05, "loss": 0.0284, "step": 8145 }, { "epoch": 0.9037267080745341, "grad_norm": 0.7411423921585083, "learning_rate": 1.2181414838550654e-05, "loss": 0.0588, "step": 8148 }, { "epoch": 0.9040594498669032, "grad_norm": 1.0509227514266968, "learning_rate": 1.2177717525264977e-05, "loss": 0.0655, "step": 8151 }, { "epoch": 0.9043921916592724, "grad_norm": 0.6277338266372681, "learning_rate": 1.2174020211979296e-05, "loss": 0.0699, "step": 8154 }, { "epoch": 0.9047249334516415, "grad_norm": 0.398895800113678, "learning_rate": 1.2170322898693617e-05, "loss": 0.0821, "step": 8157 }, { "epoch": 0.9050576752440106, "grad_norm": 0.6099722385406494, "learning_rate": 1.216662558540794e-05, "loss": 0.0427, "step": 8160 }, { "epoch": 0.9053904170363798, "grad_norm": 0.7993012070655823, "learning_rate": 1.2162928272122258e-05, "loss": 0.0754, "step": 8163 }, { "epoch": 0.9057231588287489, "grad_norm": 0.3248973488807678, "learning_rate": 1.2159230958836579e-05, "loss": 0.0332, "step": 8166 }, { "epoch": 0.906055900621118, "grad_norm": 0.5748999714851379, "learning_rate": 1.2155533645550902e-05, "loss": 0.0743, "step": 8169 }, { "epoch": 0.9063886424134872, "grad_norm": 1.1747409105300903, "learning_rate": 1.215183633226522e-05, "loss": 0.0628, "step": 8172 }, { "epoch": 0.9067213842058562, "grad_norm": 0.6361246109008789, "learning_rate": 1.2148139018979543e-05, "loss": 0.0653, "step": 8175 }, { "epoch": 0.9070541259982253, "grad_norm": 0.5080873370170593, "learning_rate": 1.2144441705693864e-05, "loss": 0.0559, "step": 8178 }, { "epoch": 0.9073868677905945, "grad_norm": 0.7901752591133118, "learning_rate": 1.2140744392408183e-05, "loss": 0.0477, "step": 8181 }, { "epoch": 0.9077196095829636, "grad_norm": 0.6943324208259583, "learning_rate": 1.2137047079122506e-05, "loss": 0.0558, "step": 8184 }, { "epoch": 0.9080523513753327, "grad_norm": 0.14815621078014374, "learning_rate": 1.2133349765836827e-05, "loss": 0.0468, "step": 8187 }, { "epoch": 0.9083850931677019, "grad_norm": 0.6462356448173523, "learning_rate": 1.2129652452551146e-05, "loss": 0.056, "step": 8190 }, { "epoch": 0.908717834960071, "grad_norm": 0.6744848489761353, "learning_rate": 1.2125955139265468e-05, "loss": 0.0477, "step": 8193 }, { "epoch": 0.90905057675244, "grad_norm": 0.584004819393158, "learning_rate": 1.2122257825979789e-05, "loss": 0.0512, "step": 8196 }, { "epoch": 0.9093833185448092, "grad_norm": 1.3915331363677979, "learning_rate": 1.2118560512694112e-05, "loss": 0.0782, "step": 8199 }, { "epoch": 0.9097160603371783, "grad_norm": 1.4578176736831665, "learning_rate": 1.211486319940843e-05, "loss": 0.0937, "step": 8202 }, { "epoch": 0.9100488021295474, "grad_norm": 0.48209354281425476, "learning_rate": 1.2111165886122751e-05, "loss": 0.0655, "step": 8205 }, { "epoch": 0.9103815439219166, "grad_norm": 0.1904342621564865, "learning_rate": 1.2107468572837074e-05, "loss": 0.0489, "step": 8208 }, { "epoch": 0.9107142857142857, "grad_norm": 0.6230872869491577, "learning_rate": 1.2103771259551393e-05, "loss": 0.067, "step": 8211 }, { "epoch": 0.9110470275066548, "grad_norm": 0.4631042182445526, "learning_rate": 1.2100073946265714e-05, "loss": 0.0392, "step": 8214 }, { "epoch": 0.911379769299024, "grad_norm": 0.8700494170188904, "learning_rate": 1.2096376632980036e-05, "loss": 0.0317, "step": 8217 }, { "epoch": 0.9117125110913931, "grad_norm": 0.6716225743293762, "learning_rate": 1.2092679319694356e-05, "loss": 0.0364, "step": 8220 }, { "epoch": 0.9120452528837621, "grad_norm": 0.36181390285491943, "learning_rate": 1.2088982006408678e-05, "loss": 0.0202, "step": 8223 }, { "epoch": 0.9123779946761313, "grad_norm": 0.9662750363349915, "learning_rate": 1.2085284693122999e-05, "loss": 0.0491, "step": 8226 }, { "epoch": 0.9127107364685004, "grad_norm": 0.5570537447929382, "learning_rate": 1.2081587379837318e-05, "loss": 0.0448, "step": 8229 }, { "epoch": 0.9130434782608695, "grad_norm": 0.7293334007263184, "learning_rate": 1.207789006655164e-05, "loss": 0.0549, "step": 8232 }, { "epoch": 0.9133762200532387, "grad_norm": 0.6068324446678162, "learning_rate": 1.2074192753265961e-05, "loss": 0.0304, "step": 8235 }, { "epoch": 0.9137089618456078, "grad_norm": 0.5527037382125854, "learning_rate": 1.207049543998028e-05, "loss": 0.0375, "step": 8238 }, { "epoch": 0.9140417036379769, "grad_norm": 0.8950066566467285, "learning_rate": 1.2066798126694603e-05, "loss": 0.0318, "step": 8241 }, { "epoch": 0.9143744454303461, "grad_norm": 0.747451901435852, "learning_rate": 1.2063100813408924e-05, "loss": 0.0602, "step": 8244 }, { "epoch": 0.9147071872227152, "grad_norm": 0.6630997061729431, "learning_rate": 1.2059403500123246e-05, "loss": 0.0607, "step": 8247 }, { "epoch": 0.9150399290150842, "grad_norm": 0.7750911712646484, "learning_rate": 1.2055706186837565e-05, "loss": 0.0493, "step": 8250 }, { "epoch": 0.9153726708074534, "grad_norm": 0.5328304171562195, "learning_rate": 1.2052008873551886e-05, "loss": 0.0324, "step": 8253 }, { "epoch": 0.9157054125998225, "grad_norm": 0.5888455510139465, "learning_rate": 1.2048311560266209e-05, "loss": 0.0488, "step": 8256 }, { "epoch": 0.9160381543921916, "grad_norm": 0.2705594599246979, "learning_rate": 1.2044614246980528e-05, "loss": 0.0511, "step": 8259 }, { "epoch": 0.9163708961845608, "grad_norm": 0.7600246071815491, "learning_rate": 1.2040916933694849e-05, "loss": 0.0467, "step": 8262 }, { "epoch": 0.9167036379769299, "grad_norm": 0.34052133560180664, "learning_rate": 1.2037219620409171e-05, "loss": 0.0473, "step": 8265 }, { "epoch": 0.917036379769299, "grad_norm": 0.7606754899024963, "learning_rate": 1.203352230712349e-05, "loss": 0.0465, "step": 8268 }, { "epoch": 0.9173691215616682, "grad_norm": 0.9704463481903076, "learning_rate": 1.2029824993837813e-05, "loss": 0.0439, "step": 8271 }, { "epoch": 0.9177018633540373, "grad_norm": 0.6106446981430054, "learning_rate": 1.2026127680552134e-05, "loss": 0.045, "step": 8274 }, { "epoch": 0.9180346051464063, "grad_norm": 0.6747317314147949, "learning_rate": 1.2022430367266453e-05, "loss": 0.0624, "step": 8277 }, { "epoch": 0.9183673469387755, "grad_norm": 0.6766435503959656, "learning_rate": 1.2018733053980775e-05, "loss": 0.0448, "step": 8280 }, { "epoch": 0.9187000887311446, "grad_norm": 0.5824471116065979, "learning_rate": 1.2015035740695096e-05, "loss": 0.0787, "step": 8283 }, { "epoch": 0.9190328305235137, "grad_norm": 0.29710355401039124, "learning_rate": 1.2011338427409415e-05, "loss": 0.0409, "step": 8286 }, { "epoch": 0.9193655723158829, "grad_norm": 0.34799924492836, "learning_rate": 1.2007641114123738e-05, "loss": 0.0567, "step": 8289 }, { "epoch": 0.919698314108252, "grad_norm": 0.298291951417923, "learning_rate": 1.2003943800838059e-05, "loss": 0.0697, "step": 8292 }, { "epoch": 0.9200310559006211, "grad_norm": 0.9662755131721497, "learning_rate": 1.2000246487552381e-05, "loss": 0.0548, "step": 8295 }, { "epoch": 0.9203637976929903, "grad_norm": 0.3325819671154022, "learning_rate": 1.19965491742667e-05, "loss": 0.0492, "step": 8298 }, { "epoch": 0.9206965394853593, "grad_norm": 0.7265544533729553, "learning_rate": 1.1992851860981021e-05, "loss": 0.0579, "step": 8301 }, { "epoch": 0.9210292812777284, "grad_norm": 0.36567023396492004, "learning_rate": 1.1989154547695344e-05, "loss": 0.031, "step": 8304 }, { "epoch": 0.9213620230700976, "grad_norm": 0.23742149770259857, "learning_rate": 1.1985457234409663e-05, "loss": 0.0441, "step": 8307 }, { "epoch": 0.9216947648624667, "grad_norm": 2.064155340194702, "learning_rate": 1.1981759921123984e-05, "loss": 0.1065, "step": 8310 }, { "epoch": 0.9220275066548358, "grad_norm": 0.3713908791542053, "learning_rate": 1.1978062607838306e-05, "loss": 0.0363, "step": 8313 }, { "epoch": 0.922360248447205, "grad_norm": 0.24687989056110382, "learning_rate": 1.1974365294552625e-05, "loss": 0.0405, "step": 8316 }, { "epoch": 0.9226929902395741, "grad_norm": 0.7511981725692749, "learning_rate": 1.1970667981266948e-05, "loss": 0.0646, "step": 8319 }, { "epoch": 0.9230257320319432, "grad_norm": 0.3799895942211151, "learning_rate": 1.1966970667981269e-05, "loss": 0.0459, "step": 8322 }, { "epoch": 0.9233584738243124, "grad_norm": 0.2498670071363449, "learning_rate": 1.1963273354695588e-05, "loss": 0.034, "step": 8325 }, { "epoch": 0.9236912156166814, "grad_norm": 0.405750036239624, "learning_rate": 1.195957604140991e-05, "loss": 0.0555, "step": 8328 }, { "epoch": 0.9240239574090505, "grad_norm": 0.4704853594303131, "learning_rate": 1.1955878728124231e-05, "loss": 0.0484, "step": 8331 }, { "epoch": 0.9243566992014197, "grad_norm": 0.2812948524951935, "learning_rate": 1.195218141483855e-05, "loss": 0.0275, "step": 8334 }, { "epoch": 0.9246894409937888, "grad_norm": 0.7094414234161377, "learning_rate": 1.1948484101552873e-05, "loss": 0.0732, "step": 8337 }, { "epoch": 0.9250221827861579, "grad_norm": 0.8379213809967041, "learning_rate": 1.1944786788267193e-05, "loss": 0.0611, "step": 8340 }, { "epoch": 0.9253549245785271, "grad_norm": 0.40720072388648987, "learning_rate": 1.1941089474981516e-05, "loss": 0.0983, "step": 8343 }, { "epoch": 0.9256876663708962, "grad_norm": 0.40131300687789917, "learning_rate": 1.1937392161695835e-05, "loss": 0.0568, "step": 8346 }, { "epoch": 0.9260204081632653, "grad_norm": 0.5808132886886597, "learning_rate": 1.1933694848410156e-05, "loss": 0.0451, "step": 8349 }, { "epoch": 0.9263531499556344, "grad_norm": 0.21242745220661163, "learning_rate": 1.1929997535124478e-05, "loss": 0.0334, "step": 8352 }, { "epoch": 0.9266858917480035, "grad_norm": 0.2919635772705078, "learning_rate": 1.1926300221838798e-05, "loss": 0.048, "step": 8355 }, { "epoch": 0.9270186335403726, "grad_norm": 0.7061541676521301, "learning_rate": 1.1922602908553118e-05, "loss": 0.0834, "step": 8358 }, { "epoch": 0.9273513753327418, "grad_norm": 0.5661298036575317, "learning_rate": 1.1918905595267441e-05, "loss": 0.0427, "step": 8361 }, { "epoch": 0.9276841171251109, "grad_norm": 0.34316545724868774, "learning_rate": 1.191520828198176e-05, "loss": 0.0845, "step": 8364 }, { "epoch": 0.92801685891748, "grad_norm": 0.3423670530319214, "learning_rate": 1.1911510968696083e-05, "loss": 0.0486, "step": 8367 }, { "epoch": 0.9283496007098492, "grad_norm": 0.6217297315597534, "learning_rate": 1.1907813655410403e-05, "loss": 0.0641, "step": 8370 }, { "epoch": 0.9286823425022183, "grad_norm": 0.48643460869789124, "learning_rate": 1.1904116342124723e-05, "loss": 0.0527, "step": 8373 }, { "epoch": 0.9290150842945873, "grad_norm": 0.8321207165718079, "learning_rate": 1.1900419028839045e-05, "loss": 0.0561, "step": 8376 }, { "epoch": 0.9293478260869565, "grad_norm": 0.5688766837120056, "learning_rate": 1.1896721715553366e-05, "loss": 0.0479, "step": 8379 }, { "epoch": 0.9296805678793256, "grad_norm": 1.3831993341445923, "learning_rate": 1.1893024402267685e-05, "loss": 0.0858, "step": 8382 }, { "epoch": 0.9300133096716947, "grad_norm": 0.7290650010108948, "learning_rate": 1.1889327088982007e-05, "loss": 0.0318, "step": 8385 }, { "epoch": 0.9303460514640639, "grad_norm": 0.6109011769294739, "learning_rate": 1.1885629775696328e-05, "loss": 0.0787, "step": 8388 }, { "epoch": 0.930678793256433, "grad_norm": 0.2550736665725708, "learning_rate": 1.188193246241065e-05, "loss": 0.0397, "step": 8391 }, { "epoch": 0.9310115350488021, "grad_norm": 0.770127534866333, "learning_rate": 1.187823514912497e-05, "loss": 0.0343, "step": 8394 }, { "epoch": 0.9313442768411713, "grad_norm": 0.3864431381225586, "learning_rate": 1.187453783583929e-05, "loss": 0.0718, "step": 8397 }, { "epoch": 0.9316770186335404, "grad_norm": 0.7207202911376953, "learning_rate": 1.1870840522553613e-05, "loss": 0.0653, "step": 8400 }, { "epoch": 0.9320097604259094, "grad_norm": 0.3309830129146576, "learning_rate": 1.1867143209267932e-05, "loss": 0.058, "step": 8403 }, { "epoch": 0.9323425022182786, "grad_norm": 0.396769255399704, "learning_rate": 1.1863445895982253e-05, "loss": 0.0414, "step": 8406 }, { "epoch": 0.9326752440106477, "grad_norm": 0.561481773853302, "learning_rate": 1.1859748582696576e-05, "loss": 0.051, "step": 8409 }, { "epoch": 0.9330079858030168, "grad_norm": 0.41229552030563354, "learning_rate": 1.1856051269410895e-05, "loss": 0.0367, "step": 8412 }, { "epoch": 0.933340727595386, "grad_norm": 0.40834754705429077, "learning_rate": 1.1852353956125217e-05, "loss": 0.059, "step": 8415 }, { "epoch": 0.9336734693877551, "grad_norm": 0.43337926268577576, "learning_rate": 1.1848656642839538e-05, "loss": 0.0726, "step": 8418 }, { "epoch": 0.9340062111801242, "grad_norm": 0.27775007486343384, "learning_rate": 1.1844959329553857e-05, "loss": 0.0509, "step": 8421 }, { "epoch": 0.9343389529724934, "grad_norm": 0.885307252407074, "learning_rate": 1.184126201626818e-05, "loss": 0.0383, "step": 8424 }, { "epoch": 0.9346716947648624, "grad_norm": 0.5997012853622437, "learning_rate": 1.18375647029825e-05, "loss": 0.0606, "step": 8427 }, { "epoch": 0.9350044365572315, "grad_norm": 0.6481707096099854, "learning_rate": 1.183386738969682e-05, "loss": 0.0291, "step": 8430 }, { "epoch": 0.9353371783496007, "grad_norm": 0.3842674493789673, "learning_rate": 1.1830170076411142e-05, "loss": 0.0368, "step": 8433 }, { "epoch": 0.9356699201419698, "grad_norm": 0.45347484946250916, "learning_rate": 1.1826472763125463e-05, "loss": 0.0786, "step": 8436 }, { "epoch": 0.9360026619343389, "grad_norm": 0.8859057426452637, "learning_rate": 1.1822775449839786e-05, "loss": 0.0786, "step": 8439 }, { "epoch": 0.9363354037267081, "grad_norm": 0.23425766825675964, "learning_rate": 1.1819078136554105e-05, "loss": 0.05, "step": 8442 }, { "epoch": 0.9366681455190772, "grad_norm": 0.42121437191963196, "learning_rate": 1.1815380823268426e-05, "loss": 0.0347, "step": 8445 }, { "epoch": 0.9370008873114463, "grad_norm": 0.2014698088169098, "learning_rate": 1.1811683509982748e-05, "loss": 0.0247, "step": 8448 }, { "epoch": 0.9373336291038155, "grad_norm": 0.44908273220062256, "learning_rate": 1.1807986196697067e-05, "loss": 0.0867, "step": 8451 }, { "epoch": 0.9376663708961845, "grad_norm": 0.40260618925094604, "learning_rate": 1.1804288883411388e-05, "loss": 0.0569, "step": 8454 }, { "epoch": 0.9379991126885537, "grad_norm": 0.16816651821136475, "learning_rate": 1.180059157012571e-05, "loss": 0.0282, "step": 8457 }, { "epoch": 0.9383318544809228, "grad_norm": 0.4701831638813019, "learning_rate": 1.179689425684003e-05, "loss": 0.0444, "step": 8460 }, { "epoch": 0.9386645962732919, "grad_norm": 0.40188154578208923, "learning_rate": 1.1793196943554352e-05, "loss": 0.0402, "step": 8463 }, { "epoch": 0.9389973380656611, "grad_norm": 0.1905631422996521, "learning_rate": 1.1789499630268673e-05, "loss": 0.0441, "step": 8466 }, { "epoch": 0.9393300798580302, "grad_norm": 0.8586405515670776, "learning_rate": 1.1785802316982992e-05, "loss": 0.0646, "step": 8469 }, { "epoch": 0.9396628216503993, "grad_norm": 0.5134589672088623, "learning_rate": 1.1782105003697315e-05, "loss": 0.0261, "step": 8472 }, { "epoch": 0.9399955634427685, "grad_norm": 0.6675304174423218, "learning_rate": 1.1778407690411635e-05, "loss": 0.0359, "step": 8475 }, { "epoch": 0.9403283052351376, "grad_norm": 1.1138358116149902, "learning_rate": 1.1774710377125955e-05, "loss": 0.0515, "step": 8478 }, { "epoch": 0.9406610470275066, "grad_norm": 0.7612665891647339, "learning_rate": 1.1771013063840277e-05, "loss": 0.0456, "step": 8481 }, { "epoch": 0.9409937888198758, "grad_norm": 0.29424232244491577, "learning_rate": 1.1767315750554598e-05, "loss": 0.0213, "step": 8484 }, { "epoch": 0.9413265306122449, "grad_norm": 0.2540479600429535, "learning_rate": 1.176361843726892e-05, "loss": 0.034, "step": 8487 }, { "epoch": 0.941659272404614, "grad_norm": 0.3962341547012329, "learning_rate": 1.175992112398324e-05, "loss": 0.0629, "step": 8490 }, { "epoch": 0.9419920141969832, "grad_norm": 0.6541232466697693, "learning_rate": 1.175622381069756e-05, "loss": 0.0519, "step": 8493 }, { "epoch": 0.9423247559893523, "grad_norm": 0.4398234784603119, "learning_rate": 1.1752526497411883e-05, "loss": 0.0589, "step": 8496 }, { "epoch": 0.9426574977817214, "grad_norm": 0.6082584261894226, "learning_rate": 1.1748829184126202e-05, "loss": 0.0294, "step": 8499 }, { "epoch": 0.9429902395740906, "grad_norm": 1.6813539266586304, "learning_rate": 1.1745131870840523e-05, "loss": 0.0594, "step": 8502 }, { "epoch": 0.9433229813664596, "grad_norm": 0.4133315682411194, "learning_rate": 1.1741434557554845e-05, "loss": 0.0592, "step": 8505 }, { "epoch": 0.9436557231588287, "grad_norm": 0.6766602396965027, "learning_rate": 1.1737737244269164e-05, "loss": 0.061, "step": 8508 }, { "epoch": 0.9439884649511979, "grad_norm": 0.3546695411205292, "learning_rate": 1.1734039930983487e-05, "loss": 0.0529, "step": 8511 }, { "epoch": 0.944321206743567, "grad_norm": 0.38188883662223816, "learning_rate": 1.1730342617697808e-05, "loss": 0.0457, "step": 8514 }, { "epoch": 0.9446539485359361, "grad_norm": 0.5051119327545166, "learning_rate": 1.1726645304412127e-05, "loss": 0.0547, "step": 8517 }, { "epoch": 0.9449866903283053, "grad_norm": 0.3031490445137024, "learning_rate": 1.172294799112645e-05, "loss": 0.0364, "step": 8520 }, { "epoch": 0.9453194321206744, "grad_norm": 0.4432518482208252, "learning_rate": 1.171925067784077e-05, "loss": 0.0483, "step": 8523 }, { "epoch": 0.9456521739130435, "grad_norm": 0.22328118979930878, "learning_rate": 1.171555336455509e-05, "loss": 0.0311, "step": 8526 }, { "epoch": 0.9459849157054127, "grad_norm": 0.3273255527019501, "learning_rate": 1.1711856051269412e-05, "loss": 0.0323, "step": 8529 }, { "epoch": 0.9463176574977817, "grad_norm": 0.3653288781642914, "learning_rate": 1.1708158737983733e-05, "loss": 0.0558, "step": 8532 }, { "epoch": 0.9466503992901508, "grad_norm": 0.7788194417953491, "learning_rate": 1.1704461424698055e-05, "loss": 0.0828, "step": 8535 }, { "epoch": 0.94698314108252, "grad_norm": 0.46273890137672424, "learning_rate": 1.1700764111412374e-05, "loss": 0.0323, "step": 8538 }, { "epoch": 0.9473158828748891, "grad_norm": 0.22055192291736603, "learning_rate": 1.1697066798126695e-05, "loss": 0.0288, "step": 8541 }, { "epoch": 0.9476486246672582, "grad_norm": 0.9778677225112915, "learning_rate": 1.1693369484841018e-05, "loss": 0.0536, "step": 8544 }, { "epoch": 0.9479813664596274, "grad_norm": 0.6891587376594543, "learning_rate": 1.1689672171555337e-05, "loss": 0.0849, "step": 8547 }, { "epoch": 0.9483141082519965, "grad_norm": 0.8472664952278137, "learning_rate": 1.1685974858269658e-05, "loss": 0.0294, "step": 8550 }, { "epoch": 0.9486468500443656, "grad_norm": 0.2767336666584015, "learning_rate": 1.168227754498398e-05, "loss": 0.0483, "step": 8553 }, { "epoch": 0.9489795918367347, "grad_norm": 0.32361751794815063, "learning_rate": 1.16785802316983e-05, "loss": 0.0248, "step": 8556 }, { "epoch": 0.9493123336291038, "grad_norm": 0.6653692126274109, "learning_rate": 1.1674882918412622e-05, "loss": 0.0286, "step": 8559 }, { "epoch": 0.9496450754214729, "grad_norm": 1.2168161869049072, "learning_rate": 1.1671185605126943e-05, "loss": 0.089, "step": 8562 }, { "epoch": 0.9499778172138421, "grad_norm": 0.3892005980014801, "learning_rate": 1.1667488291841262e-05, "loss": 0.0341, "step": 8565 }, { "epoch": 0.9503105590062112, "grad_norm": 0.5309540629386902, "learning_rate": 1.1663790978555584e-05, "loss": 0.0293, "step": 8568 }, { "epoch": 0.9506433007985803, "grad_norm": 0.6501523852348328, "learning_rate": 1.1660093665269905e-05, "loss": 0.0251, "step": 8571 }, { "epoch": 0.9509760425909495, "grad_norm": 0.3896118998527527, "learning_rate": 1.1656396351984224e-05, "loss": 0.054, "step": 8574 }, { "epoch": 0.9513087843833186, "grad_norm": 1.113911747932434, "learning_rate": 1.1652699038698547e-05, "loss": 0.0872, "step": 8577 }, { "epoch": 0.9516415261756876, "grad_norm": 1.052460789680481, "learning_rate": 1.1649001725412868e-05, "loss": 0.0577, "step": 8580 }, { "epoch": 0.9519742679680568, "grad_norm": 0.29592350125312805, "learning_rate": 1.1645304412127187e-05, "loss": 0.0519, "step": 8583 }, { "epoch": 0.9523070097604259, "grad_norm": 0.48788049817085266, "learning_rate": 1.164160709884151e-05, "loss": 0.0725, "step": 8586 }, { "epoch": 0.952639751552795, "grad_norm": 0.44729548692703247, "learning_rate": 1.163790978555583e-05, "loss": 0.0672, "step": 8589 }, { "epoch": 0.9529724933451642, "grad_norm": 0.8626269102096558, "learning_rate": 1.1634212472270153e-05, "loss": 0.081, "step": 8592 }, { "epoch": 0.9533052351375333, "grad_norm": 0.3891582787036896, "learning_rate": 1.1630515158984472e-05, "loss": 0.0505, "step": 8595 }, { "epoch": 0.9536379769299024, "grad_norm": 0.278071790933609, "learning_rate": 1.1626817845698792e-05, "loss": 0.042, "step": 8598 }, { "epoch": 0.9539707187222716, "grad_norm": 0.704457700252533, "learning_rate": 1.1623120532413115e-05, "loss": 0.057, "step": 8601 }, { "epoch": 0.9543034605146407, "grad_norm": 0.33667972683906555, "learning_rate": 1.1619423219127434e-05, "loss": 0.0251, "step": 8604 }, { "epoch": 0.9546362023070097, "grad_norm": 0.455834299325943, "learning_rate": 1.1615725905841755e-05, "loss": 0.0593, "step": 8607 }, { "epoch": 0.9549689440993789, "grad_norm": 0.340788334608078, "learning_rate": 1.1612028592556077e-05, "loss": 0.0383, "step": 8610 }, { "epoch": 0.955301685891748, "grad_norm": 0.6947211027145386, "learning_rate": 1.1608331279270397e-05, "loss": 0.066, "step": 8613 }, { "epoch": 0.9556344276841171, "grad_norm": 0.37880590558052063, "learning_rate": 1.1604633965984719e-05, "loss": 0.0545, "step": 8616 }, { "epoch": 0.9559671694764863, "grad_norm": 1.0933235883712769, "learning_rate": 1.160093665269904e-05, "loss": 0.0528, "step": 8619 }, { "epoch": 0.9562999112688554, "grad_norm": 0.28889864683151245, "learning_rate": 1.1597239339413359e-05, "loss": 0.042, "step": 8622 }, { "epoch": 0.9566326530612245, "grad_norm": 0.5840552449226379, "learning_rate": 1.1593542026127682e-05, "loss": 0.0462, "step": 8625 }, { "epoch": 0.9569653948535937, "grad_norm": 0.4519164562225342, "learning_rate": 1.1589844712842002e-05, "loss": 0.0727, "step": 8628 }, { "epoch": 0.9572981366459627, "grad_norm": 0.28751468658447266, "learning_rate": 1.1586147399556322e-05, "loss": 0.0467, "step": 8631 }, { "epoch": 0.9576308784383318, "grad_norm": 0.5467666387557983, "learning_rate": 1.1582450086270644e-05, "loss": 0.0352, "step": 8634 }, { "epoch": 0.957963620230701, "grad_norm": 0.5028154850006104, "learning_rate": 1.1578752772984965e-05, "loss": 0.0404, "step": 8637 }, { "epoch": 0.9582963620230701, "grad_norm": 0.8858004212379456, "learning_rate": 1.1575055459699287e-05, "loss": 0.0958, "step": 8640 }, { "epoch": 0.9586291038154392, "grad_norm": 0.4841117262840271, "learning_rate": 1.1571358146413606e-05, "loss": 0.0547, "step": 8643 }, { "epoch": 0.9589618456078084, "grad_norm": 0.41097527742385864, "learning_rate": 1.1567660833127927e-05, "loss": 0.0413, "step": 8646 }, { "epoch": 0.9592945874001775, "grad_norm": 0.8401791453361511, "learning_rate": 1.156396351984225e-05, "loss": 0.0761, "step": 8649 }, { "epoch": 0.9596273291925466, "grad_norm": 0.6534838676452637, "learning_rate": 1.1560266206556569e-05, "loss": 0.0472, "step": 8652 }, { "epoch": 0.9599600709849158, "grad_norm": 0.27826571464538574, "learning_rate": 1.155656889327089e-05, "loss": 0.0402, "step": 8655 }, { "epoch": 0.9602928127772848, "grad_norm": 0.5100277066230774, "learning_rate": 1.1552871579985212e-05, "loss": 0.0508, "step": 8658 }, { "epoch": 0.9606255545696539, "grad_norm": 0.5087856650352478, "learning_rate": 1.1549174266699531e-05, "loss": 0.0521, "step": 8661 }, { "epoch": 0.9609582963620231, "grad_norm": 0.6074944138526917, "learning_rate": 1.1545476953413854e-05, "loss": 0.0321, "step": 8664 }, { "epoch": 0.9612910381543922, "grad_norm": 0.7232843637466431, "learning_rate": 1.1541779640128175e-05, "loss": 0.0668, "step": 8667 }, { "epoch": 0.9616237799467613, "grad_norm": 0.3092573285102844, "learning_rate": 1.1538082326842494e-05, "loss": 0.0301, "step": 8670 }, { "epoch": 0.9619565217391305, "grad_norm": 0.22514396905899048, "learning_rate": 1.1534385013556816e-05, "loss": 0.0397, "step": 8673 }, { "epoch": 0.9622892635314996, "grad_norm": 0.6499538421630859, "learning_rate": 1.1530687700271137e-05, "loss": 0.0424, "step": 8676 }, { "epoch": 0.9626220053238687, "grad_norm": 0.3427381217479706, "learning_rate": 1.1526990386985456e-05, "loss": 0.0448, "step": 8679 }, { "epoch": 0.9629547471162379, "grad_norm": 1.3607947826385498, "learning_rate": 1.1523293073699779e-05, "loss": 0.0683, "step": 8682 }, { "epoch": 0.9632874889086069, "grad_norm": 0.7620245814323425, "learning_rate": 1.15195957604141e-05, "loss": 0.0571, "step": 8685 }, { "epoch": 0.963620230700976, "grad_norm": 0.6273897290229797, "learning_rate": 1.1515898447128422e-05, "loss": 0.0537, "step": 8688 }, { "epoch": 0.9639529724933452, "grad_norm": 0.8978163003921509, "learning_rate": 1.1512201133842741e-05, "loss": 0.0899, "step": 8691 }, { "epoch": 0.9642857142857143, "grad_norm": 0.5633931159973145, "learning_rate": 1.1508503820557062e-05, "loss": 0.0356, "step": 8694 }, { "epoch": 0.9646184560780834, "grad_norm": 0.5698189735412598, "learning_rate": 1.1504806507271385e-05, "loss": 0.0585, "step": 8697 }, { "epoch": 0.9649511978704526, "grad_norm": 1.4076073169708252, "learning_rate": 1.1501109193985704e-05, "loss": 0.047, "step": 8700 }, { "epoch": 0.9652839396628217, "grad_norm": 0.2624446749687195, "learning_rate": 1.1497411880700025e-05, "loss": 0.0715, "step": 8703 }, { "epoch": 0.9656166814551908, "grad_norm": 0.9687386751174927, "learning_rate": 1.1493714567414347e-05, "loss": 0.0464, "step": 8706 }, { "epoch": 0.96594942324756, "grad_norm": 0.5109801292419434, "learning_rate": 1.1490017254128666e-05, "loss": 0.0566, "step": 8709 }, { "epoch": 0.966282165039929, "grad_norm": 0.5253724455833435, "learning_rate": 1.1486319940842989e-05, "loss": 0.036, "step": 8712 }, { "epoch": 0.9666149068322981, "grad_norm": 0.36865219473838806, "learning_rate": 1.148262262755731e-05, "loss": 0.0426, "step": 8715 }, { "epoch": 0.9669476486246673, "grad_norm": 0.5368445515632629, "learning_rate": 1.1478925314271629e-05, "loss": 0.0587, "step": 8718 }, { "epoch": 0.9672803904170364, "grad_norm": 0.6473378539085388, "learning_rate": 1.1475228000985951e-05, "loss": 0.0587, "step": 8721 }, { "epoch": 0.9676131322094055, "grad_norm": 0.5862790942192078, "learning_rate": 1.1471530687700272e-05, "loss": 0.0821, "step": 8724 }, { "epoch": 0.9679458740017747, "grad_norm": 0.72596275806427, "learning_rate": 1.1467833374414591e-05, "loss": 0.0385, "step": 8727 }, { "epoch": 0.9682786157941438, "grad_norm": 0.41266077756881714, "learning_rate": 1.1464136061128914e-05, "loss": 0.0332, "step": 8730 }, { "epoch": 0.9686113575865128, "grad_norm": 0.1854693591594696, "learning_rate": 1.1460438747843234e-05, "loss": 0.028, "step": 8733 }, { "epoch": 0.968944099378882, "grad_norm": 0.46978873014450073, "learning_rate": 1.1456741434557557e-05, "loss": 0.0318, "step": 8736 }, { "epoch": 0.9692768411712511, "grad_norm": 0.5211992859840393, "learning_rate": 1.1453044121271876e-05, "loss": 0.0832, "step": 8739 }, { "epoch": 0.9696095829636202, "grad_norm": 0.9025626182556152, "learning_rate": 1.1449346807986197e-05, "loss": 0.0792, "step": 8742 }, { "epoch": 0.9699423247559894, "grad_norm": 0.5201238989830017, "learning_rate": 1.144564949470052e-05, "loss": 0.0451, "step": 8745 }, { "epoch": 0.9702750665483585, "grad_norm": 0.5261621475219727, "learning_rate": 1.1441952181414839e-05, "loss": 0.0425, "step": 8748 }, { "epoch": 0.9706078083407276, "grad_norm": 0.2518812119960785, "learning_rate": 1.143825486812916e-05, "loss": 0.0375, "step": 8751 }, { "epoch": 0.9709405501330968, "grad_norm": 0.293048620223999, "learning_rate": 1.1434557554843482e-05, "loss": 0.057, "step": 8754 }, { "epoch": 0.9712732919254659, "grad_norm": 0.8447964191436768, "learning_rate": 1.1430860241557801e-05, "loss": 0.054, "step": 8757 }, { "epoch": 0.9716060337178349, "grad_norm": 0.457008957862854, "learning_rate": 1.1427162928272124e-05, "loss": 0.0583, "step": 8760 }, { "epoch": 0.9719387755102041, "grad_norm": 0.4531729519367218, "learning_rate": 1.1423465614986444e-05, "loss": 0.0355, "step": 8763 }, { "epoch": 0.9722715173025732, "grad_norm": 1.1689764261245728, "learning_rate": 1.1419768301700764e-05, "loss": 0.0663, "step": 8766 }, { "epoch": 0.9726042590949423, "grad_norm": 1.2596089839935303, "learning_rate": 1.1416070988415086e-05, "loss": 0.086, "step": 8769 }, { "epoch": 0.9729370008873115, "grad_norm": 0.31695568561553955, "learning_rate": 1.1412373675129407e-05, "loss": 0.036, "step": 8772 }, { "epoch": 0.9732697426796806, "grad_norm": 0.4507710933685303, "learning_rate": 1.1408676361843728e-05, "loss": 0.0416, "step": 8775 }, { "epoch": 0.9736024844720497, "grad_norm": 0.20956899225711823, "learning_rate": 1.1404979048558048e-05, "loss": 0.0417, "step": 8778 }, { "epoch": 0.9739352262644189, "grad_norm": 0.34692540764808655, "learning_rate": 1.140128173527237e-05, "loss": 0.0494, "step": 8781 }, { "epoch": 0.974267968056788, "grad_norm": 0.4539054334163666, "learning_rate": 1.1397584421986692e-05, "loss": 0.0313, "step": 8784 }, { "epoch": 0.974600709849157, "grad_norm": 0.631075382232666, "learning_rate": 1.1393887108701011e-05, "loss": 0.0443, "step": 8787 }, { "epoch": 0.9749334516415262, "grad_norm": 0.39394623041152954, "learning_rate": 1.1390189795415332e-05, "loss": 0.0612, "step": 8790 }, { "epoch": 0.9752661934338953, "grad_norm": 1.1218891143798828, "learning_rate": 1.1386492482129654e-05, "loss": 0.0596, "step": 8793 }, { "epoch": 0.9755989352262644, "grad_norm": 0.9880281090736389, "learning_rate": 1.1382795168843973e-05, "loss": 0.0575, "step": 8796 }, { "epoch": 0.9759316770186336, "grad_norm": 0.4506834149360657, "learning_rate": 1.1379097855558294e-05, "loss": 0.038, "step": 8799 }, { "epoch": 0.9762644188110027, "grad_norm": 0.9398662447929382, "learning_rate": 1.1375400542272617e-05, "loss": 0.0751, "step": 8802 }, { "epoch": 0.9765971606033718, "grad_norm": 0.48773857951164246, "learning_rate": 1.1371703228986936e-05, "loss": 0.056, "step": 8805 }, { "epoch": 0.976929902395741, "grad_norm": 0.5353679656982422, "learning_rate": 1.1368005915701258e-05, "loss": 0.0482, "step": 8808 }, { "epoch": 0.97726264418811, "grad_norm": 0.24682524800300598, "learning_rate": 1.136430860241558e-05, "loss": 0.0313, "step": 8811 }, { "epoch": 0.9775953859804791, "grad_norm": 0.3232914209365845, "learning_rate": 1.13606112891299e-05, "loss": 0.0331, "step": 8814 }, { "epoch": 0.9779281277728483, "grad_norm": 0.7888772487640381, "learning_rate": 1.135691397584422e-05, "loss": 0.0784, "step": 8817 }, { "epoch": 0.9782608695652174, "grad_norm": 0.4279240369796753, "learning_rate": 1.1353216662558542e-05, "loss": 0.0596, "step": 8820 }, { "epoch": 0.9785936113575865, "grad_norm": 0.3222000300884247, "learning_rate": 1.1349519349272862e-05, "loss": 0.0291, "step": 8823 }, { "epoch": 0.9789263531499557, "grad_norm": 0.23047882318496704, "learning_rate": 1.1345822035987183e-05, "loss": 0.0402, "step": 8826 }, { "epoch": 0.9792590949423248, "grad_norm": 0.43177446722984314, "learning_rate": 1.1342124722701504e-05, "loss": 0.062, "step": 8829 }, { "epoch": 0.9795918367346939, "grad_norm": 0.6597174406051636, "learning_rate": 1.1338427409415827e-05, "loss": 0.0562, "step": 8832 }, { "epoch": 0.979924578527063, "grad_norm": 0.6946485638618469, "learning_rate": 1.1334730096130146e-05, "loss": 0.0413, "step": 8835 }, { "epoch": 0.9802573203194321, "grad_norm": 0.46914249658584595, "learning_rate": 1.1331032782844467e-05, "loss": 0.0246, "step": 8838 }, { "epoch": 0.9805900621118012, "grad_norm": 0.29876211285591125, "learning_rate": 1.1327335469558789e-05, "loss": 0.0524, "step": 8841 }, { "epoch": 0.9809228039041704, "grad_norm": 1.1040805578231812, "learning_rate": 1.1323638156273108e-05, "loss": 0.0866, "step": 8844 }, { "epoch": 0.9812555456965395, "grad_norm": 0.4227003753185272, "learning_rate": 1.1319940842987429e-05, "loss": 0.0334, "step": 8847 }, { "epoch": 0.9815882874889086, "grad_norm": 0.7846102118492126, "learning_rate": 1.1316243529701752e-05, "loss": 0.0455, "step": 8850 }, { "epoch": 0.9819210292812778, "grad_norm": 0.2038019895553589, "learning_rate": 1.1312546216416072e-05, "loss": 0.0617, "step": 8853 }, { "epoch": 0.9822537710736469, "grad_norm": 0.46742337942123413, "learning_rate": 1.1308848903130393e-05, "loss": 0.035, "step": 8856 }, { "epoch": 0.982586512866016, "grad_norm": 0.4364475905895233, "learning_rate": 1.1305151589844714e-05, "loss": 0.0433, "step": 8859 }, { "epoch": 0.9829192546583851, "grad_norm": 0.6890358328819275, "learning_rate": 1.1301454276559035e-05, "loss": 0.0497, "step": 8862 }, { "epoch": 0.9832519964507542, "grad_norm": 1.4316093921661377, "learning_rate": 1.1297756963273356e-05, "loss": 0.087, "step": 8865 }, { "epoch": 0.9835847382431233, "grad_norm": 0.49540403485298157, "learning_rate": 1.1294059649987676e-05, "loss": 0.0887, "step": 8868 }, { "epoch": 0.9839174800354925, "grad_norm": 0.6155421137809753, "learning_rate": 1.1290362336701997e-05, "loss": 0.0581, "step": 8871 }, { "epoch": 0.9842502218278616, "grad_norm": 0.6967869997024536, "learning_rate": 1.1286665023416318e-05, "loss": 0.0368, "step": 8874 }, { "epoch": 0.9845829636202307, "grad_norm": 0.4944927394390106, "learning_rate": 1.1282967710130639e-05, "loss": 0.0383, "step": 8877 }, { "epoch": 0.9849157054125999, "grad_norm": 0.43020689487457275, "learning_rate": 1.1279270396844961e-05, "loss": 0.0561, "step": 8880 }, { "epoch": 0.985248447204969, "grad_norm": 0.8313381671905518, "learning_rate": 1.127557308355928e-05, "loss": 0.074, "step": 8883 }, { "epoch": 0.985581188997338, "grad_norm": 1.4125452041625977, "learning_rate": 1.1271875770273601e-05, "loss": 0.0428, "step": 8886 }, { "epoch": 0.9859139307897072, "grad_norm": 1.196557879447937, "learning_rate": 1.1268178456987924e-05, "loss": 0.0575, "step": 8889 }, { "epoch": 0.9862466725820763, "grad_norm": 0.224771648645401, "learning_rate": 1.1264481143702245e-05, "loss": 0.0321, "step": 8892 }, { "epoch": 0.9865794143744454, "grad_norm": 0.2653876841068268, "learning_rate": 1.1260783830416564e-05, "loss": 0.0401, "step": 8895 }, { "epoch": 0.9869121561668146, "grad_norm": 0.709250807762146, "learning_rate": 1.1257086517130886e-05, "loss": 0.0385, "step": 8898 }, { "epoch": 0.9872448979591837, "grad_norm": 0.6414490938186646, "learning_rate": 1.1253389203845207e-05, "loss": 0.0851, "step": 8901 }, { "epoch": 0.9875776397515528, "grad_norm": 0.9203906059265137, "learning_rate": 1.1249691890559528e-05, "loss": 0.0771, "step": 8904 }, { "epoch": 0.987910381543922, "grad_norm": 0.3993024528026581, "learning_rate": 1.1245994577273849e-05, "loss": 0.0446, "step": 8907 }, { "epoch": 0.988243123336291, "grad_norm": 0.3157324194908142, "learning_rate": 1.124229726398817e-05, "loss": 0.029, "step": 8910 }, { "epoch": 0.9885758651286601, "grad_norm": 0.2679550051689148, "learning_rate": 1.123859995070249e-05, "loss": 0.0555, "step": 8913 }, { "epoch": 0.9889086069210293, "grad_norm": 0.5210423469543457, "learning_rate": 1.1234902637416811e-05, "loss": 0.058, "step": 8916 }, { "epoch": 0.9892413487133984, "grad_norm": 1.0145424604415894, "learning_rate": 1.1231205324131132e-05, "loss": 0.0451, "step": 8919 }, { "epoch": 0.9895740905057675, "grad_norm": 0.3445710241794586, "learning_rate": 1.1227508010845453e-05, "loss": 0.074, "step": 8922 }, { "epoch": 0.9899068322981367, "grad_norm": 0.6316698789596558, "learning_rate": 1.1223810697559774e-05, "loss": 0.0612, "step": 8925 }, { "epoch": 0.9902395740905058, "grad_norm": 0.30916422605514526, "learning_rate": 1.1220113384274096e-05, "loss": 0.0381, "step": 8928 }, { "epoch": 0.9905723158828749, "grad_norm": 0.8022031784057617, "learning_rate": 1.1216416070988415e-05, "loss": 0.0412, "step": 8931 }, { "epoch": 0.9909050576752441, "grad_norm": 0.5893549919128418, "learning_rate": 1.1212718757702736e-05, "loss": 0.0595, "step": 8934 }, { "epoch": 0.9912377994676131, "grad_norm": 0.46117696166038513, "learning_rate": 1.1209021444417059e-05, "loss": 0.041, "step": 8937 }, { "epoch": 0.9915705412599822, "grad_norm": 0.555192232131958, "learning_rate": 1.120532413113138e-05, "loss": 0.0611, "step": 8940 }, { "epoch": 0.9919032830523514, "grad_norm": 0.38025546073913574, "learning_rate": 1.1201626817845699e-05, "loss": 0.0321, "step": 8943 }, { "epoch": 0.9922360248447205, "grad_norm": 0.6083112955093384, "learning_rate": 1.1197929504560021e-05, "loss": 0.0807, "step": 8946 }, { "epoch": 0.9925687666370896, "grad_norm": 0.7653153538703918, "learning_rate": 1.1194232191274342e-05, "loss": 0.0808, "step": 8949 }, { "epoch": 0.9929015084294588, "grad_norm": 0.40556052327156067, "learning_rate": 1.1190534877988663e-05, "loss": 0.0496, "step": 8952 }, { "epoch": 0.9932342502218279, "grad_norm": 0.5564432144165039, "learning_rate": 1.1186837564702984e-05, "loss": 0.0663, "step": 8955 }, { "epoch": 0.993566992014197, "grad_norm": 0.7326928377151489, "learning_rate": 1.1183140251417304e-05, "loss": 0.0415, "step": 8958 }, { "epoch": 0.9938997338065662, "grad_norm": 0.44167864322662354, "learning_rate": 1.1179442938131625e-05, "loss": 0.0561, "step": 8961 }, { "epoch": 0.9942324755989352, "grad_norm": 1.138250470161438, "learning_rate": 1.1175745624845946e-05, "loss": 0.0468, "step": 8964 }, { "epoch": 0.9945652173913043, "grad_norm": 0.4175792634487152, "learning_rate": 1.1172048311560267e-05, "loss": 0.0745, "step": 8967 }, { "epoch": 0.9948979591836735, "grad_norm": 0.6208221912384033, "learning_rate": 1.1168350998274588e-05, "loss": 0.0725, "step": 8970 }, { "epoch": 0.9952307009760426, "grad_norm": 0.33828938007354736, "learning_rate": 1.1164653684988909e-05, "loss": 0.0408, "step": 8973 }, { "epoch": 0.9955634427684117, "grad_norm": 0.6464995741844177, "learning_rate": 1.1160956371703231e-05, "loss": 0.0285, "step": 8976 }, { "epoch": 0.9958961845607809, "grad_norm": 0.3695983290672302, "learning_rate": 1.1157259058417552e-05, "loss": 0.0274, "step": 8979 }, { "epoch": 0.99622892635315, "grad_norm": 0.8291159272193909, "learning_rate": 1.1153561745131871e-05, "loss": 0.0351, "step": 8982 }, { "epoch": 0.996561668145519, "grad_norm": 0.8132166862487793, "learning_rate": 1.1149864431846194e-05, "loss": 0.0571, "step": 8985 }, { "epoch": 0.9968944099378882, "grad_norm": 0.709579348564148, "learning_rate": 1.1146167118560514e-05, "loss": 0.0428, "step": 8988 }, { "epoch": 0.9972271517302573, "grad_norm": 0.30570489168167114, "learning_rate": 1.1142469805274833e-05, "loss": 0.0327, "step": 8991 }, { "epoch": 0.9975598935226264, "grad_norm": 0.5769500136375427, "learning_rate": 1.1138772491989156e-05, "loss": 0.0312, "step": 8994 }, { "epoch": 0.9978926353149956, "grad_norm": 0.44979727268218994, "learning_rate": 1.1135075178703477e-05, "loss": 0.0243, "step": 8997 }, { "epoch": 0.9982253771073647, "grad_norm": 1.062454104423523, "learning_rate": 1.1131377865417798e-05, "loss": 0.0691, "step": 9000 }, { "epoch": 0.9985581188997338, "grad_norm": 0.4242594540119171, "learning_rate": 1.1127680552132118e-05, "loss": 0.0391, "step": 9003 }, { "epoch": 0.998890860692103, "grad_norm": 0.3390410244464874, "learning_rate": 1.112398323884644e-05, "loss": 0.0443, "step": 9006 }, { "epoch": 0.9992236024844721, "grad_norm": 0.43890079855918884, "learning_rate": 1.112028592556076e-05, "loss": 0.0317, "step": 9009 }, { "epoch": 0.9995563442768411, "grad_norm": 0.3989914357662201, "learning_rate": 1.1116588612275081e-05, "loss": 0.0348, "step": 9012 }, { "epoch": 0.9998890860692103, "grad_norm": 1.1243058443069458, "learning_rate": 1.1112891298989402e-05, "loss": 0.0659, "step": 9015 }, { "epoch": 1.0, "eval_accuracy": 0.9727642276422764, "eval_auc": 0.9906997543262077, "eval_f1": 0.9142125480153649, "eval_loss": 0.07552551478147507, "eval_precision": 0.9272727272727272, "eval_recall": 0.9015151515151515, "eval_runtime": 7.5082, "eval_samples_per_second": 327.642, "eval_steps_per_second": 1.332, "step": 9016 } ], "logging_steps": 3, "max_steps": 18032, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.258372109663753e+19, "train_batch_size": 128, "trial_name": null, "trial_params": null }