{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1024, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009765625, "grad_norm": 0.6541444063186646, "learning_rate": 0.0, "loss": 1.0280990600585938, "step": 1 }, { "epoch": 0.001953125, "grad_norm": 0.4356674551963806, "learning_rate": 4e-05, "loss": 0.8305179476737976, "step": 2 }, { "epoch": 0.0029296875, "grad_norm": 0.3900858759880066, "learning_rate": 8e-05, "loss": 0.7835474014282227, "step": 3 }, { "epoch": 0.00390625, "grad_norm": 0.3717947006225586, "learning_rate": 0.00012, "loss": 1.1571688652038574, "step": 4 }, { "epoch": 0.0048828125, "grad_norm": 0.2760661542415619, "learning_rate": 0.00016, "loss": 0.8141135573387146, "step": 5 }, { "epoch": 0.005859375, "grad_norm": 0.24524882435798645, "learning_rate": 0.0002, "loss": 0.29919666051864624, "step": 6 }, { "epoch": 0.0068359375, "grad_norm": 0.3155483305454254, "learning_rate": 0.00019980372914622178, "loss": 0.916366696357727, "step": 7 }, { "epoch": 0.0078125, "grad_norm": 1.0419310331344604, "learning_rate": 0.00019960745829244357, "loss": 0.986505389213562, "step": 8 }, { "epoch": 0.0087890625, "grad_norm": 0.32395845651626587, "learning_rate": 0.00019941118743866537, "loss": 0.7845190167427063, "step": 9 }, { "epoch": 0.009765625, "grad_norm": 0.564084529876709, "learning_rate": 0.00019921491658488717, "loss": 1.0922366380691528, "step": 10 }, { "epoch": 0.0107421875, "grad_norm": 0.4066593647003174, "learning_rate": 0.00019901864573110893, "loss": 1.0279463529586792, "step": 11 }, { "epoch": 0.01171875, "grad_norm": 0.43442535400390625, "learning_rate": 0.00019882237487733073, "loss": 0.9713175892829895, "step": 12 }, { "epoch": 0.0126953125, "grad_norm": 0.26689526438713074, "learning_rate": 0.0001986261040235525, "loss": 0.38461241126060486, "step": 13 }, { "epoch": 0.013671875, "grad_norm": 0.41254541277885437, "learning_rate": 0.0001984298331697743, "loss": 0.7746479511260986, "step": 14 }, { "epoch": 0.0146484375, "grad_norm": 0.39432424306869507, "learning_rate": 0.0001982335623159961, "loss": 0.7843194603919983, "step": 15 }, { "epoch": 0.015625, "grad_norm": 0.4303337037563324, "learning_rate": 0.0001980372914622179, "loss": 0.6613403558731079, "step": 16 }, { "epoch": 0.0166015625, "grad_norm": 0.875269889831543, "learning_rate": 0.00019784102060843966, "loss": 1.0992671251296997, "step": 17 }, { "epoch": 0.017578125, "grad_norm": 0.21415413916110992, "learning_rate": 0.00019764474975466145, "loss": 0.2784216105937958, "step": 18 }, { "epoch": 0.0185546875, "grad_norm": 0.4318086504936218, "learning_rate": 0.00019744847890088322, "loss": 0.6146124005317688, "step": 19 }, { "epoch": 0.01953125, "grad_norm": 0.20149515569210052, "learning_rate": 0.00019725220804710502, "loss": 0.3920556306838989, "step": 20 }, { "epoch": 0.0205078125, "grad_norm": 0.358688622713089, "learning_rate": 0.0001970559371933268, "loss": 0.6672685742378235, "step": 21 }, { "epoch": 0.021484375, "grad_norm": 0.5916730165481567, "learning_rate": 0.00019685966633954858, "loss": 1.0804443359375, "step": 22 }, { "epoch": 0.0224609375, "grad_norm": 0.3139825761318207, "learning_rate": 0.00019666339548577038, "loss": 0.7358766794204712, "step": 23 }, { "epoch": 0.0234375, "grad_norm": 0.4019712805747986, "learning_rate": 0.00019646712463199215, "loss": 0.7362902164459229, "step": 24 }, { "epoch": 0.0244140625, "grad_norm": 0.2874290347099304, "learning_rate": 0.00019627085377821394, "loss": 0.6446189284324646, "step": 25 }, { "epoch": 0.025390625, "grad_norm": 0.357494592666626, "learning_rate": 0.0001960745829244357, "loss": 0.2820976972579956, "step": 26 }, { "epoch": 0.0263671875, "grad_norm": 0.22216391563415527, "learning_rate": 0.00019587831207065753, "loss": 0.6020435094833374, "step": 27 }, { "epoch": 0.02734375, "grad_norm": 0.23284995555877686, "learning_rate": 0.0001956820412168793, "loss": 0.44151532649993896, "step": 28 }, { "epoch": 0.0283203125, "grad_norm": 0.3594605028629303, "learning_rate": 0.0001954857703631011, "loss": 0.9414041042327881, "step": 29 }, { "epoch": 0.029296875, "grad_norm": 0.4460504353046417, "learning_rate": 0.00019528949950932287, "loss": 0.7148531079292297, "step": 30 }, { "epoch": 0.0302734375, "grad_norm": 0.3392362892627716, "learning_rate": 0.00019509322865554466, "loss": 0.7185512781143188, "step": 31 }, { "epoch": 0.03125, "grad_norm": 0.3340625464916229, "learning_rate": 0.00019489695780176643, "loss": 0.6613262891769409, "step": 32 }, { "epoch": 0.0322265625, "grad_norm": 0.26223355531692505, "learning_rate": 0.00019470068694798826, "loss": 0.590149462223053, "step": 33 }, { "epoch": 0.033203125, "grad_norm": 0.3481689691543579, "learning_rate": 0.00019450441609421002, "loss": 0.5590913891792297, "step": 34 }, { "epoch": 0.0341796875, "grad_norm": 0.4775488078594208, "learning_rate": 0.00019430814524043182, "loss": 0.927351176738739, "step": 35 }, { "epoch": 0.03515625, "grad_norm": 0.4474835693836212, "learning_rate": 0.0001941118743866536, "loss": 0.7719380855560303, "step": 36 }, { "epoch": 0.0361328125, "grad_norm": 0.3538999855518341, "learning_rate": 0.00019391560353287536, "loss": 1.0287561416625977, "step": 37 }, { "epoch": 0.037109375, "grad_norm": 0.5018237233161926, "learning_rate": 0.00019371933267909715, "loss": 1.049814224243164, "step": 38 }, { "epoch": 0.0380859375, "grad_norm": 0.5052743554115295, "learning_rate": 0.00019352306182531895, "loss": 0.39767658710479736, "step": 39 }, { "epoch": 0.0390625, "grad_norm": 0.46170520782470703, "learning_rate": 0.00019332679097154075, "loss": 0.9849376678466797, "step": 40 }, { "epoch": 0.0400390625, "grad_norm": 0.5961291193962097, "learning_rate": 0.00019313052011776251, "loss": 0.8527336716651917, "step": 41 }, { "epoch": 0.041015625, "grad_norm": 0.4002876579761505, "learning_rate": 0.0001929342492639843, "loss": 0.7445047497749329, "step": 42 }, { "epoch": 0.0419921875, "grad_norm": 0.6382992267608643, "learning_rate": 0.00019273797841020608, "loss": 0.7587878704071045, "step": 43 }, { "epoch": 0.04296875, "grad_norm": 0.4204530715942383, "learning_rate": 0.00019254170755642788, "loss": 0.943995475769043, "step": 44 }, { "epoch": 0.0439453125, "grad_norm": 0.29038068652153015, "learning_rate": 0.00019234543670264967, "loss": 0.4540131688117981, "step": 45 }, { "epoch": 0.044921875, "grad_norm": 0.41968628764152527, "learning_rate": 0.00019214916584887147, "loss": 0.3900204300880432, "step": 46 }, { "epoch": 0.0458984375, "grad_norm": 0.5870251059532166, "learning_rate": 0.00019195289499509324, "loss": 0.8700598478317261, "step": 47 }, { "epoch": 0.046875, "grad_norm": 0.3120124042034149, "learning_rate": 0.00019175662414131503, "loss": 0.2866731882095337, "step": 48 }, { "epoch": 0.0478515625, "grad_norm": 0.31891942024230957, "learning_rate": 0.0001915603532875368, "loss": 0.7711223363876343, "step": 49 }, { "epoch": 0.048828125, "grad_norm": 0.4250207543373108, "learning_rate": 0.0001913640824337586, "loss": 0.7499758005142212, "step": 50 }, { "epoch": 0.0498046875, "grad_norm": 0.4769924581050873, "learning_rate": 0.0001911678115799804, "loss": 0.8479812145233154, "step": 51 }, { "epoch": 0.05078125, "grad_norm": 0.2966979146003723, "learning_rate": 0.00019097154072620216, "loss": 0.8125182390213013, "step": 52 }, { "epoch": 0.0517578125, "grad_norm": 0.4924452006816864, "learning_rate": 0.00019077526987242396, "loss": 1.006331443786621, "step": 53 }, { "epoch": 0.052734375, "grad_norm": 0.5558736324310303, "learning_rate": 0.00019057899901864573, "loss": 0.8218062520027161, "step": 54 }, { "epoch": 0.0537109375, "grad_norm": 0.488903284072876, "learning_rate": 0.00019038272816486752, "loss": 0.7451006770133972, "step": 55 }, { "epoch": 0.0546875, "grad_norm": 0.6092124581336975, "learning_rate": 0.00019018645731108932, "loss": 0.3371097445487976, "step": 56 }, { "epoch": 0.0556640625, "grad_norm": 0.34885621070861816, "learning_rate": 0.00018999018645731111, "loss": 0.9263520836830139, "step": 57 }, { "epoch": 0.056640625, "grad_norm": 0.41470521688461304, "learning_rate": 0.00018979391560353288, "loss": 0.8741390109062195, "step": 58 }, { "epoch": 0.0576171875, "grad_norm": 0.32286664843559265, "learning_rate": 0.00018959764474975468, "loss": 0.6128658056259155, "step": 59 }, { "epoch": 0.05859375, "grad_norm": 0.43667954206466675, "learning_rate": 0.00018940137389597645, "loss": 0.822106122970581, "step": 60 }, { "epoch": 0.0595703125, "grad_norm": 0.5501149892807007, "learning_rate": 0.00018920510304219824, "loss": 0.2981743812561035, "step": 61 }, { "epoch": 0.060546875, "grad_norm": 0.5234649777412415, "learning_rate": 0.00018900883218842004, "loss": 0.710310161113739, "step": 62 }, { "epoch": 0.0615234375, "grad_norm": 0.5040559768676758, "learning_rate": 0.00018881256133464184, "loss": 1.0355676412582397, "step": 63 }, { "epoch": 0.0625, "grad_norm": 0.4435643255710602, "learning_rate": 0.0001886162904808636, "loss": 1.031105399131775, "step": 64 }, { "epoch": 0.0634765625, "grad_norm": 0.4987465441226959, "learning_rate": 0.0001884200196270854, "loss": 0.7753915190696716, "step": 65 }, { "epoch": 0.064453125, "grad_norm": 0.3633696436882019, "learning_rate": 0.00018822374877330717, "loss": 1.2376799583435059, "step": 66 }, { "epoch": 0.0654296875, "grad_norm": 1.0342258214950562, "learning_rate": 0.00018802747791952894, "loss": 0.6145737171173096, "step": 67 }, { "epoch": 0.06640625, "grad_norm": 0.47045138478279114, "learning_rate": 0.00018783120706575076, "loss": 0.8622407913208008, "step": 68 }, { "epoch": 0.0673828125, "grad_norm": 0.47864851355552673, "learning_rate": 0.00018763493621197253, "loss": 0.6727300882339478, "step": 69 }, { "epoch": 0.068359375, "grad_norm": 0.38102060556411743, "learning_rate": 0.00018743866535819433, "loss": 0.7417519092559814, "step": 70 }, { "epoch": 0.0693359375, "grad_norm": 0.4229515492916107, "learning_rate": 0.0001872423945044161, "loss": 0.46951866149902344, "step": 71 }, { "epoch": 0.0703125, "grad_norm": 0.4868115186691284, "learning_rate": 0.0001870461236506379, "loss": 0.32457292079925537, "step": 72 }, { "epoch": 0.0712890625, "grad_norm": 0.298020601272583, "learning_rate": 0.00018684985279685966, "loss": 0.2501494288444519, "step": 73 }, { "epoch": 0.072265625, "grad_norm": 0.49870651960372925, "learning_rate": 0.00018665358194308145, "loss": 0.5599403381347656, "step": 74 }, { "epoch": 0.0732421875, "grad_norm": 0.5717479586601257, "learning_rate": 0.00018645731108930325, "loss": 0.4725653827190399, "step": 75 }, { "epoch": 0.07421875, "grad_norm": 0.5230128765106201, "learning_rate": 0.00018626104023552505, "loss": 1.0607699155807495, "step": 76 }, { "epoch": 0.0751953125, "grad_norm": 0.4279435873031616, "learning_rate": 0.00018606476938174682, "loss": 0.5628142952919006, "step": 77 }, { "epoch": 0.076171875, "grad_norm": 0.6166331171989441, "learning_rate": 0.0001858684985279686, "loss": 0.44837141036987305, "step": 78 }, { "epoch": 0.0771484375, "grad_norm": 0.6329861879348755, "learning_rate": 0.00018567222767419038, "loss": 0.5013883709907532, "step": 79 }, { "epoch": 0.078125, "grad_norm": 0.2921103239059448, "learning_rate": 0.00018547595682041218, "loss": 0.541824996471405, "step": 80 }, { "epoch": 0.0791015625, "grad_norm": 0.36744800209999084, "learning_rate": 0.00018527968596663397, "loss": 0.3878925144672394, "step": 81 }, { "epoch": 0.080078125, "grad_norm": 0.34045904874801636, "learning_rate": 0.00018508341511285574, "loss": 0.33476194739341736, "step": 82 }, { "epoch": 0.0810546875, "grad_norm": 0.48908546566963196, "learning_rate": 0.00018488714425907754, "loss": 1.003555178642273, "step": 83 }, { "epoch": 0.08203125, "grad_norm": 0.4683694839477539, "learning_rate": 0.0001846908734052993, "loss": 0.7300649285316467, "step": 84 }, { "epoch": 0.0830078125, "grad_norm": 0.3560928404331207, "learning_rate": 0.0001844946025515211, "loss": 0.4525097608566284, "step": 85 }, { "epoch": 0.083984375, "grad_norm": 1.481307864189148, "learning_rate": 0.0001842983316977429, "loss": 0.5444833040237427, "step": 86 }, { "epoch": 0.0849609375, "grad_norm": 0.42610403895378113, "learning_rate": 0.0001841020608439647, "loss": 0.7340827584266663, "step": 87 }, { "epoch": 0.0859375, "grad_norm": 0.6035026907920837, "learning_rate": 0.00018390578999018646, "loss": 0.5589049458503723, "step": 88 }, { "epoch": 0.0869140625, "grad_norm": 0.6075074076652527, "learning_rate": 0.00018370951913640826, "loss": 0.4969009757041931, "step": 89 }, { "epoch": 0.087890625, "grad_norm": 0.6751372814178467, "learning_rate": 0.00018351324828263003, "loss": 0.46451041102409363, "step": 90 }, { "epoch": 0.0888671875, "grad_norm": 0.5816373229026794, "learning_rate": 0.00018331697742885182, "loss": 1.024427056312561, "step": 91 }, { "epoch": 0.08984375, "grad_norm": 0.6644161939620972, "learning_rate": 0.00018312070657507362, "loss": 0.778592586517334, "step": 92 }, { "epoch": 0.0908203125, "grad_norm": 0.652209997177124, "learning_rate": 0.00018292443572129541, "loss": 0.8565710783004761, "step": 93 }, { "epoch": 0.091796875, "grad_norm": 0.9109074473381042, "learning_rate": 0.00018272816486751718, "loss": 0.6693978309631348, "step": 94 }, { "epoch": 0.0927734375, "grad_norm": 0.5235186219215393, "learning_rate": 0.00018253189401373895, "loss": 0.8255172967910767, "step": 95 }, { "epoch": 0.09375, "grad_norm": 0.8362122178077698, "learning_rate": 0.00018233562315996075, "loss": 0.5858157873153687, "step": 96 }, { "epoch": 0.0947265625, "grad_norm": 0.6753116846084595, "learning_rate": 0.00018213935230618254, "loss": 0.6682421565055847, "step": 97 }, { "epoch": 0.095703125, "grad_norm": 0.5394794940948486, "learning_rate": 0.00018194308145240434, "loss": 0.3218158781528473, "step": 98 }, { "epoch": 0.0966796875, "grad_norm": 3.2796010971069336, "learning_rate": 0.0001817468105986261, "loss": 0.681085467338562, "step": 99 }, { "epoch": 0.09765625, "grad_norm": 0.38390907645225525, "learning_rate": 0.0001815505397448479, "loss": 0.39554187655448914, "step": 100 }, { "epoch": 0.0986328125, "grad_norm": 0.5289499759674072, "learning_rate": 0.00018135426889106967, "loss": 1.0264520645141602, "step": 101 }, { "epoch": 0.099609375, "grad_norm": 0.8211148977279663, "learning_rate": 0.00018115799803729147, "loss": 0.8588113784790039, "step": 102 }, { "epoch": 0.1005859375, "grad_norm": 0.4771063029766083, "learning_rate": 0.00018096172718351327, "loss": 0.7471244931221008, "step": 103 }, { "epoch": 0.1015625, "grad_norm": 0.6326794624328613, "learning_rate": 0.00018076545632973506, "loss": 0.6081597805023193, "step": 104 }, { "epoch": 0.1025390625, "grad_norm": 0.7229248285293579, "learning_rate": 0.00018056918547595683, "loss": 0.8315082788467407, "step": 105 }, { "epoch": 0.103515625, "grad_norm": 0.6803163290023804, "learning_rate": 0.00018037291462217863, "loss": 0.8308911323547363, "step": 106 }, { "epoch": 0.1044921875, "grad_norm": 0.5268850326538086, "learning_rate": 0.0001801766437684004, "loss": 0.8480656743049622, "step": 107 }, { "epoch": 0.10546875, "grad_norm": 0.7849289178848267, "learning_rate": 0.0001799803729146222, "loss": 0.8200575113296509, "step": 108 }, { "epoch": 0.1064453125, "grad_norm": 0.4259982407093048, "learning_rate": 0.00017978410206084396, "loss": 0.44367721676826477, "step": 109 }, { "epoch": 0.107421875, "grad_norm": 0.4788619577884674, "learning_rate": 0.00017958783120706576, "loss": 0.6017763018608093, "step": 110 }, { "epoch": 0.1083984375, "grad_norm": 0.34434452652931213, "learning_rate": 0.00017939156035328755, "loss": 0.29681769013404846, "step": 111 }, { "epoch": 0.109375, "grad_norm": 1.1506884098052979, "learning_rate": 0.00017919528949950932, "loss": 0.6520863771438599, "step": 112 }, { "epoch": 0.1103515625, "grad_norm": 0.8348999619483948, "learning_rate": 0.00017899901864573112, "loss": 0.6035414934158325, "step": 113 }, { "epoch": 0.111328125, "grad_norm": 0.5550518035888672, "learning_rate": 0.00017880274779195289, "loss": 0.7711564302444458, "step": 114 }, { "epoch": 0.1123046875, "grad_norm": 0.28814634680747986, "learning_rate": 0.00017860647693817468, "loss": 0.8325987458229065, "step": 115 }, { "epoch": 0.11328125, "grad_norm": 0.3833630084991455, "learning_rate": 0.00017841020608439648, "loss": 0.3345921039581299, "step": 116 }, { "epoch": 0.1142578125, "grad_norm": 0.8784507513046265, "learning_rate": 0.00017821393523061827, "loss": 0.4186948239803314, "step": 117 }, { "epoch": 0.115234375, "grad_norm": 0.7263842225074768, "learning_rate": 0.00017801766437684004, "loss": 0.5570493936538696, "step": 118 }, { "epoch": 0.1162109375, "grad_norm": 0.6391569972038269, "learning_rate": 0.00017782139352306184, "loss": 1.0257431268692017, "step": 119 }, { "epoch": 0.1171875, "grad_norm": 0.6025450229644775, "learning_rate": 0.0001776251226692836, "loss": 0.8676729202270508, "step": 120 }, { "epoch": 0.1181640625, "grad_norm": 0.3776579201221466, "learning_rate": 0.0001774288518155054, "loss": 0.5870720148086548, "step": 121 }, { "epoch": 0.119140625, "grad_norm": 0.40912336111068726, "learning_rate": 0.0001772325809617272, "loss": 0.9210044145584106, "step": 122 }, { "epoch": 0.1201171875, "grad_norm": 0.5036085247993469, "learning_rate": 0.000177036310107949, "loss": 0.47378072142601013, "step": 123 }, { "epoch": 0.12109375, "grad_norm": 0.5508134961128235, "learning_rate": 0.00017684003925417076, "loss": 0.8295834064483643, "step": 124 }, { "epoch": 0.1220703125, "grad_norm": 0.5522392392158508, "learning_rate": 0.00017664376840039253, "loss": 0.793156087398529, "step": 125 }, { "epoch": 0.123046875, "grad_norm": 1.0098820924758911, "learning_rate": 0.00017644749754661433, "loss": 0.5780155658721924, "step": 126 }, { "epoch": 0.1240234375, "grad_norm": 0.6178780198097229, "learning_rate": 0.00017625122669283612, "loss": 0.5129156708717346, "step": 127 }, { "epoch": 0.125, "grad_norm": 0.6224352121353149, "learning_rate": 0.00017605495583905792, "loss": 0.8498928546905518, "step": 128 }, { "epoch": 0.1259765625, "grad_norm": 0.7869983315467834, "learning_rate": 0.0001758586849852797, "loss": 0.9180670976638794, "step": 129 }, { "epoch": 0.126953125, "grad_norm": 0.4122680127620697, "learning_rate": 0.00017566241413150148, "loss": 0.510919988155365, "step": 130 }, { "epoch": 0.1279296875, "grad_norm": 0.7221843004226685, "learning_rate": 0.00017546614327772325, "loss": 0.3977488875389099, "step": 131 }, { "epoch": 0.12890625, "grad_norm": 1.155800461769104, "learning_rate": 0.00017526987242394505, "loss": 0.6549078226089478, "step": 132 }, { "epoch": 0.1298828125, "grad_norm": 0.7164724469184875, "learning_rate": 0.00017507360157016685, "loss": 0.8306566476821899, "step": 133 }, { "epoch": 0.130859375, "grad_norm": 0.7600284814834595, "learning_rate": 0.00017487733071638864, "loss": 0.34278520941734314, "step": 134 }, { "epoch": 0.1318359375, "grad_norm": 0.8636081218719482, "learning_rate": 0.0001746810598626104, "loss": 0.8881778717041016, "step": 135 }, { "epoch": 0.1328125, "grad_norm": 1.0904357433319092, "learning_rate": 0.0001744847890088322, "loss": 0.4423227310180664, "step": 136 }, { "epoch": 0.1337890625, "grad_norm": 0.5639862418174744, "learning_rate": 0.00017428851815505397, "loss": 0.8610935211181641, "step": 137 }, { "epoch": 0.134765625, "grad_norm": 1.05929696559906, "learning_rate": 0.00017409224730127577, "loss": 1.1729753017425537, "step": 138 }, { "epoch": 0.1357421875, "grad_norm": 1.0731761455535889, "learning_rate": 0.00017389597644749757, "loss": 0.6459341049194336, "step": 139 }, { "epoch": 0.13671875, "grad_norm": 0.7464702725410461, "learning_rate": 0.00017369970559371934, "loss": 0.5368601083755493, "step": 140 }, { "epoch": 0.1376953125, "grad_norm": 0.5722304582595825, "learning_rate": 0.00017350343473994113, "loss": 0.9642695784568787, "step": 141 }, { "epoch": 0.138671875, "grad_norm": 0.5044945478439331, "learning_rate": 0.0001733071638861629, "loss": 0.49555253982543945, "step": 142 }, { "epoch": 0.1396484375, "grad_norm": 0.8069168329238892, "learning_rate": 0.0001731108930323847, "loss": 0.8796389698982239, "step": 143 }, { "epoch": 0.140625, "grad_norm": 0.5269959568977356, "learning_rate": 0.00017291462217860646, "loss": 0.9928920269012451, "step": 144 }, { "epoch": 0.1416015625, "grad_norm": 0.6606360077857971, "learning_rate": 0.0001727183513248283, "loss": 1.0528640747070312, "step": 145 }, { "epoch": 0.142578125, "grad_norm": 0.7145242691040039, "learning_rate": 0.00017252208047105006, "loss": 1.1252766847610474, "step": 146 }, { "epoch": 0.1435546875, "grad_norm": 0.5808660984039307, "learning_rate": 0.00017232580961727185, "loss": 0.24914072453975677, "step": 147 }, { "epoch": 0.14453125, "grad_norm": 0.8544529676437378, "learning_rate": 0.00017212953876349362, "loss": 0.4420434832572937, "step": 148 }, { "epoch": 0.1455078125, "grad_norm": 0.899334728717804, "learning_rate": 0.00017193326790971542, "loss": 0.7128512263298035, "step": 149 }, { "epoch": 0.146484375, "grad_norm": 0.36327579617500305, "learning_rate": 0.00017173699705593719, "loss": 0.5503419637680054, "step": 150 }, { "epoch": 0.1474609375, "grad_norm": 0.553255021572113, "learning_rate": 0.000171540726202159, "loss": 0.5796535015106201, "step": 151 }, { "epoch": 0.1484375, "grad_norm": 0.41036659479141235, "learning_rate": 0.00017134445534838078, "loss": 0.8935849666595459, "step": 152 }, { "epoch": 0.1494140625, "grad_norm": 0.3723013997077942, "learning_rate": 0.00017114818449460257, "loss": 0.39106485247612, "step": 153 }, { "epoch": 0.150390625, "grad_norm": 0.654262900352478, "learning_rate": 0.00017095191364082434, "loss": 1.0176405906677246, "step": 154 }, { "epoch": 0.1513671875, "grad_norm": 0.5707812309265137, "learning_rate": 0.0001707556427870461, "loss": 0.6580768823623657, "step": 155 }, { "epoch": 0.15234375, "grad_norm": 0.35879406332969666, "learning_rate": 0.0001705593719332679, "loss": 0.4050876200199127, "step": 156 }, { "epoch": 0.1533203125, "grad_norm": 0.5701449513435364, "learning_rate": 0.0001703631010794897, "loss": 0.9737375974655151, "step": 157 }, { "epoch": 0.154296875, "grad_norm": 0.4461202919483185, "learning_rate": 0.0001701668302257115, "loss": 0.9864733815193176, "step": 158 }, { "epoch": 0.1552734375, "grad_norm": 0.6229621767997742, "learning_rate": 0.00016997055937193327, "loss": 0.35883933305740356, "step": 159 }, { "epoch": 0.15625, "grad_norm": 0.5390028357505798, "learning_rate": 0.00016977428851815506, "loss": 0.5791765451431274, "step": 160 }, { "epoch": 0.1572265625, "grad_norm": 0.7851611375808716, "learning_rate": 0.00016957801766437683, "loss": 0.9032300114631653, "step": 161 }, { "epoch": 0.158203125, "grad_norm": 0.6211395263671875, "learning_rate": 0.00016938174681059863, "loss": 0.5069928765296936, "step": 162 }, { "epoch": 0.1591796875, "grad_norm": 0.8290377855300903, "learning_rate": 0.00016918547595682042, "loss": 0.8917738795280457, "step": 163 }, { "epoch": 0.16015625, "grad_norm": 0.42707324028015137, "learning_rate": 0.00016898920510304222, "loss": 0.606585681438446, "step": 164 }, { "epoch": 0.1611328125, "grad_norm": 0.49472010135650635, "learning_rate": 0.000168792934249264, "loss": 1.0100075006484985, "step": 165 }, { "epoch": 0.162109375, "grad_norm": 0.48441267013549805, "learning_rate": 0.00016859666339548579, "loss": 0.7145558595657349, "step": 166 }, { "epoch": 0.1630859375, "grad_norm": 0.5181763172149658, "learning_rate": 0.00016840039254170755, "loss": 0.8088749647140503, "step": 167 }, { "epoch": 0.1640625, "grad_norm": 0.4702328145503998, "learning_rate": 0.00016820412168792935, "loss": 0.5631542801856995, "step": 168 }, { "epoch": 0.1650390625, "grad_norm": 0.35454344749450684, "learning_rate": 0.00016800785083415115, "loss": 0.31744396686553955, "step": 169 }, { "epoch": 0.166015625, "grad_norm": 0.5193122029304504, "learning_rate": 0.00016781157998037291, "loss": 0.7338438034057617, "step": 170 }, { "epoch": 0.1669921875, "grad_norm": 0.49799400568008423, "learning_rate": 0.0001676153091265947, "loss": 0.7910654544830322, "step": 171 }, { "epoch": 0.16796875, "grad_norm": 0.4855571389198303, "learning_rate": 0.00016741903827281648, "loss": 0.38415610790252686, "step": 172 }, { "epoch": 0.1689453125, "grad_norm": 0.8796041011810303, "learning_rate": 0.00016722276741903828, "loss": 0.6042807102203369, "step": 173 }, { "epoch": 0.169921875, "grad_norm": 0.6005135774612427, "learning_rate": 0.00016702649656526007, "loss": 0.6617047786712646, "step": 174 }, { "epoch": 0.1708984375, "grad_norm": 0.6359293460845947, "learning_rate": 0.00016683022571148187, "loss": 0.5227914452552795, "step": 175 }, { "epoch": 0.171875, "grad_norm": 0.46007266640663147, "learning_rate": 0.00016663395485770364, "loss": 0.6881235837936401, "step": 176 }, { "epoch": 0.1728515625, "grad_norm": 0.37411797046661377, "learning_rate": 0.00016643768400392543, "loss": 0.7384200096130371, "step": 177 }, { "epoch": 0.173828125, "grad_norm": 0.4021860659122467, "learning_rate": 0.0001662414131501472, "loss": 1.1738500595092773, "step": 178 }, { "epoch": 0.1748046875, "grad_norm": 0.3674755096435547, "learning_rate": 0.000166045142296369, "loss": 0.37539663910865784, "step": 179 }, { "epoch": 0.17578125, "grad_norm": 0.5051441788673401, "learning_rate": 0.0001658488714425908, "loss": 0.6273016333580017, "step": 180 }, { "epoch": 0.1767578125, "grad_norm": 0.6807597279548645, "learning_rate": 0.0001656526005888126, "loss": 0.4195510447025299, "step": 181 }, { "epoch": 0.177734375, "grad_norm": 0.3345419466495514, "learning_rate": 0.00016545632973503436, "loss": 0.8546851873397827, "step": 182 }, { "epoch": 0.1787109375, "grad_norm": 0.33821800351142883, "learning_rate": 0.00016526005888125615, "loss": 0.522655725479126, "step": 183 }, { "epoch": 0.1796875, "grad_norm": 0.3145562708377838, "learning_rate": 0.00016506378802747792, "loss": 0.3799128532409668, "step": 184 }, { "epoch": 0.1806640625, "grad_norm": 0.44908636808395386, "learning_rate": 0.0001648675171736997, "loss": 0.6263326406478882, "step": 185 }, { "epoch": 0.181640625, "grad_norm": 0.7736865282058716, "learning_rate": 0.00016467124631992151, "loss": 0.3385460078716278, "step": 186 }, { "epoch": 0.1826171875, "grad_norm": 0.5184527635574341, "learning_rate": 0.00016447497546614328, "loss": 0.7980771064758301, "step": 187 }, { "epoch": 0.18359375, "grad_norm": 0.41774502396583557, "learning_rate": 0.00016427870461236508, "loss": 0.7745299339294434, "step": 188 }, { "epoch": 0.1845703125, "grad_norm": 0.43824154138565063, "learning_rate": 0.00016408243375858685, "loss": 0.9190135598182678, "step": 189 }, { "epoch": 0.185546875, "grad_norm": 0.4037880301475525, "learning_rate": 0.00016388616290480864, "loss": 0.5671911239624023, "step": 190 }, { "epoch": 0.1865234375, "grad_norm": 0.3757816255092621, "learning_rate": 0.0001636898920510304, "loss": 0.39916592836380005, "step": 191 }, { "epoch": 0.1875, "grad_norm": 0.4747844636440277, "learning_rate": 0.00016349362119725224, "loss": 0.9217299818992615, "step": 192 }, { "epoch": 0.1884765625, "grad_norm": 0.42307209968566895, "learning_rate": 0.000163297350343474, "loss": 0.8852982521057129, "step": 193 }, { "epoch": 0.189453125, "grad_norm": 0.47294488549232483, "learning_rate": 0.0001631010794896958, "loss": 1.0635476112365723, "step": 194 }, { "epoch": 0.1904296875, "grad_norm": 0.3519342243671417, "learning_rate": 0.00016290480863591757, "loss": 0.33460623025894165, "step": 195 }, { "epoch": 0.19140625, "grad_norm": 0.418151319026947, "learning_rate": 0.00016270853778213936, "loss": 0.8776851296424866, "step": 196 }, { "epoch": 0.1923828125, "grad_norm": 0.3954712152481079, "learning_rate": 0.00016251226692836113, "loss": 0.9358173608779907, "step": 197 }, { "epoch": 0.193359375, "grad_norm": 0.35646897554397583, "learning_rate": 0.00016231599607458293, "loss": 0.43795716762542725, "step": 198 }, { "epoch": 0.1943359375, "grad_norm": 0.41675063967704773, "learning_rate": 0.00016211972522080473, "loss": 0.8348654508590698, "step": 199 }, { "epoch": 0.1953125, "grad_norm": 0.5800544023513794, "learning_rate": 0.0001619234543670265, "loss": 0.5580507516860962, "step": 200 }, { "epoch": 0.1962890625, "grad_norm": 0.44925832748413086, "learning_rate": 0.0001617271835132483, "loss": 0.47444453835487366, "step": 201 }, { "epoch": 0.197265625, "grad_norm": 0.48447439074516296, "learning_rate": 0.00016153091265947006, "loss": 0.5927308797836304, "step": 202 }, { "epoch": 0.1982421875, "grad_norm": 0.37814846634864807, "learning_rate": 0.00016133464180569186, "loss": 0.8504298329353333, "step": 203 }, { "epoch": 0.19921875, "grad_norm": 0.4171026051044464, "learning_rate": 0.00016113837095191365, "loss": 1.0796414613723755, "step": 204 }, { "epoch": 0.2001953125, "grad_norm": 0.4570372402667999, "learning_rate": 0.00016094210009813545, "loss": 0.6229358315467834, "step": 205 }, { "epoch": 0.201171875, "grad_norm": 0.6294324994087219, "learning_rate": 0.00016074582924435722, "loss": 0.8749011158943176, "step": 206 }, { "epoch": 0.2021484375, "grad_norm": 0.42371129989624023, "learning_rate": 0.000160549558390579, "loss": 0.9866290092468262, "step": 207 }, { "epoch": 0.203125, "grad_norm": 0.5329370498657227, "learning_rate": 0.00016035328753680078, "loss": 0.7568405270576477, "step": 208 }, { "epoch": 0.2041015625, "grad_norm": 0.37205901741981506, "learning_rate": 0.00016015701668302258, "loss": 0.7115534543991089, "step": 209 }, { "epoch": 0.205078125, "grad_norm": 0.4536517262458801, "learning_rate": 0.00015996074582924437, "loss": 0.5152509808540344, "step": 210 }, { "epoch": 0.2060546875, "grad_norm": 2.319321393966675, "learning_rate": 0.00015976447497546617, "loss": 0.2915653586387634, "step": 211 }, { "epoch": 0.20703125, "grad_norm": 0.7047526836395264, "learning_rate": 0.00015956820412168794, "loss": 0.3070187568664551, "step": 212 }, { "epoch": 0.2080078125, "grad_norm": 0.6068500280380249, "learning_rate": 0.0001593719332679097, "loss": 0.8103427290916443, "step": 213 }, { "epoch": 0.208984375, "grad_norm": 0.3588794469833374, "learning_rate": 0.0001591756624141315, "loss": 0.4655485153198242, "step": 214 }, { "epoch": 0.2099609375, "grad_norm": 0.6561040878295898, "learning_rate": 0.0001589793915603533, "loss": 0.5353362560272217, "step": 215 }, { "epoch": 0.2109375, "grad_norm": 0.6485084891319275, "learning_rate": 0.0001587831207065751, "loss": 0.8601769804954529, "step": 216 }, { "epoch": 0.2119140625, "grad_norm": 0.4718208909034729, "learning_rate": 0.00015858684985279686, "loss": 0.6897189617156982, "step": 217 }, { "epoch": 0.212890625, "grad_norm": 0.7453560829162598, "learning_rate": 0.00015839057899901866, "loss": 1.0387171506881714, "step": 218 }, { "epoch": 0.2138671875, "grad_norm": 0.41157087683677673, "learning_rate": 0.00015819430814524043, "loss": 0.4910873770713806, "step": 219 }, { "epoch": 0.21484375, "grad_norm": 0.4198990762233734, "learning_rate": 0.00015799803729146222, "loss": 0.588080108165741, "step": 220 }, { "epoch": 0.2158203125, "grad_norm": 0.7791650295257568, "learning_rate": 0.00015780176643768402, "loss": 0.754984974861145, "step": 221 }, { "epoch": 0.216796875, "grad_norm": 1.4430909156799316, "learning_rate": 0.00015760549558390581, "loss": 0.5313946008682251, "step": 222 }, { "epoch": 0.2177734375, "grad_norm": 0.4399142861366272, "learning_rate": 0.00015740922473012758, "loss": 0.523280918598175, "step": 223 }, { "epoch": 0.21875, "grad_norm": 0.4177611470222473, "learning_rate": 0.00015721295387634938, "loss": 0.7598159313201904, "step": 224 }, { "epoch": 0.2197265625, "grad_norm": 0.4408816397190094, "learning_rate": 0.00015701668302257115, "loss": 0.8131666779518127, "step": 225 }, { "epoch": 0.220703125, "grad_norm": 0.4228694438934326, "learning_rate": 0.00015682041216879294, "loss": 1.0456180572509766, "step": 226 }, { "epoch": 0.2216796875, "grad_norm": 0.6313449144363403, "learning_rate": 0.00015662414131501474, "loss": 0.496864914894104, "step": 227 }, { "epoch": 0.22265625, "grad_norm": 0.48103493452072144, "learning_rate": 0.0001564278704612365, "loss": 0.5967347621917725, "step": 228 }, { "epoch": 0.2236328125, "grad_norm": 0.3548172116279602, "learning_rate": 0.0001562315996074583, "loss": 0.3325611650943756, "step": 229 }, { "epoch": 0.224609375, "grad_norm": 0.41543763875961304, "learning_rate": 0.00015603532875368007, "loss": 0.9223452806472778, "step": 230 }, { "epoch": 0.2255859375, "grad_norm": 0.6072061061859131, "learning_rate": 0.00015583905789990187, "loss": 0.2860236167907715, "step": 231 }, { "epoch": 0.2265625, "grad_norm": 0.3232869505882263, "learning_rate": 0.00015564278704612364, "loss": 0.7308738231658936, "step": 232 }, { "epoch": 0.2275390625, "grad_norm": 0.5271327495574951, "learning_rate": 0.00015544651619234546, "loss": 1.0354498624801636, "step": 233 }, { "epoch": 0.228515625, "grad_norm": 0.626105546951294, "learning_rate": 0.00015525024533856723, "loss": 1.0841856002807617, "step": 234 }, { "epoch": 0.2294921875, "grad_norm": 0.5628311634063721, "learning_rate": 0.00015505397448478903, "loss": 0.8868529200553894, "step": 235 }, { "epoch": 0.23046875, "grad_norm": 0.4290577471256256, "learning_rate": 0.0001548577036310108, "loss": 0.5887943506240845, "step": 236 }, { "epoch": 0.2314453125, "grad_norm": 0.743786096572876, "learning_rate": 0.0001546614327772326, "loss": 0.8314348459243774, "step": 237 }, { "epoch": 0.232421875, "grad_norm": 0.34498658776283264, "learning_rate": 0.00015446516192345436, "loss": 0.6171099543571472, "step": 238 }, { "epoch": 0.2333984375, "grad_norm": 0.7894997596740723, "learning_rate": 0.00015426889106967616, "loss": 0.614283561706543, "step": 239 }, { "epoch": 0.234375, "grad_norm": 0.4631381034851074, "learning_rate": 0.00015407262021589795, "loss": 0.6744101047515869, "step": 240 }, { "epoch": 0.2353515625, "grad_norm": 0.44523295760154724, "learning_rate": 0.00015387634936211975, "loss": 0.7094103097915649, "step": 241 }, { "epoch": 0.236328125, "grad_norm": 0.7059242725372314, "learning_rate": 0.00015368007850834152, "loss": 0.6856737732887268, "step": 242 }, { "epoch": 0.2373046875, "grad_norm": 1.0360506772994995, "learning_rate": 0.00015348380765456329, "loss": 1.101341962814331, "step": 243 }, { "epoch": 0.23828125, "grad_norm": 0.6630859375, "learning_rate": 0.00015328753680078508, "loss": 0.8815068006515503, "step": 244 }, { "epoch": 0.2392578125, "grad_norm": 0.4162105321884155, "learning_rate": 0.00015309126594700688, "loss": 0.39801689982414246, "step": 245 }, { "epoch": 0.240234375, "grad_norm": 0.5786510109901428, "learning_rate": 0.00015289499509322867, "loss": 0.5399383902549744, "step": 246 }, { "epoch": 0.2412109375, "grad_norm": 0.5430185794830322, "learning_rate": 0.00015269872423945044, "loss": 0.5432325601577759, "step": 247 }, { "epoch": 0.2421875, "grad_norm": 0.3750382959842682, "learning_rate": 0.00015250245338567224, "loss": 0.49265092611312866, "step": 248 }, { "epoch": 0.2431640625, "grad_norm": 0.5081580877304077, "learning_rate": 0.000152306182531894, "loss": 0.8720104098320007, "step": 249 }, { "epoch": 0.244140625, "grad_norm": 0.5619673728942871, "learning_rate": 0.0001521099116781158, "loss": 0.4022529125213623, "step": 250 }, { "epoch": 0.2451171875, "grad_norm": 0.3996225893497467, "learning_rate": 0.0001519136408243376, "loss": 0.443879097700119, "step": 251 }, { "epoch": 0.24609375, "grad_norm": 0.4688915014266968, "learning_rate": 0.0001517173699705594, "loss": 0.47562721371650696, "step": 252 }, { "epoch": 0.2470703125, "grad_norm": 1.7595641613006592, "learning_rate": 0.00015152109911678116, "loss": 0.5174474716186523, "step": 253 }, { "epoch": 0.248046875, "grad_norm": 0.47813650965690613, "learning_rate": 0.00015132482826300296, "loss": 0.8565359711647034, "step": 254 }, { "epoch": 0.2490234375, "grad_norm": 0.49612802267074585, "learning_rate": 0.00015112855740922473, "loss": 0.4736977815628052, "step": 255 }, { "epoch": 0.25, "grad_norm": 0.4370449483394623, "learning_rate": 0.00015093228655544652, "loss": 0.7566809058189392, "step": 256 }, { "epoch": 0.2509765625, "grad_norm": 0.43916988372802734, "learning_rate": 0.00015073601570166832, "loss": 0.8396226763725281, "step": 257 }, { "epoch": 0.251953125, "grad_norm": 0.7745673060417175, "learning_rate": 0.0001505397448478901, "loss": 0.3085971772670746, "step": 258 }, { "epoch": 0.2529296875, "grad_norm": 0.4097643792629242, "learning_rate": 0.00015034347399411188, "loss": 0.2730502188205719, "step": 259 }, { "epoch": 0.25390625, "grad_norm": 0.4131183624267578, "learning_rate": 0.00015014720314033365, "loss": 0.5422588586807251, "step": 260 }, { "epoch": 0.2548828125, "grad_norm": 0.469498872756958, "learning_rate": 0.00014995093228655545, "loss": 0.6572885513305664, "step": 261 }, { "epoch": 0.255859375, "grad_norm": 0.3662133514881134, "learning_rate": 0.00014975466143277725, "loss": 0.9272421598434448, "step": 262 }, { "epoch": 0.2568359375, "grad_norm": 0.38194844126701355, "learning_rate": 0.00014955839057899904, "loss": 0.6010634303092957, "step": 263 }, { "epoch": 0.2578125, "grad_norm": 0.3645467758178711, "learning_rate": 0.0001493621197252208, "loss": 0.9131143093109131, "step": 264 }, { "epoch": 0.2587890625, "grad_norm": 0.3304290771484375, "learning_rate": 0.0001491658488714426, "loss": 0.4593530297279358, "step": 265 }, { "epoch": 0.259765625, "grad_norm": 0.7529020309448242, "learning_rate": 0.00014896957801766437, "loss": 0.5219628810882568, "step": 266 }, { "epoch": 0.2607421875, "grad_norm": 0.4974548816680908, "learning_rate": 0.00014877330716388617, "loss": 0.7617945075035095, "step": 267 }, { "epoch": 0.26171875, "grad_norm": 0.28884655237197876, "learning_rate": 0.00014857703631010797, "loss": 0.4288986921310425, "step": 268 }, { "epoch": 0.2626953125, "grad_norm": 0.5195730328559875, "learning_rate": 0.00014838076545632976, "loss": 0.830593466758728, "step": 269 }, { "epoch": 0.263671875, "grad_norm": 0.40689924359321594, "learning_rate": 0.00014818449460255153, "loss": 0.7528857588768005, "step": 270 }, { "epoch": 0.2646484375, "grad_norm": 0.33955928683280945, "learning_rate": 0.00014798822374877333, "loss": 0.5274187326431274, "step": 271 }, { "epoch": 0.265625, "grad_norm": 1.0572726726531982, "learning_rate": 0.0001477919528949951, "loss": 0.7389089465141296, "step": 272 }, { "epoch": 0.2666015625, "grad_norm": 0.5191348791122437, "learning_rate": 0.00014759568204121686, "loss": 0.4842514991760254, "step": 273 }, { "epoch": 0.267578125, "grad_norm": 0.3779315650463104, "learning_rate": 0.00014739941118743866, "loss": 0.7406666278839111, "step": 274 }, { "epoch": 0.2685546875, "grad_norm": 0.6065999865531921, "learning_rate": 0.00014720314033366046, "loss": 0.6771246790885925, "step": 275 }, { "epoch": 0.26953125, "grad_norm": 0.537529468536377, "learning_rate": 0.00014700686947988225, "loss": 0.861257791519165, "step": 276 }, { "epoch": 0.2705078125, "grad_norm": 0.3961732089519501, "learning_rate": 0.00014681059862610402, "loss": 0.9672999382019043, "step": 277 }, { "epoch": 0.271484375, "grad_norm": 0.45974740386009216, "learning_rate": 0.00014661432777232582, "loss": 0.5789016485214233, "step": 278 }, { "epoch": 0.2724609375, "grad_norm": 0.7211292386054993, "learning_rate": 0.00014641805691854759, "loss": 0.867314338684082, "step": 279 }, { "epoch": 0.2734375, "grad_norm": 0.6938930749893188, "learning_rate": 0.00014622178606476938, "loss": 0.4570122957229614, "step": 280 }, { "epoch": 0.2744140625, "grad_norm": 0.5093329548835754, "learning_rate": 0.00014602551521099118, "loss": 0.9487482309341431, "step": 281 }, { "epoch": 0.275390625, "grad_norm": 0.4403358995914459, "learning_rate": 0.00014582924435721297, "loss": 0.5330759286880493, "step": 282 }, { "epoch": 0.2763671875, "grad_norm": 0.5305198431015015, "learning_rate": 0.00014563297350343474, "loss": 0.8727459907531738, "step": 283 }, { "epoch": 0.27734375, "grad_norm": 0.49577099084854126, "learning_rate": 0.00014543670264965654, "loss": 0.6166709065437317, "step": 284 }, { "epoch": 0.2783203125, "grad_norm": 0.4856763780117035, "learning_rate": 0.0001452404317958783, "loss": 0.920722484588623, "step": 285 }, { "epoch": 0.279296875, "grad_norm": 0.3397112786769867, "learning_rate": 0.0001450441609421001, "loss": 1.001542329788208, "step": 286 }, { "epoch": 0.2802734375, "grad_norm": 0.591691792011261, "learning_rate": 0.0001448478900883219, "loss": 0.4898494780063629, "step": 287 }, { "epoch": 0.28125, "grad_norm": 0.45293164253234863, "learning_rate": 0.00014465161923454367, "loss": 0.4958389401435852, "step": 288 }, { "epoch": 0.2822265625, "grad_norm": 0.38414305448532104, "learning_rate": 0.00014445534838076546, "loss": 0.3971215784549713, "step": 289 }, { "epoch": 0.283203125, "grad_norm": 0.5568608045578003, "learning_rate": 0.00014425907752698723, "loss": 0.7953230142593384, "step": 290 }, { "epoch": 0.2841796875, "grad_norm": 0.3680984377861023, "learning_rate": 0.00014406280667320903, "loss": 0.703729510307312, "step": 291 }, { "epoch": 0.28515625, "grad_norm": 0.4263870120048523, "learning_rate": 0.00014386653581943082, "loss": 0.7433100938796997, "step": 292 }, { "epoch": 0.2861328125, "grad_norm": 1.3262213468551636, "learning_rate": 0.00014367026496565262, "loss": 0.8011248111724854, "step": 293 }, { "epoch": 0.287109375, "grad_norm": 0.44766104221343994, "learning_rate": 0.0001434739941118744, "loss": 0.6682827472686768, "step": 294 }, { "epoch": 0.2880859375, "grad_norm": 0.7399169206619263, "learning_rate": 0.00014327772325809619, "loss": 0.8356127142906189, "step": 295 }, { "epoch": 0.2890625, "grad_norm": 0.3582242727279663, "learning_rate": 0.00014308145240431795, "loss": 0.7127545475959778, "step": 296 }, { "epoch": 0.2900390625, "grad_norm": 0.5251145958900452, "learning_rate": 0.00014288518155053975, "loss": 0.7467620968818665, "step": 297 }, { "epoch": 0.291015625, "grad_norm": 0.639377772808075, "learning_rate": 0.00014268891069676155, "loss": 0.434887170791626, "step": 298 }, { "epoch": 0.2919921875, "grad_norm": 0.5007404685020447, "learning_rate": 0.00014249263984298334, "loss": 1.028229832649231, "step": 299 }, { "epoch": 0.29296875, "grad_norm": 0.41101035475730896, "learning_rate": 0.0001422963689892051, "loss": 0.8766242265701294, "step": 300 }, { "epoch": 0.2939453125, "grad_norm": 0.3938690721988678, "learning_rate": 0.0001421000981354269, "loss": 0.7176960706710815, "step": 301 }, { "epoch": 0.294921875, "grad_norm": 0.5939344763755798, "learning_rate": 0.00014190382728164868, "loss": 0.6655953526496887, "step": 302 }, { "epoch": 0.2958984375, "grad_norm": 0.47224998474121094, "learning_rate": 0.00014170755642787047, "loss": 0.9155608415603638, "step": 303 }, { "epoch": 0.296875, "grad_norm": 0.41344454884529114, "learning_rate": 0.00014151128557409227, "loss": 0.6017557382583618, "step": 304 }, { "epoch": 0.2978515625, "grad_norm": 0.514320969581604, "learning_rate": 0.00014131501472031404, "loss": 0.6184566617012024, "step": 305 }, { "epoch": 0.298828125, "grad_norm": 0.5005887150764465, "learning_rate": 0.00014111874386653583, "loss": 0.6652892231941223, "step": 306 }, { "epoch": 0.2998046875, "grad_norm": 0.5872619152069092, "learning_rate": 0.0001409224730127576, "loss": 0.8618959784507751, "step": 307 }, { "epoch": 0.30078125, "grad_norm": 0.5114542245864868, "learning_rate": 0.0001407262021589794, "loss": 0.6637990474700928, "step": 308 }, { "epoch": 0.3017578125, "grad_norm": 1.141750693321228, "learning_rate": 0.00014052993130520117, "loss": 0.6234999299049377, "step": 309 }, { "epoch": 0.302734375, "grad_norm": 0.4786873459815979, "learning_rate": 0.000140333660451423, "loss": 0.9601540565490723, "step": 310 }, { "epoch": 0.3037109375, "grad_norm": 0.6048462390899658, "learning_rate": 0.00014013738959764476, "loss": 0.5895652770996094, "step": 311 }, { "epoch": 0.3046875, "grad_norm": 0.7435188889503479, "learning_rate": 0.00013994111874386655, "loss": 1.196149468421936, "step": 312 }, { "epoch": 0.3056640625, "grad_norm": 0.7936303019523621, "learning_rate": 0.00013974484789008832, "loss": 0.6073983907699585, "step": 313 }, { "epoch": 0.306640625, "grad_norm": 0.5199156403541565, "learning_rate": 0.00013954857703631012, "loss": 0.2734944224357605, "step": 314 }, { "epoch": 0.3076171875, "grad_norm": 0.38845276832580566, "learning_rate": 0.0001393523061825319, "loss": 0.604506254196167, "step": 315 }, { "epoch": 0.30859375, "grad_norm": 0.6925122737884521, "learning_rate": 0.0001391560353287537, "loss": 1.0446012020111084, "step": 316 }, { "epoch": 0.3095703125, "grad_norm": 0.4950433075428009, "learning_rate": 0.00013895976447497548, "loss": 1.027349591255188, "step": 317 }, { "epoch": 0.310546875, "grad_norm": 0.36179935932159424, "learning_rate": 0.00013876349362119725, "loss": 0.6760075688362122, "step": 318 }, { "epoch": 0.3115234375, "grad_norm": 0.3730153739452362, "learning_rate": 0.00013856722276741904, "loss": 0.47779884934425354, "step": 319 }, { "epoch": 0.3125, "grad_norm": 0.6181739568710327, "learning_rate": 0.0001383709519136408, "loss": 0.4747524857521057, "step": 320 }, { "epoch": 0.3134765625, "grad_norm": 0.8233240246772766, "learning_rate": 0.0001381746810598626, "loss": 0.490276575088501, "step": 321 }, { "epoch": 0.314453125, "grad_norm": 0.6492604613304138, "learning_rate": 0.0001379784102060844, "loss": 0.44847172498703003, "step": 322 }, { "epoch": 0.3154296875, "grad_norm": 0.5506369471549988, "learning_rate": 0.0001377821393523062, "loss": 0.47955968976020813, "step": 323 }, { "epoch": 0.31640625, "grad_norm": 0.4187554717063904, "learning_rate": 0.00013758586849852797, "loss": 0.6466250419616699, "step": 324 }, { "epoch": 0.3173828125, "grad_norm": 0.3976380527019501, "learning_rate": 0.00013738959764474977, "loss": 0.756473183631897, "step": 325 }, { "epoch": 0.318359375, "grad_norm": 0.6089552044868469, "learning_rate": 0.00013719332679097153, "loss": 0.9309840202331543, "step": 326 }, { "epoch": 0.3193359375, "grad_norm": 0.31628501415252686, "learning_rate": 0.00013699705593719333, "loss": 0.7739764451980591, "step": 327 }, { "epoch": 0.3203125, "grad_norm": 0.6984357237815857, "learning_rate": 0.00013680078508341513, "loss": 1.0047030448913574, "step": 328 }, { "epoch": 0.3212890625, "grad_norm": 0.42705219984054565, "learning_rate": 0.00013660451422963692, "loss": 0.5215034484863281, "step": 329 }, { "epoch": 0.322265625, "grad_norm": 0.3548984229564667, "learning_rate": 0.0001364082433758587, "loss": 0.777184009552002, "step": 330 }, { "epoch": 0.3232421875, "grad_norm": 0.6042805314064026, "learning_rate": 0.00013621197252208046, "loss": 0.469806432723999, "step": 331 }, { "epoch": 0.32421875, "grad_norm": 0.43482446670532227, "learning_rate": 0.00013601570166830226, "loss": 0.8123322129249573, "step": 332 }, { "epoch": 0.3251953125, "grad_norm": 0.4851783812046051, "learning_rate": 0.00013581943081452405, "loss": 1.1560527086257935, "step": 333 }, { "epoch": 0.326171875, "grad_norm": 0.681423008441925, "learning_rate": 0.00013562315996074585, "loss": 0.5681013464927673, "step": 334 }, { "epoch": 0.3271484375, "grad_norm": 0.43838411569595337, "learning_rate": 0.00013542688910696762, "loss": 0.8758999109268188, "step": 335 }, { "epoch": 0.328125, "grad_norm": 0.5508302450180054, "learning_rate": 0.0001352306182531894, "loss": 0.7725740671157837, "step": 336 }, { "epoch": 0.3291015625, "grad_norm": 0.2603519856929779, "learning_rate": 0.00013503434739941118, "loss": 0.357033908367157, "step": 337 }, { "epoch": 0.330078125, "grad_norm": 0.38098394870758057, "learning_rate": 0.00013483807654563298, "loss": 0.41752922534942627, "step": 338 }, { "epoch": 0.3310546875, "grad_norm": 0.5308575630187988, "learning_rate": 0.00013464180569185477, "loss": 0.6187021732330322, "step": 339 }, { "epoch": 0.33203125, "grad_norm": 0.4033392369747162, "learning_rate": 0.00013444553483807657, "loss": 0.9481551647186279, "step": 340 }, { "epoch": 0.3330078125, "grad_norm": 0.3999135494232178, "learning_rate": 0.00013424926398429834, "loss": 0.6853100657463074, "step": 341 }, { "epoch": 0.333984375, "grad_norm": 0.4521353840827942, "learning_rate": 0.00013405299313052013, "loss": 1.0335659980773926, "step": 342 }, { "epoch": 0.3349609375, "grad_norm": 0.3538281321525574, "learning_rate": 0.0001338567222767419, "loss": 0.821506142616272, "step": 343 }, { "epoch": 0.3359375, "grad_norm": 0.49575889110565186, "learning_rate": 0.0001336604514229637, "loss": 0.6124354004859924, "step": 344 }, { "epoch": 0.3369140625, "grad_norm": 0.37985700368881226, "learning_rate": 0.0001334641805691855, "loss": 0.6803320646286011, "step": 345 }, { "epoch": 0.337890625, "grad_norm": 0.3533600866794586, "learning_rate": 0.00013326790971540726, "loss": 0.7260403037071228, "step": 346 }, { "epoch": 0.3388671875, "grad_norm": 0.49213504791259766, "learning_rate": 0.00013307163886162906, "loss": 0.9051091074943542, "step": 347 }, { "epoch": 0.33984375, "grad_norm": 0.37704166769981384, "learning_rate": 0.00013287536800785083, "loss": 0.4471222460269928, "step": 348 }, { "epoch": 0.3408203125, "grad_norm": 0.4309573471546173, "learning_rate": 0.00013267909715407262, "loss": 0.749025285243988, "step": 349 }, { "epoch": 0.341796875, "grad_norm": 0.7491689920425415, "learning_rate": 0.0001324828263002944, "loss": 1.1318167448043823, "step": 350 }, { "epoch": 0.3427734375, "grad_norm": 0.3965498208999634, "learning_rate": 0.00013228655544651622, "loss": 0.8451839685440063, "step": 351 }, { "epoch": 0.34375, "grad_norm": 0.4446418285369873, "learning_rate": 0.00013209028459273798, "loss": 0.7875360250473022, "step": 352 }, { "epoch": 0.3447265625, "grad_norm": 0.3396705985069275, "learning_rate": 0.00013189401373895978, "loss": 0.8446518182754517, "step": 353 }, { "epoch": 0.345703125, "grad_norm": 0.3436250388622284, "learning_rate": 0.00013169774288518155, "loss": 0.8995112180709839, "step": 354 }, { "epoch": 0.3466796875, "grad_norm": 0.33643823862075806, "learning_rate": 0.00013150147203140334, "loss": 0.6253601312637329, "step": 355 }, { "epoch": 0.34765625, "grad_norm": 0.39978718757629395, "learning_rate": 0.0001313052011776251, "loss": 0.31882500648498535, "step": 356 }, { "epoch": 0.3486328125, "grad_norm": 0.3054925799369812, "learning_rate": 0.00013110893032384694, "loss": 0.3698769807815552, "step": 357 }, { "epoch": 0.349609375, "grad_norm": 0.3789948523044586, "learning_rate": 0.0001309126594700687, "loss": 0.9039162397384644, "step": 358 }, { "epoch": 0.3505859375, "grad_norm": 0.4192582964897156, "learning_rate": 0.0001307163886162905, "loss": 0.7852678298950195, "step": 359 }, { "epoch": 0.3515625, "grad_norm": 0.5130710601806641, "learning_rate": 0.00013052011776251227, "loss": 0.7745686769485474, "step": 360 }, { "epoch": 0.3525390625, "grad_norm": 0.39334234595298767, "learning_rate": 0.00013032384690873404, "loss": 0.7644802331924438, "step": 361 }, { "epoch": 0.353515625, "grad_norm": 0.6141180992126465, "learning_rate": 0.00013012757605495583, "loss": 0.6028044819831848, "step": 362 }, { "epoch": 0.3544921875, "grad_norm": 0.33263200521469116, "learning_rate": 0.00012993130520117763, "loss": 0.6908546090126038, "step": 363 }, { "epoch": 0.35546875, "grad_norm": 0.3901807367801666, "learning_rate": 0.00012973503434739943, "loss": 0.8896909952163696, "step": 364 }, { "epoch": 0.3564453125, "grad_norm": 0.3889808654785156, "learning_rate": 0.0001295387634936212, "loss": 0.622492790222168, "step": 365 }, { "epoch": 0.357421875, "grad_norm": 0.41004979610443115, "learning_rate": 0.000129342492639843, "loss": 0.6293104887008667, "step": 366 }, { "epoch": 0.3583984375, "grad_norm": 0.32929369807243347, "learning_rate": 0.00012914622178606476, "loss": 0.7049382925033569, "step": 367 }, { "epoch": 0.359375, "grad_norm": 0.5189999341964722, "learning_rate": 0.00012894995093228656, "loss": 0.9230547547340393, "step": 368 }, { "epoch": 0.3603515625, "grad_norm": 0.290991872549057, "learning_rate": 0.00012875368007850835, "loss": 0.5716772079467773, "step": 369 }, { "epoch": 0.361328125, "grad_norm": 0.3976893126964569, "learning_rate": 0.00012855740922473015, "loss": 0.4593455493450165, "step": 370 }, { "epoch": 0.3623046875, "grad_norm": 0.38385459780693054, "learning_rate": 0.00012836113837095192, "loss": 0.4766542315483093, "step": 371 }, { "epoch": 0.36328125, "grad_norm": 0.45652449131011963, "learning_rate": 0.0001281648675171737, "loss": 0.9292062520980835, "step": 372 }, { "epoch": 0.3642578125, "grad_norm": 0.384463906288147, "learning_rate": 0.00012796859666339548, "loss": 0.7896109223365784, "step": 373 }, { "epoch": 0.365234375, "grad_norm": 0.43412724137306213, "learning_rate": 0.00012777232580961728, "loss": 0.6185650825500488, "step": 374 }, { "epoch": 0.3662109375, "grad_norm": 0.4574507772922516, "learning_rate": 0.00012757605495583907, "loss": 0.5614027380943298, "step": 375 }, { "epoch": 0.3671875, "grad_norm": 0.2921536862850189, "learning_rate": 0.00012737978410206084, "loss": 0.26786333322525024, "step": 376 }, { "epoch": 0.3681640625, "grad_norm": 0.5887529850006104, "learning_rate": 0.00012718351324828264, "loss": 0.4167410433292389, "step": 377 }, { "epoch": 0.369140625, "grad_norm": 0.3651127815246582, "learning_rate": 0.0001269872423945044, "loss": 1.0140016078948975, "step": 378 }, { "epoch": 0.3701171875, "grad_norm": 0.47206228971481323, "learning_rate": 0.0001267909715407262, "loss": 0.8293377757072449, "step": 379 }, { "epoch": 0.37109375, "grad_norm": 0.6319689154624939, "learning_rate": 0.000126594700686948, "loss": 0.7301446795463562, "step": 380 }, { "epoch": 0.3720703125, "grad_norm": 0.5163951516151428, "learning_rate": 0.0001263984298331698, "loss": 0.9944421648979187, "step": 381 }, { "epoch": 0.373046875, "grad_norm": 0.519072949886322, "learning_rate": 0.00012620215897939156, "loss": 0.6176541447639465, "step": 382 }, { "epoch": 0.3740234375, "grad_norm": 3.0750813484191895, "learning_rate": 0.00012600588812561336, "loss": 0.7531320452690125, "step": 383 }, { "epoch": 0.375, "grad_norm": 0.3246331512928009, "learning_rate": 0.00012580961727183513, "loss": 0.3269459903240204, "step": 384 }, { "epoch": 0.3759765625, "grad_norm": 1.1105197668075562, "learning_rate": 0.00012561334641805692, "loss": 0.4228656589984894, "step": 385 }, { "epoch": 0.376953125, "grad_norm": 0.6776182055473328, "learning_rate": 0.00012541707556427872, "loss": 0.791953980922699, "step": 386 }, { "epoch": 0.3779296875, "grad_norm": 0.4413786828517914, "learning_rate": 0.00012522080471050052, "loss": 0.7953442335128784, "step": 387 }, { "epoch": 0.37890625, "grad_norm": 0.4036264419555664, "learning_rate": 0.00012502453385672228, "loss": 0.6062744855880737, "step": 388 }, { "epoch": 0.3798828125, "grad_norm": 1.0638166666030884, "learning_rate": 0.00012482826300294408, "loss": 1.0578093528747559, "step": 389 }, { "epoch": 0.380859375, "grad_norm": 0.2518276572227478, "learning_rate": 0.00012463199214916585, "loss": 0.5070685148239136, "step": 390 }, { "epoch": 0.3818359375, "grad_norm": 0.3338214159011841, "learning_rate": 0.00012443572129538762, "loss": 0.7665579915046692, "step": 391 }, { "epoch": 0.3828125, "grad_norm": 0.4730507433414459, "learning_rate": 0.00012423945044160944, "loss": 0.48353517055511475, "step": 392 }, { "epoch": 0.3837890625, "grad_norm": 0.3488924503326416, "learning_rate": 0.0001240431795878312, "loss": 0.4422420561313629, "step": 393 }, { "epoch": 0.384765625, "grad_norm": 0.2397361695766449, "learning_rate": 0.000123846908734053, "loss": 0.7025644183158875, "step": 394 }, { "epoch": 0.3857421875, "grad_norm": 0.3638167679309845, "learning_rate": 0.00012365063788027478, "loss": 0.5372107625007629, "step": 395 }, { "epoch": 0.38671875, "grad_norm": 0.4088346064090729, "learning_rate": 0.00012345436702649657, "loss": 0.7636011838912964, "step": 396 }, { "epoch": 0.3876953125, "grad_norm": 0.36985111236572266, "learning_rate": 0.00012325809617271834, "loss": 0.6720612645149231, "step": 397 }, { "epoch": 0.388671875, "grad_norm": 0.37556055188179016, "learning_rate": 0.00012306182531894016, "loss": 0.8087592124938965, "step": 398 }, { "epoch": 0.3896484375, "grad_norm": 0.6851724982261658, "learning_rate": 0.00012286555446516193, "loss": 0.780835747718811, "step": 399 }, { "epoch": 0.390625, "grad_norm": 0.3453989326953888, "learning_rate": 0.00012266928361138373, "loss": 0.8235517740249634, "step": 400 }, { "epoch": 0.3916015625, "grad_norm": 0.43622198700904846, "learning_rate": 0.0001224730127576055, "loss": 0.3758167028427124, "step": 401 }, { "epoch": 0.392578125, "grad_norm": 0.4364018142223358, "learning_rate": 0.0001222767419038273, "loss": 0.7123017907142639, "step": 402 }, { "epoch": 0.3935546875, "grad_norm": 0.24169716238975525, "learning_rate": 0.00012208047105004906, "loss": 0.48390328884124756, "step": 403 }, { "epoch": 0.39453125, "grad_norm": 3.4902851581573486, "learning_rate": 0.00012188420019627087, "loss": 0.8519951105117798, "step": 404 }, { "epoch": 0.3955078125, "grad_norm": 0.8332751989364624, "learning_rate": 0.00012168792934249264, "loss": 0.7562370896339417, "step": 405 }, { "epoch": 0.396484375, "grad_norm": 0.3582589030265808, "learning_rate": 0.00012149165848871442, "loss": 0.3723471164703369, "step": 406 }, { "epoch": 0.3974609375, "grad_norm": 0.48302146792411804, "learning_rate": 0.00012129538763493622, "loss": 1.0008171796798706, "step": 407 }, { "epoch": 0.3984375, "grad_norm": 0.3510138988494873, "learning_rate": 0.000121099116781158, "loss": 0.30772703886032104, "step": 408 }, { "epoch": 0.3994140625, "grad_norm": 0.2771015763282776, "learning_rate": 0.0001209028459273798, "loss": 0.4403090178966522, "step": 409 }, { "epoch": 0.400390625, "grad_norm": 0.42239415645599365, "learning_rate": 0.00012070657507360156, "loss": 0.5451241731643677, "step": 410 }, { "epoch": 0.4013671875, "grad_norm": 0.27876874804496765, "learning_rate": 0.00012051030421982336, "loss": 0.3590753972530365, "step": 411 }, { "epoch": 0.40234375, "grad_norm": 0.42854824662208557, "learning_rate": 0.00012031403336604514, "loss": 1.0192680358886719, "step": 412 }, { "epoch": 0.4033203125, "grad_norm": 0.32980695366859436, "learning_rate": 0.00012011776251226694, "loss": 0.6476566195487976, "step": 413 }, { "epoch": 0.404296875, "grad_norm": 0.45046037435531616, "learning_rate": 0.00011992149165848872, "loss": 0.9548048973083496, "step": 414 }, { "epoch": 0.4052734375, "grad_norm": 0.4176082909107208, "learning_rate": 0.00011972522080471052, "loss": 0.3793225586414337, "step": 415 }, { "epoch": 0.40625, "grad_norm": 0.335823118686676, "learning_rate": 0.00011952894995093229, "loss": 0.5807560086250305, "step": 416 }, { "epoch": 0.4072265625, "grad_norm": 0.4758591651916504, "learning_rate": 0.00011933267909715408, "loss": 0.3924551010131836, "step": 417 }, { "epoch": 0.408203125, "grad_norm": 0.21527709066867828, "learning_rate": 0.00011913640824337586, "loss": 0.1651245653629303, "step": 418 }, { "epoch": 0.4091796875, "grad_norm": 0.31255391240119934, "learning_rate": 0.00011894013738959766, "loss": 0.6133516430854797, "step": 419 }, { "epoch": 0.41015625, "grad_norm": 0.40668365359306335, "learning_rate": 0.00011874386653581944, "loss": 0.894720196723938, "step": 420 }, { "epoch": 0.4111328125, "grad_norm": 0.35574087500572205, "learning_rate": 0.00011854759568204121, "loss": 0.9017484188079834, "step": 421 }, { "epoch": 0.412109375, "grad_norm": 0.3389612138271332, "learning_rate": 0.00011835132482826301, "loss": 0.7961660623550415, "step": 422 }, { "epoch": 0.4130859375, "grad_norm": 0.8334202766418457, "learning_rate": 0.00011815505397448479, "loss": 0.8654063940048218, "step": 423 }, { "epoch": 0.4140625, "grad_norm": 0.5917571187019348, "learning_rate": 0.00011795878312070659, "loss": 0.631730318069458, "step": 424 }, { "epoch": 0.4150390625, "grad_norm": 0.4908443093299866, "learning_rate": 0.00011776251226692835, "loss": 0.3205869495868683, "step": 425 }, { "epoch": 0.416015625, "grad_norm": 0.8349789381027222, "learning_rate": 0.00011756624141315016, "loss": 0.8526176810264587, "step": 426 }, { "epoch": 0.4169921875, "grad_norm": 0.38712671399116516, "learning_rate": 0.00011736997055937193, "loss": 0.6580482125282288, "step": 427 }, { "epoch": 0.41796875, "grad_norm": 0.766034722328186, "learning_rate": 0.00011717369970559373, "loss": 0.5494309663772583, "step": 428 }, { "epoch": 0.4189453125, "grad_norm": 0.33322349190711975, "learning_rate": 0.00011697742885181551, "loss": 0.38351887464523315, "step": 429 }, { "epoch": 0.419921875, "grad_norm": 0.411155641078949, "learning_rate": 0.00011678115799803731, "loss": 0.8139836192131042, "step": 430 }, { "epoch": 0.4208984375, "grad_norm": 0.5857217907905579, "learning_rate": 0.00011658488714425908, "loss": 0.5668150186538696, "step": 431 }, { "epoch": 0.421875, "grad_norm": 0.8849710822105408, "learning_rate": 0.00011638861629048087, "loss": 0.5478008985519409, "step": 432 }, { "epoch": 0.4228515625, "grad_norm": 0.6771020293235779, "learning_rate": 0.00011619234543670265, "loss": 0.608709454536438, "step": 433 }, { "epoch": 0.423828125, "grad_norm": 0.30138713121414185, "learning_rate": 0.00011599607458292445, "loss": 0.8240669369697571, "step": 434 }, { "epoch": 0.4248046875, "grad_norm": 0.3273598253726959, "learning_rate": 0.00011579980372914623, "loss": 0.6287229657173157, "step": 435 }, { "epoch": 0.42578125, "grad_norm": 0.5044806003570557, "learning_rate": 0.000115603532875368, "loss": 0.735835075378418, "step": 436 }, { "epoch": 0.4267578125, "grad_norm": 0.34495776891708374, "learning_rate": 0.0001154072620215898, "loss": 0.7688421010971069, "step": 437 }, { "epoch": 0.427734375, "grad_norm": 0.41923069953918457, "learning_rate": 0.00011521099116781158, "loss": 0.679617166519165, "step": 438 }, { "epoch": 0.4287109375, "grad_norm": 0.3509843945503235, "learning_rate": 0.00011501472031403338, "loss": 0.7478575110435486, "step": 439 }, { "epoch": 0.4296875, "grad_norm": 0.4758707582950592, "learning_rate": 0.00011481844946025514, "loss": 0.48871147632598877, "step": 440 }, { "epoch": 0.4306640625, "grad_norm": 0.30272597074508667, "learning_rate": 0.00011462217860647695, "loss": 0.4311315715312958, "step": 441 }, { "epoch": 0.431640625, "grad_norm": 0.5226417779922485, "learning_rate": 0.00011442590775269872, "loss": 0.8198300004005432, "step": 442 }, { "epoch": 0.4326171875, "grad_norm": 0.41183850169181824, "learning_rate": 0.00011422963689892052, "loss": 0.9958367347717285, "step": 443 }, { "epoch": 0.43359375, "grad_norm": 0.384048193693161, "learning_rate": 0.0001140333660451423, "loss": 0.3194778859615326, "step": 444 }, { "epoch": 0.4345703125, "grad_norm": 0.5035115480422974, "learning_rate": 0.0001138370951913641, "loss": 0.6455928683280945, "step": 445 }, { "epoch": 0.435546875, "grad_norm": 0.4875551462173462, "learning_rate": 0.00011364082433758587, "loss": 0.799978494644165, "step": 446 }, { "epoch": 0.4365234375, "grad_norm": 0.3395763337612152, "learning_rate": 0.00011344455348380768, "loss": 0.47672414779663086, "step": 447 }, { "epoch": 0.4375, "grad_norm": 0.5594314932823181, "learning_rate": 0.00011324828263002944, "loss": 0.4325803518295288, "step": 448 }, { "epoch": 0.4384765625, "grad_norm": 0.44647228717803955, "learning_rate": 0.00011305201177625124, "loss": 0.8119433522224426, "step": 449 }, { "epoch": 0.439453125, "grad_norm": 0.3190518915653229, "learning_rate": 0.00011285574092247302, "loss": 0.4949466288089752, "step": 450 }, { "epoch": 0.4404296875, "grad_norm": 0.5943452715873718, "learning_rate": 0.00011265947006869479, "loss": 0.8245764374732971, "step": 451 }, { "epoch": 0.44140625, "grad_norm": 0.8067309260368347, "learning_rate": 0.00011246319921491659, "loss": 0.39331740140914917, "step": 452 }, { "epoch": 0.4423828125, "grad_norm": 0.4130857288837433, "learning_rate": 0.00011226692836113837, "loss": 1.0005946159362793, "step": 453 }, { "epoch": 0.443359375, "grad_norm": 0.6839224100112915, "learning_rate": 0.00011207065750736017, "loss": 0.453269362449646, "step": 454 }, { "epoch": 0.4443359375, "grad_norm": 0.6282085180282593, "learning_rate": 0.00011187438665358195, "loss": 0.7137607932090759, "step": 455 }, { "epoch": 0.4453125, "grad_norm": 0.49894508719444275, "learning_rate": 0.00011167811579980374, "loss": 0.6289803981781006, "step": 456 }, { "epoch": 0.4462890625, "grad_norm": 0.3570895493030548, "learning_rate": 0.00011148184494602551, "loss": 0.3711976110935211, "step": 457 }, { "epoch": 0.447265625, "grad_norm": 0.28931114077568054, "learning_rate": 0.00011128557409224731, "loss": 0.5629679560661316, "step": 458 }, { "epoch": 0.4482421875, "grad_norm": 1.2492791414260864, "learning_rate": 0.00011108930323846909, "loss": 0.5821082592010498, "step": 459 }, { "epoch": 0.44921875, "grad_norm": 0.29861876368522644, "learning_rate": 0.00011089303238469089, "loss": 0.4129573106765747, "step": 460 }, { "epoch": 0.4501953125, "grad_norm": 0.5244950652122498, "learning_rate": 0.00011069676153091267, "loss": 0.8300201296806335, "step": 461 }, { "epoch": 0.451171875, "grad_norm": 0.446435809135437, "learning_rate": 0.00011050049067713446, "loss": 0.7500958442687988, "step": 462 }, { "epoch": 0.4521484375, "grad_norm": 0.4531306028366089, "learning_rate": 0.00011030421982335623, "loss": 0.8492609262466431, "step": 463 }, { "epoch": 0.453125, "grad_norm": 0.46944308280944824, "learning_rate": 0.00011010794896957802, "loss": 0.6209090948104858, "step": 464 }, { "epoch": 0.4541015625, "grad_norm": 0.5465651154518127, "learning_rate": 0.00010991167811579981, "loss": 0.5176469087600708, "step": 465 }, { "epoch": 0.455078125, "grad_norm": 0.36550402641296387, "learning_rate": 0.00010971540726202158, "loss": 0.6358295679092407, "step": 466 }, { "epoch": 0.4560546875, "grad_norm": 0.48919910192489624, "learning_rate": 0.00010951913640824338, "loss": 0.5903019905090332, "step": 467 }, { "epoch": 0.45703125, "grad_norm": 0.4378332793712616, "learning_rate": 0.00010932286555446516, "loss": 0.6710047721862793, "step": 468 }, { "epoch": 0.4580078125, "grad_norm": 0.3095405101776123, "learning_rate": 0.00010912659470068696, "loss": 0.6787213683128357, "step": 469 }, { "epoch": 0.458984375, "grad_norm": 0.40901967883110046, "learning_rate": 0.00010893032384690874, "loss": 0.6371384859085083, "step": 470 }, { "epoch": 0.4599609375, "grad_norm": 0.3962486982345581, "learning_rate": 0.00010873405299313053, "loss": 0.5823498964309692, "step": 471 }, { "epoch": 0.4609375, "grad_norm": 0.4094708263874054, "learning_rate": 0.0001085377821393523, "loss": 1.0396480560302734, "step": 472 }, { "epoch": 0.4619140625, "grad_norm": 0.5117614269256592, "learning_rate": 0.0001083415112855741, "loss": 0.6320610642433167, "step": 473 }, { "epoch": 0.462890625, "grad_norm": 0.28345227241516113, "learning_rate": 0.00010814524043179588, "loss": 0.33279290795326233, "step": 474 }, { "epoch": 0.4638671875, "grad_norm": 0.5475791096687317, "learning_rate": 0.00010794896957801768, "loss": 0.359570175409317, "step": 475 }, { "epoch": 0.46484375, "grad_norm": 0.44176843762397766, "learning_rate": 0.00010775269872423946, "loss": 0.7576714158058167, "step": 476 }, { "epoch": 0.4658203125, "grad_norm": 0.473562628030777, "learning_rate": 0.00010755642787046125, "loss": 0.8758799433708191, "step": 477 }, { "epoch": 0.466796875, "grad_norm": 0.41919219493865967, "learning_rate": 0.00010736015701668302, "loss": 0.863654375076294, "step": 478 }, { "epoch": 0.4677734375, "grad_norm": 0.4215691089630127, "learning_rate": 0.0001071638861629048, "loss": 0.5004569292068481, "step": 479 }, { "epoch": 0.46875, "grad_norm": 0.36801034212112427, "learning_rate": 0.0001069676153091266, "loss": 0.9330754280090332, "step": 480 }, { "epoch": 0.4697265625, "grad_norm": 0.42489972710609436, "learning_rate": 0.00010677134445534837, "loss": 1.0529820919036865, "step": 481 }, { "epoch": 0.470703125, "grad_norm": 0.4067368507385254, "learning_rate": 0.00010657507360157018, "loss": 0.5453970432281494, "step": 482 }, { "epoch": 0.4716796875, "grad_norm": 0.28611162304878235, "learning_rate": 0.00010637880274779195, "loss": 0.2348572313785553, "step": 483 }, { "epoch": 0.47265625, "grad_norm": 0.40047627687454224, "learning_rate": 0.00010618253189401374, "loss": 0.4776308834552765, "step": 484 }, { "epoch": 0.4736328125, "grad_norm": 0.5168628692626953, "learning_rate": 0.00010598626104023553, "loss": 0.9922167062759399, "step": 485 }, { "epoch": 0.474609375, "grad_norm": 0.3620246946811676, "learning_rate": 0.00010578999018645732, "loss": 0.7285036444664001, "step": 486 }, { "epoch": 0.4755859375, "grad_norm": 0.42711782455444336, "learning_rate": 0.00010559371933267909, "loss": 0.6387231349945068, "step": 487 }, { "epoch": 0.4765625, "grad_norm": 0.2139827311038971, "learning_rate": 0.0001053974484789009, "loss": 0.4295338988304138, "step": 488 }, { "epoch": 0.4775390625, "grad_norm": 0.31191739439964294, "learning_rate": 0.00010520117762512267, "loss": 0.42860671877861023, "step": 489 }, { "epoch": 0.478515625, "grad_norm": 0.2909379303455353, "learning_rate": 0.00010500490677134447, "loss": 0.47065097093582153, "step": 490 }, { "epoch": 0.4794921875, "grad_norm": 0.48990437388420105, "learning_rate": 0.00010480863591756625, "loss": 0.8870656490325928, "step": 491 }, { "epoch": 0.48046875, "grad_norm": 0.5662127733230591, "learning_rate": 0.00010461236506378804, "loss": 0.8007984161376953, "step": 492 }, { "epoch": 0.4814453125, "grad_norm": 0.3656634986400604, "learning_rate": 0.00010441609421000981, "loss": 0.41389334201812744, "step": 493 }, { "epoch": 0.482421875, "grad_norm": 0.39840465784072876, "learning_rate": 0.0001042198233562316, "loss": 0.6927056908607483, "step": 494 }, { "epoch": 0.4833984375, "grad_norm": 0.641647219657898, "learning_rate": 0.00010402355250245339, "loss": 0.7912976145744324, "step": 495 }, { "epoch": 0.484375, "grad_norm": 0.4522266685962677, "learning_rate": 0.00010382728164867517, "loss": 0.615374743938446, "step": 496 }, { "epoch": 0.4853515625, "grad_norm": 0.415444016456604, "learning_rate": 0.00010363101079489697, "loss": 0.8559135794639587, "step": 497 }, { "epoch": 0.486328125, "grad_norm": 0.4477578401565552, "learning_rate": 0.00010343473994111874, "loss": 0.6109384298324585, "step": 498 }, { "epoch": 0.4873046875, "grad_norm": 0.33097633719444275, "learning_rate": 0.00010323846908734053, "loss": 0.6325762271881104, "step": 499 }, { "epoch": 0.48828125, "grad_norm": 0.38771572709083557, "learning_rate": 0.00010304219823356232, "loss": 0.5979640483856201, "step": 500 }, { "epoch": 0.4892578125, "grad_norm": 0.3339928984642029, "learning_rate": 0.00010284592737978411, "loss": 0.6619001626968384, "step": 501 }, { "epoch": 0.490234375, "grad_norm": 0.6400135159492493, "learning_rate": 0.00010264965652600588, "loss": 0.28338727355003357, "step": 502 }, { "epoch": 0.4912109375, "grad_norm": 0.35763970017433167, "learning_rate": 0.00010245338567222769, "loss": 0.6373124122619629, "step": 503 }, { "epoch": 0.4921875, "grad_norm": 0.2136622965335846, "learning_rate": 0.00010225711481844946, "loss": 0.2315329760313034, "step": 504 }, { "epoch": 0.4931640625, "grad_norm": 0.6324110627174377, "learning_rate": 0.00010206084396467126, "loss": 1.0045514106750488, "step": 505 }, { "epoch": 0.494140625, "grad_norm": 0.4471307694911957, "learning_rate": 0.00010186457311089304, "loss": 0.5188390016555786, "step": 506 }, { "epoch": 0.4951171875, "grad_norm": 0.38222211599349976, "learning_rate": 0.00010166830225711483, "loss": 0.7351740598678589, "step": 507 }, { "epoch": 0.49609375, "grad_norm": 0.41885000467300415, "learning_rate": 0.0001014720314033366, "loss": 0.9071688055992126, "step": 508 }, { "epoch": 0.4970703125, "grad_norm": 0.8193621635437012, "learning_rate": 0.00010127576054955839, "loss": 0.7240473031997681, "step": 509 }, { "epoch": 0.498046875, "grad_norm": 0.2846645712852478, "learning_rate": 0.00010107948969578018, "loss": 0.351628839969635, "step": 510 }, { "epoch": 0.4990234375, "grad_norm": 0.4778954088687897, "learning_rate": 0.00010088321884200196, "loss": 0.7705833911895752, "step": 511 }, { "epoch": 0.5, "grad_norm": 0.3384702503681183, "learning_rate": 0.00010068694798822376, "loss": 0.5467265248298645, "step": 512 }, { "epoch": 0.5009765625, "grad_norm": 0.43917056918144226, "learning_rate": 0.00010049067713444553, "loss": 0.9810686707496643, "step": 513 }, { "epoch": 0.501953125, "grad_norm": 0.4351615607738495, "learning_rate": 0.00010029440628066732, "loss": 0.9716764688491821, "step": 514 }, { "epoch": 0.5029296875, "grad_norm": 0.49873459339141846, "learning_rate": 0.00010009813542688911, "loss": 0.9183788299560547, "step": 515 }, { "epoch": 0.50390625, "grad_norm": 0.36710789799690247, "learning_rate": 9.990186457311089e-05, "loss": 0.49884548783302307, "step": 516 }, { "epoch": 0.5048828125, "grad_norm": 0.5402531623840332, "learning_rate": 9.970559371933269e-05, "loss": 0.6645570993423462, "step": 517 }, { "epoch": 0.505859375, "grad_norm": 0.4990559220314026, "learning_rate": 9.950932286555447e-05, "loss": 1.0321924686431885, "step": 518 }, { "epoch": 0.5068359375, "grad_norm": 0.4634752869606018, "learning_rate": 9.931305201177625e-05, "loss": 0.8484972715377808, "step": 519 }, { "epoch": 0.5078125, "grad_norm": 0.38584330677986145, "learning_rate": 9.911678115799805e-05, "loss": 0.3424939513206482, "step": 520 }, { "epoch": 0.5087890625, "grad_norm": 0.41148415207862854, "learning_rate": 9.892051030421983e-05, "loss": 0.7890703678131104, "step": 521 }, { "epoch": 0.509765625, "grad_norm": 0.35891374945640564, "learning_rate": 9.872423945044161e-05, "loss": 0.7387750744819641, "step": 522 }, { "epoch": 0.5107421875, "grad_norm": 0.4174203872680664, "learning_rate": 9.85279685966634e-05, "loss": 0.5610706806182861, "step": 523 }, { "epoch": 0.51171875, "grad_norm": 0.4062010645866394, "learning_rate": 9.833169774288519e-05, "loss": 0.6016039252281189, "step": 524 }, { "epoch": 0.5126953125, "grad_norm": 0.35915061831474304, "learning_rate": 9.813542688910697e-05, "loss": 0.37933990359306335, "step": 525 }, { "epoch": 0.513671875, "grad_norm": 0.49826234579086304, "learning_rate": 9.793915603532877e-05, "loss": 0.9650976657867432, "step": 526 }, { "epoch": 0.5146484375, "grad_norm": 0.4122180938720703, "learning_rate": 9.774288518155055e-05, "loss": 0.5477824211120605, "step": 527 }, { "epoch": 0.515625, "grad_norm": 0.3824058175086975, "learning_rate": 9.754661432777233e-05, "loss": 0.5163108706474304, "step": 528 }, { "epoch": 0.5166015625, "grad_norm": 0.4485555589199066, "learning_rate": 9.735034347399413e-05, "loss": 0.9402418732643127, "step": 529 }, { "epoch": 0.517578125, "grad_norm": 0.4053209722042084, "learning_rate": 9.715407262021591e-05, "loss": 0.9314478039741516, "step": 530 }, { "epoch": 0.5185546875, "grad_norm": 0.3183811604976654, "learning_rate": 9.695780176643768e-05, "loss": 0.6706205606460571, "step": 531 }, { "epoch": 0.51953125, "grad_norm": 0.40083932876586914, "learning_rate": 9.676153091265947e-05, "loss": 1.102424144744873, "step": 532 }, { "epoch": 0.5205078125, "grad_norm": 0.5949054956436157, "learning_rate": 9.656526005888126e-05, "loss": 0.8396608829498291, "step": 533 }, { "epoch": 0.521484375, "grad_norm": 0.41966959834098816, "learning_rate": 9.636898920510304e-05, "loss": 0.5641101002693176, "step": 534 }, { "epoch": 0.5224609375, "grad_norm": 0.448281466960907, "learning_rate": 9.617271835132484e-05, "loss": 0.44873932003974915, "step": 535 }, { "epoch": 0.5234375, "grad_norm": 0.47785645723342896, "learning_rate": 9.597644749754662e-05, "loss": 0.8799008131027222, "step": 536 }, { "epoch": 0.5244140625, "grad_norm": 0.45459261536598206, "learning_rate": 9.57801766437684e-05, "loss": 0.8261788487434387, "step": 537 }, { "epoch": 0.525390625, "grad_norm": 0.6168074607849121, "learning_rate": 9.55839057899902e-05, "loss": 0.9762136936187744, "step": 538 }, { "epoch": 0.5263671875, "grad_norm": 0.6500818133354187, "learning_rate": 9.538763493621198e-05, "loss": 0.9044640064239502, "step": 539 }, { "epoch": 0.52734375, "grad_norm": 0.31668490171432495, "learning_rate": 9.519136408243376e-05, "loss": 0.42503029108047485, "step": 540 }, { "epoch": 0.5283203125, "grad_norm": 0.4041314721107483, "learning_rate": 9.499509322865556e-05, "loss": 0.6643175482749939, "step": 541 }, { "epoch": 0.529296875, "grad_norm": 1.011020541191101, "learning_rate": 9.479882237487734e-05, "loss": 0.7636033892631531, "step": 542 }, { "epoch": 0.5302734375, "grad_norm": 0.3690396845340729, "learning_rate": 9.460255152109912e-05, "loss": 1.0516947507858276, "step": 543 }, { "epoch": 0.53125, "grad_norm": 0.288604199886322, "learning_rate": 9.440628066732092e-05, "loss": 0.3806208372116089, "step": 544 }, { "epoch": 0.5322265625, "grad_norm": 0.4247501790523529, "learning_rate": 9.42100098135427e-05, "loss": 0.8651745319366455, "step": 545 }, { "epoch": 0.533203125, "grad_norm": 1.1893255710601807, "learning_rate": 9.401373895976447e-05, "loss": 0.28601521253585815, "step": 546 }, { "epoch": 0.5341796875, "grad_norm": 0.3229619562625885, "learning_rate": 9.381746810598626e-05, "loss": 0.8316909670829773, "step": 547 }, { "epoch": 0.53515625, "grad_norm": 0.390278160572052, "learning_rate": 9.362119725220805e-05, "loss": 0.7263185977935791, "step": 548 }, { "epoch": 0.5361328125, "grad_norm": 0.2949998378753662, "learning_rate": 9.342492639842983e-05, "loss": 0.5417062044143677, "step": 549 }, { "epoch": 0.537109375, "grad_norm": 0.47482210397720337, "learning_rate": 9.322865554465163e-05, "loss": 0.6505849361419678, "step": 550 }, { "epoch": 0.5380859375, "grad_norm": 0.3653123676776886, "learning_rate": 9.303238469087341e-05, "loss": 0.7270935773849487, "step": 551 }, { "epoch": 0.5390625, "grad_norm": 0.5652351975440979, "learning_rate": 9.283611383709519e-05, "loss": 0.8330069780349731, "step": 552 }, { "epoch": 0.5400390625, "grad_norm": 0.448408842086792, "learning_rate": 9.263984298331699e-05, "loss": 0.8804951310157776, "step": 553 }, { "epoch": 0.541015625, "grad_norm": 0.7700690031051636, "learning_rate": 9.244357212953877e-05, "loss": 0.6466813087463379, "step": 554 }, { "epoch": 0.5419921875, "grad_norm": 0.45755863189697266, "learning_rate": 9.224730127576055e-05, "loss": 0.5548572540283203, "step": 555 }, { "epoch": 0.54296875, "grad_norm": 0.4113846719264984, "learning_rate": 9.205103042198235e-05, "loss": 0.9286736845970154, "step": 556 }, { "epoch": 0.5439453125, "grad_norm": 0.4555431604385376, "learning_rate": 9.185475956820413e-05, "loss": 0.8332977890968323, "step": 557 }, { "epoch": 0.544921875, "grad_norm": 0.5103408098220825, "learning_rate": 9.165848871442591e-05, "loss": 1.0110094547271729, "step": 558 }, { "epoch": 0.5458984375, "grad_norm": 0.299912691116333, "learning_rate": 9.146221786064771e-05, "loss": 0.3136459290981293, "step": 559 }, { "epoch": 0.546875, "grad_norm": 0.40499091148376465, "learning_rate": 9.126594700686948e-05, "loss": 0.6785961389541626, "step": 560 }, { "epoch": 0.5478515625, "grad_norm": 0.4190375804901123, "learning_rate": 9.106967615309127e-05, "loss": 0.9891744256019592, "step": 561 }, { "epoch": 0.548828125, "grad_norm": 0.6265519261360168, "learning_rate": 9.087340529931305e-05, "loss": 0.48712462186813354, "step": 562 }, { "epoch": 0.5498046875, "grad_norm": 0.466420978307724, "learning_rate": 9.067713444553484e-05, "loss": 0.5573943257331848, "step": 563 }, { "epoch": 0.55078125, "grad_norm": 0.3990301191806793, "learning_rate": 9.048086359175663e-05, "loss": 0.5893411040306091, "step": 564 }, { "epoch": 0.5517578125, "grad_norm": 0.31471043825149536, "learning_rate": 9.028459273797842e-05, "loss": 0.593424379825592, "step": 565 }, { "epoch": 0.552734375, "grad_norm": 0.46789905428886414, "learning_rate": 9.00883218842002e-05, "loss": 0.9398684501647949, "step": 566 }, { "epoch": 0.5537109375, "grad_norm": 0.48358282446861267, "learning_rate": 8.989205103042198e-05, "loss": 0.895098865032196, "step": 567 }, { "epoch": 0.5546875, "grad_norm": 0.25878453254699707, "learning_rate": 8.969578017664378e-05, "loss": 0.4817226231098175, "step": 568 }, { "epoch": 0.5556640625, "grad_norm": 0.5319378972053528, "learning_rate": 8.949950932286556e-05, "loss": 0.6119651794433594, "step": 569 }, { "epoch": 0.556640625, "grad_norm": 0.3002898097038269, "learning_rate": 8.930323846908734e-05, "loss": 0.28599199652671814, "step": 570 }, { "epoch": 0.5576171875, "grad_norm": 0.37161317467689514, "learning_rate": 8.910696761530914e-05, "loss": 0.3307079076766968, "step": 571 }, { "epoch": 0.55859375, "grad_norm": 0.4755436182022095, "learning_rate": 8.891069676153092e-05, "loss": 0.5868921279907227, "step": 572 }, { "epoch": 0.5595703125, "grad_norm": 0.3264123499393463, "learning_rate": 8.87144259077527e-05, "loss": 0.6682905554771423, "step": 573 }, { "epoch": 0.560546875, "grad_norm": 0.43468573689460754, "learning_rate": 8.85181550539745e-05, "loss": 0.6316066980361938, "step": 574 }, { "epoch": 0.5615234375, "grad_norm": 0.5759416222572327, "learning_rate": 8.832188420019627e-05, "loss": 0.5687480568885803, "step": 575 }, { "epoch": 0.5625, "grad_norm": 0.39352041482925415, "learning_rate": 8.812561334641806e-05, "loss": 0.3803275525569916, "step": 576 }, { "epoch": 0.5634765625, "grad_norm": 0.4155440926551819, "learning_rate": 8.792934249263984e-05, "loss": 0.3923049569129944, "step": 577 }, { "epoch": 0.564453125, "grad_norm": 0.34934133291244507, "learning_rate": 8.773307163886163e-05, "loss": 0.7100962996482849, "step": 578 }, { "epoch": 0.5654296875, "grad_norm": 0.3993069529533386, "learning_rate": 8.753680078508342e-05, "loss": 0.6711176037788391, "step": 579 }, { "epoch": 0.56640625, "grad_norm": 0.3445776700973511, "learning_rate": 8.73405299313052e-05, "loss": 0.6986067295074463, "step": 580 }, { "epoch": 0.5673828125, "grad_norm": 0.45837292075157166, "learning_rate": 8.714425907752699e-05, "loss": 0.9020513892173767, "step": 581 }, { "epoch": 0.568359375, "grad_norm": 0.3630208671092987, "learning_rate": 8.694798822374878e-05, "loss": 0.42499858140945435, "step": 582 }, { "epoch": 0.5693359375, "grad_norm": 0.41205838322639465, "learning_rate": 8.675171736997057e-05, "loss": 0.5535018444061279, "step": 583 }, { "epoch": 0.5703125, "grad_norm": 0.2596284747123718, "learning_rate": 8.655544651619235e-05, "loss": 0.3234618902206421, "step": 584 }, { "epoch": 0.5712890625, "grad_norm": 0.3716956079006195, "learning_rate": 8.635917566241414e-05, "loss": 0.7567611932754517, "step": 585 }, { "epoch": 0.572265625, "grad_norm": 0.42999619245529175, "learning_rate": 8.616290480863593e-05, "loss": 0.8695427179336548, "step": 586 }, { "epoch": 0.5732421875, "grad_norm": 0.3309305012226105, "learning_rate": 8.596663395485771e-05, "loss": 0.989714503288269, "step": 587 }, { "epoch": 0.57421875, "grad_norm": 0.40024474263191223, "learning_rate": 8.57703631010795e-05, "loss": 1.0608711242675781, "step": 588 }, { "epoch": 0.5751953125, "grad_norm": 0.453950434923172, "learning_rate": 8.557409224730129e-05, "loss": 0.7340632677078247, "step": 589 }, { "epoch": 0.576171875, "grad_norm": 0.4473342299461365, "learning_rate": 8.537782139352306e-05, "loss": 0.7264219522476196, "step": 590 }, { "epoch": 0.5771484375, "grad_norm": 0.420469731092453, "learning_rate": 8.518155053974485e-05, "loss": 0.8141539692878723, "step": 591 }, { "epoch": 0.578125, "grad_norm": 0.4068243205547333, "learning_rate": 8.498527968596663e-05, "loss": 0.5802872180938721, "step": 592 }, { "epoch": 0.5791015625, "grad_norm": 0.4243272840976715, "learning_rate": 8.478900883218842e-05, "loss": 0.350595086812973, "step": 593 }, { "epoch": 0.580078125, "grad_norm": 0.4519834518432617, "learning_rate": 8.459273797841021e-05, "loss": 0.7131458520889282, "step": 594 }, { "epoch": 0.5810546875, "grad_norm": 0.34145745635032654, "learning_rate": 8.4396467124632e-05, "loss": 0.7618221640586853, "step": 595 }, { "epoch": 0.58203125, "grad_norm": 0.46494174003601074, "learning_rate": 8.420019627085378e-05, "loss": 0.5102145075798035, "step": 596 }, { "epoch": 0.5830078125, "grad_norm": 0.3305060565471649, "learning_rate": 8.400392541707557e-05, "loss": 0.7812811732292175, "step": 597 }, { "epoch": 0.583984375, "grad_norm": 0.47092583775520325, "learning_rate": 8.380765456329736e-05, "loss": 0.7497634887695312, "step": 598 }, { "epoch": 0.5849609375, "grad_norm": 0.38902655243873596, "learning_rate": 8.361138370951914e-05, "loss": 0.4198119640350342, "step": 599 }, { "epoch": 0.5859375, "grad_norm": 0.43659287691116333, "learning_rate": 8.341511285574093e-05, "loss": 0.824333667755127, "step": 600 }, { "epoch": 0.5869140625, "grad_norm": 0.4277879595756531, "learning_rate": 8.321884200196272e-05, "loss": 0.445267915725708, "step": 601 }, { "epoch": 0.587890625, "grad_norm": 0.3186829090118408, "learning_rate": 8.30225711481845e-05, "loss": 0.9906235337257385, "step": 602 }, { "epoch": 0.5888671875, "grad_norm": 0.2983294427394867, "learning_rate": 8.28263002944063e-05, "loss": 0.5342146754264832, "step": 603 }, { "epoch": 0.58984375, "grad_norm": 0.4127228856086731, "learning_rate": 8.263002944062808e-05, "loss": 0.41288450360298157, "step": 604 }, { "epoch": 0.5908203125, "grad_norm": 0.3961617052555084, "learning_rate": 8.243375858684985e-05, "loss": 0.43576663732528687, "step": 605 }, { "epoch": 0.591796875, "grad_norm": 0.4124387502670288, "learning_rate": 8.223748773307164e-05, "loss": 0.5837401747703552, "step": 606 }, { "epoch": 0.5927734375, "grad_norm": 0.4274151921272278, "learning_rate": 8.204121687929342e-05, "loss": 0.8666547536849976, "step": 607 }, { "epoch": 0.59375, "grad_norm": 0.3881700932979584, "learning_rate": 8.18449460255152e-05, "loss": 0.9063656330108643, "step": 608 }, { "epoch": 0.5947265625, "grad_norm": 0.46216556429862976, "learning_rate": 8.1648675171737e-05, "loss": 0.4573599696159363, "step": 609 }, { "epoch": 0.595703125, "grad_norm": 0.3843960762023926, "learning_rate": 8.145240431795878e-05, "loss": 0.6214632391929626, "step": 610 }, { "epoch": 0.5966796875, "grad_norm": 0.538301408290863, "learning_rate": 8.125613346418057e-05, "loss": 0.8800979852676392, "step": 611 }, { "epoch": 0.59765625, "grad_norm": 0.49643319845199585, "learning_rate": 8.105986261040236e-05, "loss": 0.48715031147003174, "step": 612 }, { "epoch": 0.5986328125, "grad_norm": 0.4753062427043915, "learning_rate": 8.086359175662415e-05, "loss": 0.8127011060714722, "step": 613 }, { "epoch": 0.599609375, "grad_norm": 0.7572022676467896, "learning_rate": 8.066732090284593e-05, "loss": 0.7151535153388977, "step": 614 }, { "epoch": 0.6005859375, "grad_norm": 0.35117295384407043, "learning_rate": 8.047105004906772e-05, "loss": 0.9221618175506592, "step": 615 }, { "epoch": 0.6015625, "grad_norm": 0.2643633186817169, "learning_rate": 8.02747791952895e-05, "loss": 0.5025840401649475, "step": 616 }, { "epoch": 0.6025390625, "grad_norm": 0.45553916692733765, "learning_rate": 8.007850834151129e-05, "loss": 0.452494740486145, "step": 617 }, { "epoch": 0.603515625, "grad_norm": 0.386594295501709, "learning_rate": 7.988223748773308e-05, "loss": 0.7942792773246765, "step": 618 }, { "epoch": 0.6044921875, "grad_norm": 0.3616650700569153, "learning_rate": 7.968596663395485e-05, "loss": 0.5697340965270996, "step": 619 }, { "epoch": 0.60546875, "grad_norm": 0.3885051906108856, "learning_rate": 7.948969578017665e-05, "loss": 0.7082506418228149, "step": 620 }, { "epoch": 0.6064453125, "grad_norm": 0.4484117329120636, "learning_rate": 7.929342492639843e-05, "loss": 0.5993860960006714, "step": 621 }, { "epoch": 0.607421875, "grad_norm": 0.44654563069343567, "learning_rate": 7.909715407262021e-05, "loss": 0.5804839134216309, "step": 622 }, { "epoch": 0.6083984375, "grad_norm": 0.3943687081336975, "learning_rate": 7.890088321884201e-05, "loss": 0.6422688364982605, "step": 623 }, { "epoch": 0.609375, "grad_norm": 0.4153381288051605, "learning_rate": 7.870461236506379e-05, "loss": 0.6437400579452515, "step": 624 }, { "epoch": 0.6103515625, "grad_norm": 0.38221171498298645, "learning_rate": 7.850834151128557e-05, "loss": 0.8738820552825928, "step": 625 }, { "epoch": 0.611328125, "grad_norm": 0.339599609375, "learning_rate": 7.831207065750737e-05, "loss": 0.517478883266449, "step": 626 }, { "epoch": 0.6123046875, "grad_norm": 0.7177076935768127, "learning_rate": 7.811579980372915e-05, "loss": 0.7372115254402161, "step": 627 }, { "epoch": 0.61328125, "grad_norm": 0.47573140263557434, "learning_rate": 7.791952894995093e-05, "loss": 0.649010181427002, "step": 628 }, { "epoch": 0.6142578125, "grad_norm": 0.44851094484329224, "learning_rate": 7.772325809617273e-05, "loss": 0.6269842386245728, "step": 629 }, { "epoch": 0.615234375, "grad_norm": 0.3544669449329376, "learning_rate": 7.752698724239451e-05, "loss": 0.8870983123779297, "step": 630 }, { "epoch": 0.6162109375, "grad_norm": 0.4103491008281708, "learning_rate": 7.73307163886163e-05, "loss": 0.8711034059524536, "step": 631 }, { "epoch": 0.6171875, "grad_norm": 0.3651062548160553, "learning_rate": 7.713444553483808e-05, "loss": 0.8420337438583374, "step": 632 }, { "epoch": 0.6181640625, "grad_norm": 0.4135638475418091, "learning_rate": 7.693817468105987e-05, "loss": 0.601078450679779, "step": 633 }, { "epoch": 0.619140625, "grad_norm": 0.5965299010276794, "learning_rate": 7.674190382728164e-05, "loss": 0.604471743106842, "step": 634 }, { "epoch": 0.6201171875, "grad_norm": 0.4340416491031647, "learning_rate": 7.654563297350344e-05, "loss": 0.905183732509613, "step": 635 }, { "epoch": 0.62109375, "grad_norm": 0.361518919467926, "learning_rate": 7.634936211972522e-05, "loss": 0.6569675207138062, "step": 636 }, { "epoch": 0.6220703125, "grad_norm": 1.04604971408844, "learning_rate": 7.6153091265947e-05, "loss": 0.7399482727050781, "step": 637 }, { "epoch": 0.623046875, "grad_norm": 0.8039460778236389, "learning_rate": 7.59568204121688e-05, "loss": 0.6003617644309998, "step": 638 }, { "epoch": 0.6240234375, "grad_norm": 0.5462118983268738, "learning_rate": 7.576054955839058e-05, "loss": 0.7750217914581299, "step": 639 }, { "epoch": 0.625, "grad_norm": 0.29333505034446716, "learning_rate": 7.556427870461236e-05, "loss": 0.47371456027030945, "step": 640 }, { "epoch": 0.6259765625, "grad_norm": 0.2468312531709671, "learning_rate": 7.536800785083416e-05, "loss": 0.4615188241004944, "step": 641 }, { "epoch": 0.626953125, "grad_norm": 0.48467332124710083, "learning_rate": 7.517173699705594e-05, "loss": 0.6456693410873413, "step": 642 }, { "epoch": 0.6279296875, "grad_norm": 0.5471943020820618, "learning_rate": 7.497546614327772e-05, "loss": 0.5899155139923096, "step": 643 }, { "epoch": 0.62890625, "grad_norm": 0.3715604841709137, "learning_rate": 7.477919528949952e-05, "loss": 0.7910970449447632, "step": 644 }, { "epoch": 0.6298828125, "grad_norm": 0.3298327922821045, "learning_rate": 7.45829244357213e-05, "loss": 0.5769776701927185, "step": 645 }, { "epoch": 0.630859375, "grad_norm": 0.44131916761398315, "learning_rate": 7.438665358194309e-05, "loss": 0.8805806636810303, "step": 646 }, { "epoch": 0.6318359375, "grad_norm": 0.4686948359012604, "learning_rate": 7.419038272816488e-05, "loss": 0.7262091636657715, "step": 647 }, { "epoch": 0.6328125, "grad_norm": 0.48123931884765625, "learning_rate": 7.399411187438666e-05, "loss": 0.8481992483139038, "step": 648 }, { "epoch": 0.6337890625, "grad_norm": 0.5582646131515503, "learning_rate": 7.379784102060843e-05, "loss": 0.4963653087615967, "step": 649 }, { "epoch": 0.634765625, "grad_norm": 0.30464881658554077, "learning_rate": 7.360157016683023e-05, "loss": 0.6772556900978088, "step": 650 }, { "epoch": 0.6357421875, "grad_norm": 0.44710803031921387, "learning_rate": 7.340529931305201e-05, "loss": 0.5476983189582825, "step": 651 }, { "epoch": 0.63671875, "grad_norm": 0.35922887921333313, "learning_rate": 7.320902845927379e-05, "loss": 0.8256508111953735, "step": 652 }, { "epoch": 0.6376953125, "grad_norm": 0.40085500478744507, "learning_rate": 7.301275760549559e-05, "loss": 0.5783500671386719, "step": 653 }, { "epoch": 0.638671875, "grad_norm": 0.47579512000083923, "learning_rate": 7.281648675171737e-05, "loss": 0.5591031908988953, "step": 654 }, { "epoch": 0.6396484375, "grad_norm": 0.5594353675842285, "learning_rate": 7.262021589793915e-05, "loss": 0.8133666515350342, "step": 655 }, { "epoch": 0.640625, "grad_norm": 0.44030821323394775, "learning_rate": 7.242394504416095e-05, "loss": 1.0282940864562988, "step": 656 }, { "epoch": 0.6416015625, "grad_norm": 0.7038627862930298, "learning_rate": 7.222767419038273e-05, "loss": 0.2322971373796463, "step": 657 }, { "epoch": 0.642578125, "grad_norm": 0.223698228597641, "learning_rate": 7.203140333660451e-05, "loss": 0.7056642174720764, "step": 658 }, { "epoch": 0.6435546875, "grad_norm": 0.3815765976905823, "learning_rate": 7.183513248282631e-05, "loss": 1.074477195739746, "step": 659 }, { "epoch": 0.64453125, "grad_norm": 0.35606271028518677, "learning_rate": 7.163886162904809e-05, "loss": 0.4300801753997803, "step": 660 }, { "epoch": 0.6455078125, "grad_norm": 0.32899999618530273, "learning_rate": 7.144259077526988e-05, "loss": 0.5923078060150146, "step": 661 }, { "epoch": 0.646484375, "grad_norm": 0.49968358874320984, "learning_rate": 7.124631992149167e-05, "loss": 0.8295183181762695, "step": 662 }, { "epoch": 0.6474609375, "grad_norm": 0.3393777012825012, "learning_rate": 7.105004906771345e-05, "loss": 0.30383622646331787, "step": 663 }, { "epoch": 0.6484375, "grad_norm": 0.24977968633174896, "learning_rate": 7.085377821393524e-05, "loss": 0.429612934589386, "step": 664 }, { "epoch": 0.6494140625, "grad_norm": 0.35886242985725403, "learning_rate": 7.065750736015702e-05, "loss": 0.9189084768295288, "step": 665 }, { "epoch": 0.650390625, "grad_norm": 0.3856249153614044, "learning_rate": 7.04612365063788e-05, "loss": 0.4880048930644989, "step": 666 }, { "epoch": 0.6513671875, "grad_norm": 0.4439884424209595, "learning_rate": 7.026496565260058e-05, "loss": 0.7537186145782471, "step": 667 }, { "epoch": 0.65234375, "grad_norm": 0.29563215374946594, "learning_rate": 7.006869479882238e-05, "loss": 0.38701343536376953, "step": 668 }, { "epoch": 0.6533203125, "grad_norm": 0.1909576952457428, "learning_rate": 6.987242394504416e-05, "loss": 0.15140604972839355, "step": 669 }, { "epoch": 0.654296875, "grad_norm": 0.3344849944114685, "learning_rate": 6.967615309126594e-05, "loss": 0.527427077293396, "step": 670 }, { "epoch": 0.6552734375, "grad_norm": 0.3609422743320465, "learning_rate": 6.947988223748774e-05, "loss": 0.29116177558898926, "step": 671 }, { "epoch": 0.65625, "grad_norm": 0.4419811964035034, "learning_rate": 6.928361138370952e-05, "loss": 0.7166855931282043, "step": 672 }, { "epoch": 0.6572265625, "grad_norm": 0.31890806555747986, "learning_rate": 6.90873405299313e-05, "loss": 0.5259425640106201, "step": 673 }, { "epoch": 0.658203125, "grad_norm": 0.39572352170944214, "learning_rate": 6.88910696761531e-05, "loss": 0.5964791774749756, "step": 674 }, { "epoch": 0.6591796875, "grad_norm": 0.4501058757305145, "learning_rate": 6.869479882237488e-05, "loss": 0.2289922833442688, "step": 675 }, { "epoch": 0.66015625, "grad_norm": 0.2884235680103302, "learning_rate": 6.849852796859666e-05, "loss": 0.2730886936187744, "step": 676 }, { "epoch": 0.6611328125, "grad_norm": 0.32970431447029114, "learning_rate": 6.830225711481846e-05, "loss": 0.4283568859100342, "step": 677 }, { "epoch": 0.662109375, "grad_norm": 0.39025789499282837, "learning_rate": 6.810598626104023e-05, "loss": 0.9361288547515869, "step": 678 }, { "epoch": 0.6630859375, "grad_norm": 0.48386886715888977, "learning_rate": 6.790971540726203e-05, "loss": 0.4907494783401489, "step": 679 }, { "epoch": 0.6640625, "grad_norm": 0.41783151030540466, "learning_rate": 6.771344455348381e-05, "loss": 0.7485824823379517, "step": 680 }, { "epoch": 0.6650390625, "grad_norm": 0.4826144278049469, "learning_rate": 6.751717369970559e-05, "loss": 0.6413211226463318, "step": 681 }, { "epoch": 0.666015625, "grad_norm": 0.27521079778671265, "learning_rate": 6.732090284592739e-05, "loss": 0.5747159123420715, "step": 682 }, { "epoch": 0.6669921875, "grad_norm": 0.3745660185813904, "learning_rate": 6.712463199214917e-05, "loss": 0.414341002702713, "step": 683 }, { "epoch": 0.66796875, "grad_norm": 0.45048731565475464, "learning_rate": 6.692836113837095e-05, "loss": 0.3665570318698883, "step": 684 }, { "epoch": 0.6689453125, "grad_norm": 0.5048633217811584, "learning_rate": 6.673209028459275e-05, "loss": 0.5923498272895813, "step": 685 }, { "epoch": 0.669921875, "grad_norm": 0.46423155069351196, "learning_rate": 6.653581943081453e-05, "loss": 0.7506915330886841, "step": 686 }, { "epoch": 0.6708984375, "grad_norm": 0.42965108156204224, "learning_rate": 6.633954857703631e-05, "loss": 0.7576399445533752, "step": 687 }, { "epoch": 0.671875, "grad_norm": 0.48331597447395325, "learning_rate": 6.614327772325811e-05, "loss": 0.5249682068824768, "step": 688 }, { "epoch": 0.6728515625, "grad_norm": 0.4685790240764618, "learning_rate": 6.594700686947989e-05, "loss": 0.8056750297546387, "step": 689 }, { "epoch": 0.673828125, "grad_norm": 0.46440044045448303, "learning_rate": 6.575073601570167e-05, "loss": 0.9252493381500244, "step": 690 }, { "epoch": 0.6748046875, "grad_norm": 0.46564289927482605, "learning_rate": 6.555446516192347e-05, "loss": 0.8182022571563721, "step": 691 }, { "epoch": 0.67578125, "grad_norm": 0.4397750496864319, "learning_rate": 6.535819430814525e-05, "loss": 0.7928388118743896, "step": 692 }, { "epoch": 0.6767578125, "grad_norm": 0.3233174681663513, "learning_rate": 6.516192345436702e-05, "loss": 0.5252426862716675, "step": 693 }, { "epoch": 0.677734375, "grad_norm": 0.6012148857116699, "learning_rate": 6.496565260058882e-05, "loss": 0.44195663928985596, "step": 694 }, { "epoch": 0.6787109375, "grad_norm": 0.6329052448272705, "learning_rate": 6.47693817468106e-05, "loss": 0.5354570150375366, "step": 695 }, { "epoch": 0.6796875, "grad_norm": 0.47926270961761475, "learning_rate": 6.457311089303238e-05, "loss": 0.4950491786003113, "step": 696 }, { "epoch": 0.6806640625, "grad_norm": 0.5051383972167969, "learning_rate": 6.437684003925418e-05, "loss": 0.6795849204063416, "step": 697 }, { "epoch": 0.681640625, "grad_norm": 0.4022398591041565, "learning_rate": 6.418056918547596e-05, "loss": 1.0388166904449463, "step": 698 }, { "epoch": 0.6826171875, "grad_norm": 0.4309573471546173, "learning_rate": 6.398429833169774e-05, "loss": 0.6022897362709045, "step": 699 }, { "epoch": 0.68359375, "grad_norm": 0.3301983177661896, "learning_rate": 6.378802747791954e-05, "loss": 0.6451660394668579, "step": 700 }, { "epoch": 0.6845703125, "grad_norm": 0.6647156476974487, "learning_rate": 6.359175662414132e-05, "loss": 0.9699732661247253, "step": 701 }, { "epoch": 0.685546875, "grad_norm": 0.37545597553253174, "learning_rate": 6.33954857703631e-05, "loss": 0.43181508779525757, "step": 702 }, { "epoch": 0.6865234375, "grad_norm": 0.40882429480552673, "learning_rate": 6.31992149165849e-05, "loss": 0.665264368057251, "step": 703 }, { "epoch": 0.6875, "grad_norm": 0.46597936749458313, "learning_rate": 6.300294406280668e-05, "loss": 0.8813620209693909, "step": 704 }, { "epoch": 0.6884765625, "grad_norm": 0.4355461597442627, "learning_rate": 6.280667320902846e-05, "loss": 0.595770537853241, "step": 705 }, { "epoch": 0.689453125, "grad_norm": 0.45896056294441223, "learning_rate": 6.261040235525026e-05, "loss": 0.7571601271629333, "step": 706 }, { "epoch": 0.6904296875, "grad_norm": 0.37643495202064514, "learning_rate": 6.241413150147204e-05, "loss": 0.47930869460105896, "step": 707 }, { "epoch": 0.69140625, "grad_norm": 0.49690738320350647, "learning_rate": 6.221786064769381e-05, "loss": 0.3727263808250427, "step": 708 }, { "epoch": 0.6923828125, "grad_norm": 0.44111907482147217, "learning_rate": 6.20215897939156e-05, "loss": 0.7276532649993896, "step": 709 }, { "epoch": 0.693359375, "grad_norm": 0.44872644543647766, "learning_rate": 6.182531894013739e-05, "loss": 0.5082123279571533, "step": 710 }, { "epoch": 0.6943359375, "grad_norm": 0.3345314562320709, "learning_rate": 6.162904808635917e-05, "loss": 0.5472716093063354, "step": 711 }, { "epoch": 0.6953125, "grad_norm": 0.4269154667854309, "learning_rate": 6.143277723258097e-05, "loss": 0.7036910057067871, "step": 712 }, { "epoch": 0.6962890625, "grad_norm": 0.5314676761627197, "learning_rate": 6.123650637880275e-05, "loss": 0.8663474917411804, "step": 713 }, { "epoch": 0.697265625, "grad_norm": 0.2820166349411011, "learning_rate": 6.104023552502453e-05, "loss": 0.6397068500518799, "step": 714 }, { "epoch": 0.6982421875, "grad_norm": 0.40954726934432983, "learning_rate": 6.084396467124632e-05, "loss": 0.5477964282035828, "step": 715 }, { "epoch": 0.69921875, "grad_norm": 0.6858615279197693, "learning_rate": 6.064769381746811e-05, "loss": 0.694764256477356, "step": 716 }, { "epoch": 0.7001953125, "grad_norm": 2.901998281478882, "learning_rate": 6.04514229636899e-05, "loss": 0.5803335309028625, "step": 717 }, { "epoch": 0.701171875, "grad_norm": 0.6065869927406311, "learning_rate": 6.025515210991168e-05, "loss": 0.49790292978286743, "step": 718 }, { "epoch": 0.7021484375, "grad_norm": 0.3678690195083618, "learning_rate": 6.005888125613347e-05, "loss": 0.38595882058143616, "step": 719 }, { "epoch": 0.703125, "grad_norm": 0.32496991753578186, "learning_rate": 5.986261040235526e-05, "loss": 0.3554360866546631, "step": 720 }, { "epoch": 0.7041015625, "grad_norm": 0.5348960161209106, "learning_rate": 5.966633954857704e-05, "loss": 1.0386948585510254, "step": 721 }, { "epoch": 0.705078125, "grad_norm": 0.42248818278312683, "learning_rate": 5.947006869479883e-05, "loss": 0.4950508177280426, "step": 722 }, { "epoch": 0.7060546875, "grad_norm": 0.36575669050216675, "learning_rate": 5.9273797841020606e-05, "loss": 0.8793643712997437, "step": 723 }, { "epoch": 0.70703125, "grad_norm": 0.30802977085113525, "learning_rate": 5.9077526987242395e-05, "loss": 0.7557331919670105, "step": 724 }, { "epoch": 0.7080078125, "grad_norm": 0.36057788133621216, "learning_rate": 5.888125613346418e-05, "loss": 0.793386697769165, "step": 725 }, { "epoch": 0.708984375, "grad_norm": 0.5049283504486084, "learning_rate": 5.8684985279685966e-05, "loss": 0.3805343210697174, "step": 726 }, { "epoch": 0.7099609375, "grad_norm": 0.4448167681694031, "learning_rate": 5.8488714425907756e-05, "loss": 0.8297110199928284, "step": 727 }, { "epoch": 0.7109375, "grad_norm": 0.5144803524017334, "learning_rate": 5.829244357212954e-05, "loss": 0.8582932949066162, "step": 728 }, { "epoch": 0.7119140625, "grad_norm": 0.48559248447418213, "learning_rate": 5.809617271835133e-05, "loss": 0.851997971534729, "step": 729 }, { "epoch": 0.712890625, "grad_norm": 0.5277959704399109, "learning_rate": 5.7899901864573116e-05, "loss": 0.8560271859169006, "step": 730 }, { "epoch": 0.7138671875, "grad_norm": 0.39055025577545166, "learning_rate": 5.77036310107949e-05, "loss": 0.5023626685142517, "step": 731 }, { "epoch": 0.71484375, "grad_norm": 0.4014328718185425, "learning_rate": 5.750736015701669e-05, "loss": 0.7782986760139465, "step": 732 }, { "epoch": 0.7158203125, "grad_norm": 0.9840988516807556, "learning_rate": 5.731108930323848e-05, "loss": 0.5097107887268066, "step": 733 }, { "epoch": 0.716796875, "grad_norm": 0.512140691280365, "learning_rate": 5.711481844946026e-05, "loss": 0.5448895692825317, "step": 734 }, { "epoch": 0.7177734375, "grad_norm": 0.45195046067237854, "learning_rate": 5.691854759568205e-05, "loss": 0.7583330273628235, "step": 735 }, { "epoch": 0.71875, "grad_norm": 0.4155009090900421, "learning_rate": 5.672227674190384e-05, "loss": 0.5220797061920166, "step": 736 }, { "epoch": 0.7197265625, "grad_norm": 0.552148699760437, "learning_rate": 5.652600588812562e-05, "loss": 0.8043540716171265, "step": 737 }, { "epoch": 0.720703125, "grad_norm": 0.30510297417640686, "learning_rate": 5.6329735034347396e-05, "loss": 0.5110808610916138, "step": 738 }, { "epoch": 0.7216796875, "grad_norm": 0.522339940071106, "learning_rate": 5.6133464180569185e-05, "loss": 1.0245096683502197, "step": 739 }, { "epoch": 0.72265625, "grad_norm": 0.27751341462135315, "learning_rate": 5.5937193326790974e-05, "loss": 0.6376601457595825, "step": 740 }, { "epoch": 0.7236328125, "grad_norm": 0.4283340573310852, "learning_rate": 5.5740922473012756e-05, "loss": 1.1317777633666992, "step": 741 }, { "epoch": 0.724609375, "grad_norm": 0.541248619556427, "learning_rate": 5.5544651619234545e-05, "loss": 0.8086187839508057, "step": 742 }, { "epoch": 0.7255859375, "grad_norm": 0.24750906229019165, "learning_rate": 5.5348380765456335e-05, "loss": 0.4873177409172058, "step": 743 }, { "epoch": 0.7265625, "grad_norm": 0.42374616861343384, "learning_rate": 5.515210991167812e-05, "loss": 0.41606956720352173, "step": 744 }, { "epoch": 0.7275390625, "grad_norm": 0.35455161333084106, "learning_rate": 5.4955839057899906e-05, "loss": 0.49936947226524353, "step": 745 }, { "epoch": 0.728515625, "grad_norm": 0.4243617653846741, "learning_rate": 5.475956820412169e-05, "loss": 0.6650359630584717, "step": 746 }, { "epoch": 0.7294921875, "grad_norm": 0.4106060862541199, "learning_rate": 5.456329735034348e-05, "loss": 0.37870654463768005, "step": 747 }, { "epoch": 0.73046875, "grad_norm": 0.3536394536495209, "learning_rate": 5.436702649656527e-05, "loss": 1.0944924354553223, "step": 748 }, { "epoch": 0.7314453125, "grad_norm": 0.3067559003829956, "learning_rate": 5.417075564278705e-05, "loss": 0.6380996704101562, "step": 749 }, { "epoch": 0.732421875, "grad_norm": 0.40423691272735596, "learning_rate": 5.397448478900884e-05, "loss": 0.712358295917511, "step": 750 }, { "epoch": 0.7333984375, "grad_norm": 0.451038658618927, "learning_rate": 5.377821393523063e-05, "loss": 0.6221305727958679, "step": 751 }, { "epoch": 0.734375, "grad_norm": 0.32606229186058044, "learning_rate": 5.35819430814524e-05, "loss": 0.6600078344345093, "step": 752 }, { "epoch": 0.7353515625, "grad_norm": 0.746896505355835, "learning_rate": 5.3385672227674185e-05, "loss": 0.5533967614173889, "step": 753 }, { "epoch": 0.736328125, "grad_norm": 0.403277724981308, "learning_rate": 5.3189401373895974e-05, "loss": 0.7483388185501099, "step": 754 }, { "epoch": 0.7373046875, "grad_norm": 0.6016709208488464, "learning_rate": 5.2993130520117764e-05, "loss": 0.539909839630127, "step": 755 }, { "epoch": 0.73828125, "grad_norm": 0.39885231852531433, "learning_rate": 5.2796859666339546e-05, "loss": 0.7900533676147461, "step": 756 }, { "epoch": 0.7392578125, "grad_norm": 0.3245362639427185, "learning_rate": 5.2600588812561335e-05, "loss": 0.42862433195114136, "step": 757 }, { "epoch": 0.740234375, "grad_norm": 0.47334104776382446, "learning_rate": 5.2404317958783124e-05, "loss": 0.3249909281730652, "step": 758 }, { "epoch": 0.7412109375, "grad_norm": 0.3029737174510956, "learning_rate": 5.220804710500491e-05, "loss": 0.4264957308769226, "step": 759 }, { "epoch": 0.7421875, "grad_norm": 0.33878564834594727, "learning_rate": 5.2011776251226696e-05, "loss": 0.4446904957294464, "step": 760 }, { "epoch": 0.7431640625, "grad_norm": 0.3307798206806183, "learning_rate": 5.1815505397448485e-05, "loss": 0.461605966091156, "step": 761 }, { "epoch": 0.744140625, "grad_norm": 0.4146850109100342, "learning_rate": 5.161923454367027e-05, "loss": 0.758568525314331, "step": 762 }, { "epoch": 0.7451171875, "grad_norm": 0.3531327545642853, "learning_rate": 5.1422963689892056e-05, "loss": 0.4580535292625427, "step": 763 }, { "epoch": 0.74609375, "grad_norm": 0.3952695429325104, "learning_rate": 5.1226692836113846e-05, "loss": 0.333244651556015, "step": 764 }, { "epoch": 0.7470703125, "grad_norm": 0.5774162411689758, "learning_rate": 5.103042198233563e-05, "loss": 0.6433362364768982, "step": 765 }, { "epoch": 0.748046875, "grad_norm": 0.49668964743614197, "learning_rate": 5.083415112855742e-05, "loss": 0.8478100895881653, "step": 766 }, { "epoch": 0.7490234375, "grad_norm": 0.3303810954093933, "learning_rate": 5.063788027477919e-05, "loss": 0.7296837568283081, "step": 767 }, { "epoch": 0.75, "grad_norm": 0.27652832865715027, "learning_rate": 5.044160942100098e-05, "loss": 0.6442312598228455, "step": 768 }, { "epoch": 0.7509765625, "grad_norm": 1.0828924179077148, "learning_rate": 5.0245338567222764e-05, "loss": 0.9848635196685791, "step": 769 }, { "epoch": 0.751953125, "grad_norm": 0.38959333300590515, "learning_rate": 5.0049067713444553e-05, "loss": 0.722776472568512, "step": 770 }, { "epoch": 0.7529296875, "grad_norm": 0.3470323383808136, "learning_rate": 4.985279685966634e-05, "loss": 0.6584157943725586, "step": 771 }, { "epoch": 0.75390625, "grad_norm": 0.4060254693031311, "learning_rate": 4.9656526005888125e-05, "loss": 0.6276923418045044, "step": 772 }, { "epoch": 0.7548828125, "grad_norm": 0.34566962718963623, "learning_rate": 4.9460255152109914e-05, "loss": 0.972516655921936, "step": 773 }, { "epoch": 0.755859375, "grad_norm": 0.41829708218574524, "learning_rate": 4.92639842983317e-05, "loss": 0.6937177181243896, "step": 774 }, { "epoch": 0.7568359375, "grad_norm": 0.7653974294662476, "learning_rate": 4.9067713444553486e-05, "loss": 0.6027823090553284, "step": 775 }, { "epoch": 0.7578125, "grad_norm": 1.0477155447006226, "learning_rate": 4.8871442590775275e-05, "loss": 0.925806999206543, "step": 776 }, { "epoch": 0.7587890625, "grad_norm": 0.43484824895858765, "learning_rate": 4.8675171736997064e-05, "loss": 0.7783142328262329, "step": 777 }, { "epoch": 0.759765625, "grad_norm": 0.33719849586486816, "learning_rate": 4.847890088321884e-05, "loss": 0.6108527779579163, "step": 778 }, { "epoch": 0.7607421875, "grad_norm": 0.3983028531074524, "learning_rate": 4.828263002944063e-05, "loss": 0.9976012706756592, "step": 779 }, { "epoch": 0.76171875, "grad_norm": 0.3278787136077881, "learning_rate": 4.808635917566242e-05, "loss": 0.5754845142364502, "step": 780 }, { "epoch": 0.7626953125, "grad_norm": 0.42433467507362366, "learning_rate": 4.78900883218842e-05, "loss": 0.8455826640129089, "step": 781 }, { "epoch": 0.763671875, "grad_norm": 0.33245334029197693, "learning_rate": 4.769381746810599e-05, "loss": 0.5207083225250244, "step": 782 }, { "epoch": 0.7646484375, "grad_norm": 0.4390372931957245, "learning_rate": 4.749754661432778e-05, "loss": 0.7208432555198669, "step": 783 }, { "epoch": 0.765625, "grad_norm": 0.325720876455307, "learning_rate": 4.730127576054956e-05, "loss": 0.3017955422401428, "step": 784 }, { "epoch": 0.7666015625, "grad_norm": 0.3036203980445862, "learning_rate": 4.710500490677135e-05, "loss": 0.47869423031806946, "step": 785 }, { "epoch": 0.767578125, "grad_norm": 0.4316065013408661, "learning_rate": 4.690873405299313e-05, "loss": 0.7984920740127563, "step": 786 }, { "epoch": 0.7685546875, "grad_norm": 0.46907728910446167, "learning_rate": 4.6712463199214915e-05, "loss": 0.7288491725921631, "step": 787 }, { "epoch": 0.76953125, "grad_norm": 0.38269418478012085, "learning_rate": 4.6516192345436704e-05, "loss": 0.46745771169662476, "step": 788 }, { "epoch": 0.7705078125, "grad_norm": 0.6045718193054199, "learning_rate": 4.631992149165849e-05, "loss": 0.5405256152153015, "step": 789 }, { "epoch": 0.771484375, "grad_norm": 0.3303053677082062, "learning_rate": 4.6123650637880275e-05, "loss": 0.6721948981285095, "step": 790 }, { "epoch": 0.7724609375, "grad_norm": 0.42014074325561523, "learning_rate": 4.5927379784102065e-05, "loss": 0.9322581887245178, "step": 791 }, { "epoch": 0.7734375, "grad_norm": 0.3720149099826813, "learning_rate": 4.5731108930323854e-05, "loss": 0.7807843685150146, "step": 792 }, { "epoch": 0.7744140625, "grad_norm": 0.31559938192367554, "learning_rate": 4.5534838076545636e-05, "loss": 0.8503724336624146, "step": 793 }, { "epoch": 0.775390625, "grad_norm": 0.4096013903617859, "learning_rate": 4.533856722276742e-05, "loss": 0.6950633525848389, "step": 794 }, { "epoch": 0.7763671875, "grad_norm": 0.3791837990283966, "learning_rate": 4.514229636898921e-05, "loss": 0.7583197951316833, "step": 795 }, { "epoch": 0.77734375, "grad_norm": 0.5274584889411926, "learning_rate": 4.494602551521099e-05, "loss": 0.4712093770503998, "step": 796 }, { "epoch": 0.7783203125, "grad_norm": 0.29654791951179504, "learning_rate": 4.474975466143278e-05, "loss": 0.552979588508606, "step": 797 }, { "epoch": 0.779296875, "grad_norm": 0.25629475712776184, "learning_rate": 4.455348380765457e-05, "loss": 0.5225521922111511, "step": 798 }, { "epoch": 0.7802734375, "grad_norm": 0.2676495611667633, "learning_rate": 4.435721295387635e-05, "loss": 0.4382556080818176, "step": 799 }, { "epoch": 0.78125, "grad_norm": 0.4117366075515747, "learning_rate": 4.416094210009813e-05, "loss": 0.5639417767524719, "step": 800 }, { "epoch": 0.7822265625, "grad_norm": 0.26305386424064636, "learning_rate": 4.396467124631992e-05, "loss": 0.28840768337249756, "step": 801 }, { "epoch": 0.783203125, "grad_norm": 0.7253789305686951, "learning_rate": 4.376840039254171e-05, "loss": 0.4104336202144623, "step": 802 }, { "epoch": 0.7841796875, "grad_norm": 0.371288001537323, "learning_rate": 4.3572129538763494e-05, "loss": 0.609147310256958, "step": 803 }, { "epoch": 0.78515625, "grad_norm": 0.634273111820221, "learning_rate": 4.337585868498528e-05, "loss": 0.5141665935516357, "step": 804 }, { "epoch": 0.7861328125, "grad_norm": 0.4442044496536255, "learning_rate": 4.317958783120707e-05, "loss": 0.4882044494152069, "step": 805 }, { "epoch": 0.787109375, "grad_norm": 0.3099007308483124, "learning_rate": 4.2983316977428854e-05, "loss": 0.3148588538169861, "step": 806 }, { "epoch": 0.7880859375, "grad_norm": 0.41893890500068665, "learning_rate": 4.2787046123650643e-05, "loss": 0.6678078174591064, "step": 807 }, { "epoch": 0.7890625, "grad_norm": 0.47682809829711914, "learning_rate": 4.2590775269872426e-05, "loss": 0.46614763140678406, "step": 808 }, { "epoch": 0.7900390625, "grad_norm": 0.25193366408348083, "learning_rate": 4.239450441609421e-05, "loss": 0.3707652986049652, "step": 809 }, { "epoch": 0.791015625, "grad_norm": 0.3425232768058777, "learning_rate": 4.2198233562316e-05, "loss": 0.604179859161377, "step": 810 }, { "epoch": 0.7919921875, "grad_norm": 0.31459808349609375, "learning_rate": 4.2001962708537786e-05, "loss": 0.748989999294281, "step": 811 }, { "epoch": 0.79296875, "grad_norm": 0.3478514850139618, "learning_rate": 4.180569185475957e-05, "loss": 0.6651142835617065, "step": 812 }, { "epoch": 0.7939453125, "grad_norm": 0.3951675295829773, "learning_rate": 4.160942100098136e-05, "loss": 0.7293418049812317, "step": 813 }, { "epoch": 0.794921875, "grad_norm": 0.26888158917427063, "learning_rate": 4.141315014720315e-05, "loss": 0.2181730419397354, "step": 814 }, { "epoch": 0.7958984375, "grad_norm": 0.17496585845947266, "learning_rate": 4.121687929342492e-05, "loss": 0.18257993459701538, "step": 815 }, { "epoch": 0.796875, "grad_norm": 0.3386918306350708, "learning_rate": 4.102060843964671e-05, "loss": 0.43010956048965454, "step": 816 }, { "epoch": 0.7978515625, "grad_norm": 0.5185137987136841, "learning_rate": 4.08243375858685e-05, "loss": 0.9117882251739502, "step": 817 }, { "epoch": 0.798828125, "grad_norm": 0.499529093503952, "learning_rate": 4.0628066732090283e-05, "loss": 0.8601939678192139, "step": 818 }, { "epoch": 0.7998046875, "grad_norm": 0.44401317834854126, "learning_rate": 4.043179587831207e-05, "loss": 0.8643960356712341, "step": 819 }, { "epoch": 0.80078125, "grad_norm": 0.30553653836250305, "learning_rate": 4.023552502453386e-05, "loss": 0.7741817235946655, "step": 820 }, { "epoch": 0.8017578125, "grad_norm": 0.443541944026947, "learning_rate": 4.0039254170755644e-05, "loss": 0.9571224451065063, "step": 821 }, { "epoch": 0.802734375, "grad_norm": 0.2611587643623352, "learning_rate": 3.9842983316977426e-05, "loss": 0.4755222201347351, "step": 822 }, { "epoch": 0.8037109375, "grad_norm": 0.38695722818374634, "learning_rate": 3.9646712463199216e-05, "loss": 0.9597996473312378, "step": 823 }, { "epoch": 0.8046875, "grad_norm": 0.505346953868866, "learning_rate": 3.9450441609421005e-05, "loss": 0.328266441822052, "step": 824 }, { "epoch": 0.8056640625, "grad_norm": 0.38910478353500366, "learning_rate": 3.925417075564279e-05, "loss": 0.4758382737636566, "step": 825 }, { "epoch": 0.806640625, "grad_norm": 0.4268342852592468, "learning_rate": 3.9057899901864576e-05, "loss": 0.6131553649902344, "step": 826 }, { "epoch": 0.8076171875, "grad_norm": 0.32205328345298767, "learning_rate": 3.8861629048086365e-05, "loss": 0.6047544479370117, "step": 827 }, { "epoch": 0.80859375, "grad_norm": 0.6975948214530945, "learning_rate": 3.866535819430815e-05, "loss": 0.7599061727523804, "step": 828 }, { "epoch": 0.8095703125, "grad_norm": 0.20186780393123627, "learning_rate": 3.846908734052994e-05, "loss": 0.3639545738697052, "step": 829 }, { "epoch": 0.810546875, "grad_norm": 0.443435937166214, "learning_rate": 3.827281648675172e-05, "loss": 0.6933274269104004, "step": 830 }, { "epoch": 0.8115234375, "grad_norm": 0.44157811999320984, "learning_rate": 3.80765456329735e-05, "loss": 0.5135524272918701, "step": 831 }, { "epoch": 0.8125, "grad_norm": 0.3959600031375885, "learning_rate": 3.788027477919529e-05, "loss": 0.6713152527809143, "step": 832 }, { "epoch": 0.8134765625, "grad_norm": 0.5439519882202148, "learning_rate": 3.768400392541708e-05, "loss": 0.3603706359863281, "step": 833 }, { "epoch": 0.814453125, "grad_norm": 0.36693719029426575, "learning_rate": 3.748773307163886e-05, "loss": 0.8574247360229492, "step": 834 }, { "epoch": 0.8154296875, "grad_norm": 0.3476804792881012, "learning_rate": 3.729146221786065e-05, "loss": 0.6845530867576599, "step": 835 }, { "epoch": 0.81640625, "grad_norm": 0.48850229382514954, "learning_rate": 3.709519136408244e-05, "loss": 0.788569450378418, "step": 836 }, { "epoch": 0.8173828125, "grad_norm": 0.5997111797332764, "learning_rate": 3.6898920510304216e-05, "loss": 0.5885312557220459, "step": 837 }, { "epoch": 0.818359375, "grad_norm": 0.43312472105026245, "learning_rate": 3.6702649656526005e-05, "loss": 0.5300126075744629, "step": 838 }, { "epoch": 0.8193359375, "grad_norm": 0.6505857110023499, "learning_rate": 3.6506378802747795e-05, "loss": 0.7164736986160278, "step": 839 }, { "epoch": 0.8203125, "grad_norm": 0.34061765670776367, "learning_rate": 3.631010794896958e-05, "loss": 0.5405696034431458, "step": 840 }, { "epoch": 0.8212890625, "grad_norm": 0.4188057780265808, "learning_rate": 3.6113837095191366e-05, "loss": 1.0057684183120728, "step": 841 }, { "epoch": 0.822265625, "grad_norm": 0.392007052898407, "learning_rate": 3.5917566241413155e-05, "loss": 0.6687936782836914, "step": 842 }, { "epoch": 0.8232421875, "grad_norm": 0.44254210591316223, "learning_rate": 3.572129538763494e-05, "loss": 0.39150726795196533, "step": 843 }, { "epoch": 0.82421875, "grad_norm": 0.41756534576416016, "learning_rate": 3.552502453385673e-05, "loss": 0.764665961265564, "step": 844 }, { "epoch": 0.8251953125, "grad_norm": 0.9839560985565186, "learning_rate": 3.532875368007851e-05, "loss": 0.45259296894073486, "step": 845 }, { "epoch": 0.826171875, "grad_norm": 0.3465111553668976, "learning_rate": 3.513248282630029e-05, "loss": 0.5895928740501404, "step": 846 }, { "epoch": 0.8271484375, "grad_norm": 0.4883447289466858, "learning_rate": 3.493621197252208e-05, "loss": 0.8401346802711487, "step": 847 }, { "epoch": 0.828125, "grad_norm": 0.3590312898159027, "learning_rate": 3.473994111874387e-05, "loss": 0.6134470105171204, "step": 848 }, { "epoch": 0.8291015625, "grad_norm": 0.48273324966430664, "learning_rate": 3.454367026496565e-05, "loss": 0.6351644992828369, "step": 849 }, { "epoch": 0.830078125, "grad_norm": 0.32156500220298767, "learning_rate": 3.434739941118744e-05, "loss": 0.5098355412483215, "step": 850 }, { "epoch": 0.8310546875, "grad_norm": 0.38239747285842896, "learning_rate": 3.415112855740923e-05, "loss": 1.0178660154342651, "step": 851 }, { "epoch": 0.83203125, "grad_norm": 0.6875290274620056, "learning_rate": 3.395485770363101e-05, "loss": 0.4496825337409973, "step": 852 }, { "epoch": 0.8330078125, "grad_norm": 0.27034860849380493, "learning_rate": 3.3758586849852795e-05, "loss": 0.41253381967544556, "step": 853 }, { "epoch": 0.833984375, "grad_norm": 0.5166223049163818, "learning_rate": 3.3562315996074584e-05, "loss": 0.7344639897346497, "step": 854 }, { "epoch": 0.8349609375, "grad_norm": 0.39597758650779724, "learning_rate": 3.3366045142296373e-05, "loss": 0.6066821217536926, "step": 855 }, { "epoch": 0.8359375, "grad_norm": 0.44033098220825195, "learning_rate": 3.3169774288518156e-05, "loss": 0.7928174734115601, "step": 856 }, { "epoch": 0.8369140625, "grad_norm": 0.3340597450733185, "learning_rate": 3.2973503434739945e-05, "loss": 0.4783233404159546, "step": 857 }, { "epoch": 0.837890625, "grad_norm": 0.5634653568267822, "learning_rate": 3.2777232580961734e-05, "loss": 0.785845935344696, "step": 858 }, { "epoch": 0.8388671875, "grad_norm": 0.24581296741962433, "learning_rate": 3.258096172718351e-05, "loss": 0.36480462551116943, "step": 859 }, { "epoch": 0.83984375, "grad_norm": 0.316773384809494, "learning_rate": 3.23846908734053e-05, "loss": 0.886894941329956, "step": 860 }, { "epoch": 0.8408203125, "grad_norm": 0.4605409502983093, "learning_rate": 3.218842001962709e-05, "loss": 0.7125131487846375, "step": 861 }, { "epoch": 0.841796875, "grad_norm": 0.5473557114601135, "learning_rate": 3.199214916584887e-05, "loss": 0.45582157373428345, "step": 862 }, { "epoch": 0.8427734375, "grad_norm": 0.4604926109313965, "learning_rate": 3.179587831207066e-05, "loss": 0.5392733812332153, "step": 863 }, { "epoch": 0.84375, "grad_norm": 0.3192322552204132, "learning_rate": 3.159960745829245e-05, "loss": 0.3216538727283478, "step": 864 }, { "epoch": 0.8447265625, "grad_norm": 0.4225713610649109, "learning_rate": 3.140333660451423e-05, "loss": 0.36403900384902954, "step": 865 }, { "epoch": 0.845703125, "grad_norm": 0.7738484740257263, "learning_rate": 3.120706575073602e-05, "loss": 0.5428112149238586, "step": 866 }, { "epoch": 0.8466796875, "grad_norm": 0.7795976400375366, "learning_rate": 3.10107948969578e-05, "loss": 0.838668704032898, "step": 867 }, { "epoch": 0.84765625, "grad_norm": 0.4240044355392456, "learning_rate": 3.0814524043179585e-05, "loss": 0.5039677023887634, "step": 868 }, { "epoch": 0.8486328125, "grad_norm": 0.7870606780052185, "learning_rate": 3.0618253189401374e-05, "loss": 0.2639703154563904, "step": 869 }, { "epoch": 0.849609375, "grad_norm": 4.898192405700684, "learning_rate": 3.042198233562316e-05, "loss": 0.9641809463500977, "step": 870 }, { "epoch": 0.8505859375, "grad_norm": 0.4090663194656372, "learning_rate": 3.022571148184495e-05, "loss": 0.5249053835868835, "step": 871 }, { "epoch": 0.8515625, "grad_norm": 0.5761129856109619, "learning_rate": 3.0029440628066735e-05, "loss": 0.8987921476364136, "step": 872 }, { "epoch": 0.8525390625, "grad_norm": 0.2440023124217987, "learning_rate": 2.983316977428852e-05, "loss": 0.3279159367084503, "step": 873 }, { "epoch": 0.853515625, "grad_norm": 0.438519150018692, "learning_rate": 2.9636898920510303e-05, "loss": 0.8272308111190796, "step": 874 }, { "epoch": 0.8544921875, "grad_norm": 0.4011988639831543, "learning_rate": 2.944062806673209e-05, "loss": 0.3140803873538971, "step": 875 }, { "epoch": 0.85546875, "grad_norm": 0.5748201012611389, "learning_rate": 2.9244357212953878e-05, "loss": 0.6699116230010986, "step": 876 }, { "epoch": 0.8564453125, "grad_norm": 0.3001462519168854, "learning_rate": 2.9048086359175664e-05, "loss": 0.19382989406585693, "step": 877 }, { "epoch": 0.857421875, "grad_norm": 0.40844887495040894, "learning_rate": 2.885181550539745e-05, "loss": 0.6494845747947693, "step": 878 }, { "epoch": 0.8583984375, "grad_norm": 0.3480914235115051, "learning_rate": 2.865554465161924e-05, "loss": 0.5555131435394287, "step": 879 }, { "epoch": 0.859375, "grad_norm": 0.3903101682662964, "learning_rate": 2.8459273797841024e-05, "loss": 0.6830955147743225, "step": 880 }, { "epoch": 0.8603515625, "grad_norm": 0.3058629333972931, "learning_rate": 2.826300294406281e-05, "loss": 0.3747236728668213, "step": 881 }, { "epoch": 0.861328125, "grad_norm": 0.49275287985801697, "learning_rate": 2.8066732090284592e-05, "loss": 1.0192487239837646, "step": 882 }, { "epoch": 0.8623046875, "grad_norm": 0.4016769826412201, "learning_rate": 2.7870461236506378e-05, "loss": 0.4012300372123718, "step": 883 }, { "epoch": 0.86328125, "grad_norm": 0.4790811240673065, "learning_rate": 2.7674190382728167e-05, "loss": 0.6936056613922119, "step": 884 }, { "epoch": 0.8642578125, "grad_norm": 0.39931413531303406, "learning_rate": 2.7477919528949953e-05, "loss": 0.3612633943557739, "step": 885 }, { "epoch": 0.865234375, "grad_norm": 0.3250795006752014, "learning_rate": 2.728164867517174e-05, "loss": 0.5146504640579224, "step": 886 }, { "epoch": 0.8662109375, "grad_norm": 0.5216737985610962, "learning_rate": 2.7085377821393525e-05, "loss": 0.6185201406478882, "step": 887 }, { "epoch": 0.8671875, "grad_norm": 0.5681923031806946, "learning_rate": 2.6889106967615314e-05, "loss": 0.9492973685264587, "step": 888 }, { "epoch": 0.8681640625, "grad_norm": 0.5284391045570374, "learning_rate": 2.6692836113837093e-05, "loss": 0.7801765203475952, "step": 889 }, { "epoch": 0.869140625, "grad_norm": 0.42510825395584106, "learning_rate": 2.6496565260058882e-05, "loss": 0.4871942102909088, "step": 890 }, { "epoch": 0.8701171875, "grad_norm": 0.39092326164245605, "learning_rate": 2.6300294406280668e-05, "loss": 0.5123960375785828, "step": 891 }, { "epoch": 0.87109375, "grad_norm": 0.37694281339645386, "learning_rate": 2.6104023552502453e-05, "loss": 0.3543451428413391, "step": 892 }, { "epoch": 0.8720703125, "grad_norm": 0.26519376039505005, "learning_rate": 2.5907752698724242e-05, "loss": 0.2388455718755722, "step": 893 }, { "epoch": 0.873046875, "grad_norm": 0.6303861141204834, "learning_rate": 2.5711481844946028e-05, "loss": 0.7195224761962891, "step": 894 }, { "epoch": 0.8740234375, "grad_norm": 0.4436159133911133, "learning_rate": 2.5515210991167814e-05, "loss": 0.8888048529624939, "step": 895 }, { "epoch": 0.875, "grad_norm": 0.6473313570022583, "learning_rate": 2.5318940137389596e-05, "loss": 0.8557075262069702, "step": 896 }, { "epoch": 0.8759765625, "grad_norm": 0.6625436544418335, "learning_rate": 2.5122669283611382e-05, "loss": 0.7132158279418945, "step": 897 }, { "epoch": 0.876953125, "grad_norm": 0.7241202592849731, "learning_rate": 2.492639842983317e-05, "loss": 0.9367854595184326, "step": 898 }, { "epoch": 0.8779296875, "grad_norm": 0.5321157574653625, "learning_rate": 2.4730127576054957e-05, "loss": 1.0013937950134277, "step": 899 }, { "epoch": 0.87890625, "grad_norm": 0.3287423253059387, "learning_rate": 2.4533856722276743e-05, "loss": 0.4560258984565735, "step": 900 }, { "epoch": 0.8798828125, "grad_norm": 0.5040727257728577, "learning_rate": 2.4337585868498532e-05, "loss": 0.5655212998390198, "step": 901 }, { "epoch": 0.880859375, "grad_norm": 0.4150228202342987, "learning_rate": 2.4141315014720314e-05, "loss": 0.43106216192245483, "step": 902 }, { "epoch": 0.8818359375, "grad_norm": 0.4006192684173584, "learning_rate": 2.39450441609421e-05, "loss": 0.4401901364326477, "step": 903 }, { "epoch": 0.8828125, "grad_norm": 0.5145865678787231, "learning_rate": 2.374877330716389e-05, "loss": 0.9345691800117493, "step": 904 }, { "epoch": 0.8837890625, "grad_norm": 0.7273013591766357, "learning_rate": 2.3552502453385675e-05, "loss": 0.27768659591674805, "step": 905 }, { "epoch": 0.884765625, "grad_norm": 0.3039482831954956, "learning_rate": 2.3356231599607457e-05, "loss": 0.6196010112762451, "step": 906 }, { "epoch": 0.8857421875, "grad_norm": 0.35697150230407715, "learning_rate": 2.3159960745829247e-05, "loss": 0.34777021408081055, "step": 907 }, { "epoch": 0.88671875, "grad_norm": 0.356717050075531, "learning_rate": 2.2963689892051032e-05, "loss": 0.4651508331298828, "step": 908 }, { "epoch": 0.8876953125, "grad_norm": 0.485963374376297, "learning_rate": 2.2767419038272818e-05, "loss": 0.3906201720237732, "step": 909 }, { "epoch": 0.888671875, "grad_norm": 0.38827836513519287, "learning_rate": 2.2571148184494604e-05, "loss": 0.48782849311828613, "step": 910 }, { "epoch": 0.8896484375, "grad_norm": 0.39589494466781616, "learning_rate": 2.237487733071639e-05, "loss": 0.5089969635009766, "step": 911 }, { "epoch": 0.890625, "grad_norm": 0.6619493365287781, "learning_rate": 2.2178606476938175e-05, "loss": 0.9266189932823181, "step": 912 }, { "epoch": 0.8916015625, "grad_norm": 0.407817542552948, "learning_rate": 2.198233562315996e-05, "loss": 0.3518386483192444, "step": 913 }, { "epoch": 0.892578125, "grad_norm": 0.4645719826221466, "learning_rate": 2.1786064769381747e-05, "loss": 0.9297075271606445, "step": 914 }, { "epoch": 0.8935546875, "grad_norm": 0.434517502784729, "learning_rate": 2.1589793915603536e-05, "loss": 0.7716128826141357, "step": 915 }, { "epoch": 0.89453125, "grad_norm": 0.49387747049331665, "learning_rate": 2.1393523061825322e-05, "loss": 0.5475488901138306, "step": 916 }, { "epoch": 0.8955078125, "grad_norm": 0.5593905448913574, "learning_rate": 2.1197252208047104e-05, "loss": 0.7304456233978271, "step": 917 }, { "epoch": 0.896484375, "grad_norm": 0.3386078178882599, "learning_rate": 2.1000981354268893e-05, "loss": 0.7872465252876282, "step": 918 }, { "epoch": 0.8974609375, "grad_norm": 0.2872868478298187, "learning_rate": 2.080471050049068e-05, "loss": 0.3295198976993561, "step": 919 }, { "epoch": 0.8984375, "grad_norm": 0.4897945523262024, "learning_rate": 2.060843964671246e-05, "loss": 0.3939395546913147, "step": 920 }, { "epoch": 0.8994140625, "grad_norm": 0.5068129897117615, "learning_rate": 2.041216879293425e-05, "loss": 0.4646037817001343, "step": 921 }, { "epoch": 0.900390625, "grad_norm": 0.3769625425338745, "learning_rate": 2.0215897939156036e-05, "loss": 0.811498761177063, "step": 922 }, { "epoch": 0.9013671875, "grad_norm": 0.380655974149704, "learning_rate": 2.0019627085377822e-05, "loss": 0.6260181665420532, "step": 923 }, { "epoch": 0.90234375, "grad_norm": 0.5810602903366089, "learning_rate": 1.9823356231599608e-05, "loss": 0.7125158309936523, "step": 924 }, { "epoch": 0.9033203125, "grad_norm": 0.4367387592792511, "learning_rate": 1.9627085377821394e-05, "loss": 0.7728107571601868, "step": 925 }, { "epoch": 0.904296875, "grad_norm": 0.604702353477478, "learning_rate": 1.9430814524043183e-05, "loss": 0.5136534571647644, "step": 926 }, { "epoch": 0.9052734375, "grad_norm": 0.40865615010261536, "learning_rate": 1.923454367026497e-05, "loss": 0.5040115714073181, "step": 927 }, { "epoch": 0.90625, "grad_norm": 0.3602078855037689, "learning_rate": 1.903827281648675e-05, "loss": 0.4498569965362549, "step": 928 }, { "epoch": 0.9072265625, "grad_norm": 0.46351152658462524, "learning_rate": 1.884200196270854e-05, "loss": 0.8635745644569397, "step": 929 }, { "epoch": 0.908203125, "grad_norm": 0.5490495562553406, "learning_rate": 1.8645731108930326e-05, "loss": 0.9265761375427246, "step": 930 }, { "epoch": 0.9091796875, "grad_norm": 0.4198157489299774, "learning_rate": 1.8449460255152108e-05, "loss": 0.8148217797279358, "step": 931 }, { "epoch": 0.91015625, "grad_norm": 0.5183578729629517, "learning_rate": 1.8253189401373897e-05, "loss": 0.7837534546852112, "step": 932 }, { "epoch": 0.9111328125, "grad_norm": 0.41839340329170227, "learning_rate": 1.8056918547595683e-05, "loss": 0.7239848971366882, "step": 933 }, { "epoch": 0.912109375, "grad_norm": 0.49158063530921936, "learning_rate": 1.786064769381747e-05, "loss": 0.7751527428627014, "step": 934 }, { "epoch": 0.9130859375, "grad_norm": 0.20171599090099335, "learning_rate": 1.7664376840039255e-05, "loss": 0.181843563914299, "step": 935 }, { "epoch": 0.9140625, "grad_norm": 0.36237961053848267, "learning_rate": 1.746810598626104e-05, "loss": 0.5150234699249268, "step": 936 }, { "epoch": 0.9150390625, "grad_norm": 0.4587535858154297, "learning_rate": 1.7271835132482826e-05, "loss": 0.6178685426712036, "step": 937 }, { "epoch": 0.916015625, "grad_norm": 0.392635703086853, "learning_rate": 1.7075564278704615e-05, "loss": 0.7002321481704712, "step": 938 }, { "epoch": 0.9169921875, "grad_norm": 0.28255772590637207, "learning_rate": 1.6879293424926398e-05, "loss": 0.6161627769470215, "step": 939 }, { "epoch": 0.91796875, "grad_norm": 0.31382182240486145, "learning_rate": 1.6683022571148187e-05, "loss": 0.6143029928207397, "step": 940 }, { "epoch": 0.9189453125, "grad_norm": 0.5099475383758545, "learning_rate": 1.6486751717369972e-05, "loss": 0.9116108417510986, "step": 941 }, { "epoch": 0.919921875, "grad_norm": 0.4015892446041107, "learning_rate": 1.6290480863591755e-05, "loss": 0.7331390380859375, "step": 942 }, { "epoch": 0.9208984375, "grad_norm": 0.4519053101539612, "learning_rate": 1.6094210009813544e-05, "loss": 0.6662384867668152, "step": 943 }, { "epoch": 0.921875, "grad_norm": 0.5565328598022461, "learning_rate": 1.589793915603533e-05, "loss": 0.37386590242385864, "step": 944 }, { "epoch": 0.9228515625, "grad_norm": 0.398419588804245, "learning_rate": 1.5701668302257116e-05, "loss": 0.9127399325370789, "step": 945 }, { "epoch": 0.923828125, "grad_norm": 0.37491804361343384, "learning_rate": 1.55053974484789e-05, "loss": 0.47025924921035767, "step": 946 }, { "epoch": 0.9248046875, "grad_norm": 0.49557894468307495, "learning_rate": 1.5309126594700687e-05, "loss": 0.6349594593048096, "step": 947 }, { "epoch": 0.92578125, "grad_norm": 0.2361314743757248, "learning_rate": 1.5112855740922475e-05, "loss": 0.3594982922077179, "step": 948 }, { "epoch": 0.9267578125, "grad_norm": 0.40022003650665283, "learning_rate": 1.491658488714426e-05, "loss": 0.41701436042785645, "step": 949 }, { "epoch": 0.927734375, "grad_norm": 0.349528431892395, "learning_rate": 1.4720314033366044e-05, "loss": 0.2943156063556671, "step": 950 }, { "epoch": 0.9287109375, "grad_norm": 0.4660559892654419, "learning_rate": 1.4524043179587832e-05, "loss": 0.3633948564529419, "step": 951 }, { "epoch": 0.9296875, "grad_norm": 0.28590673208236694, "learning_rate": 1.432777232580962e-05, "loss": 0.4886907935142517, "step": 952 }, { "epoch": 0.9306640625, "grad_norm": 0.4388448894023895, "learning_rate": 1.4131501472031405e-05, "loss": 0.6123654246330261, "step": 953 }, { "epoch": 0.931640625, "grad_norm": 0.4807531237602234, "learning_rate": 1.3935230618253189e-05, "loss": 0.32400381565093994, "step": 954 }, { "epoch": 0.9326171875, "grad_norm": 0.3903636932373047, "learning_rate": 1.3738959764474977e-05, "loss": 0.6839208006858826, "step": 955 }, { "epoch": 0.93359375, "grad_norm": 0.2925507426261902, "learning_rate": 1.3542688910696762e-05, "loss": 0.5898708701133728, "step": 956 }, { "epoch": 0.9345703125, "grad_norm": 0.39300912618637085, "learning_rate": 1.3346418056918546e-05, "loss": 0.3898833692073822, "step": 957 }, { "epoch": 0.935546875, "grad_norm": 0.4321513772010803, "learning_rate": 1.3150147203140334e-05, "loss": 0.5717346668243408, "step": 958 }, { "epoch": 0.9365234375, "grad_norm": 0.47681212425231934, "learning_rate": 1.2953876349362121e-05, "loss": 0.9711145162582397, "step": 959 }, { "epoch": 0.9375, "grad_norm": 0.524958610534668, "learning_rate": 1.2757605495583907e-05, "loss": 0.6577808260917664, "step": 960 }, { "epoch": 0.9384765625, "grad_norm": 0.40814298391342163, "learning_rate": 1.2561334641805691e-05, "loss": 0.5148733258247375, "step": 961 }, { "epoch": 0.939453125, "grad_norm": 0.3122687041759491, "learning_rate": 1.2365063788027479e-05, "loss": 0.884072482585907, "step": 962 }, { "epoch": 0.9404296875, "grad_norm": 0.4473840594291687, "learning_rate": 1.2168792934249266e-05, "loss": 0.660685658454895, "step": 963 }, { "epoch": 0.94140625, "grad_norm": 0.3491450548171997, "learning_rate": 1.197252208047105e-05, "loss": 0.8680378794670105, "step": 964 }, { "epoch": 0.9423828125, "grad_norm": 0.6323879957199097, "learning_rate": 1.1776251226692837e-05, "loss": 0.8196921348571777, "step": 965 }, { "epoch": 0.943359375, "grad_norm": 0.354900062084198, "learning_rate": 1.1579980372914623e-05, "loss": 0.5380838513374329, "step": 966 }, { "epoch": 0.9443359375, "grad_norm": 0.3235265612602234, "learning_rate": 1.1383709519136409e-05, "loss": 0.39993464946746826, "step": 967 }, { "epoch": 0.9453125, "grad_norm": 0.3700491786003113, "learning_rate": 1.1187438665358195e-05, "loss": 0.6613435745239258, "step": 968 }, { "epoch": 0.9462890625, "grad_norm": 0.29880228638648987, "learning_rate": 1.099116781157998e-05, "loss": 0.5756196975708008, "step": 969 }, { "epoch": 0.947265625, "grad_norm": 0.4585433304309845, "learning_rate": 1.0794896957801768e-05, "loss": 0.5012968182563782, "step": 970 }, { "epoch": 0.9482421875, "grad_norm": 0.5275799632072449, "learning_rate": 1.0598626104023552e-05, "loss": 0.4986013174057007, "step": 971 }, { "epoch": 0.94921875, "grad_norm": 0.30642619729042053, "learning_rate": 1.040235525024534e-05, "loss": 0.29793277382850647, "step": 972 }, { "epoch": 0.9501953125, "grad_norm": 0.7356166243553162, "learning_rate": 1.0206084396467125e-05, "loss": 0.6518126726150513, "step": 973 }, { "epoch": 0.951171875, "grad_norm": 0.6069150567054749, "learning_rate": 1.0009813542688911e-05, "loss": 0.7005544900894165, "step": 974 }, { "epoch": 0.9521484375, "grad_norm": 0.500067949295044, "learning_rate": 9.813542688910697e-06, "loss": 0.5567950010299683, "step": 975 }, { "epoch": 0.953125, "grad_norm": 0.5926097631454468, "learning_rate": 9.617271835132484e-06, "loss": 0.6974345445632935, "step": 976 }, { "epoch": 0.9541015625, "grad_norm": 0.28873002529144287, "learning_rate": 9.42100098135427e-06, "loss": 0.28231939673423767, "step": 977 }, { "epoch": 0.955078125, "grad_norm": 0.6644822359085083, "learning_rate": 9.224730127576054e-06, "loss": 0.46575701236724854, "step": 978 }, { "epoch": 0.9560546875, "grad_norm": 0.34748774766921997, "learning_rate": 9.028459273797842e-06, "loss": 0.7192713022232056, "step": 979 }, { "epoch": 0.95703125, "grad_norm": 0.4444558024406433, "learning_rate": 8.832188420019627e-06, "loss": 0.34014150500297546, "step": 980 }, { "epoch": 0.9580078125, "grad_norm": 0.4814091920852661, "learning_rate": 8.635917566241413e-06, "loss": 0.8042552471160889, "step": 981 }, { "epoch": 0.958984375, "grad_norm": 0.5443412661552429, "learning_rate": 8.439646712463199e-06, "loss": 0.6534023880958557, "step": 982 }, { "epoch": 0.9599609375, "grad_norm": 0.40025195479393005, "learning_rate": 8.243375858684986e-06, "loss": 0.9056930541992188, "step": 983 }, { "epoch": 0.9609375, "grad_norm": 0.41958069801330566, "learning_rate": 8.047105004906772e-06, "loss": 0.5610394477844238, "step": 984 }, { "epoch": 0.9619140625, "grad_norm": 0.33056482672691345, "learning_rate": 7.850834151128558e-06, "loss": 0.5796000361442566, "step": 985 }, { "epoch": 0.962890625, "grad_norm": 0.5056169629096985, "learning_rate": 7.654563297350344e-06, "loss": 0.7795373201370239, "step": 986 }, { "epoch": 0.9638671875, "grad_norm": 0.4030667543411255, "learning_rate": 7.45829244357213e-06, "loss": 0.761528491973877, "step": 987 }, { "epoch": 0.96484375, "grad_norm": 0.22716952860355377, "learning_rate": 7.262021589793916e-06, "loss": 0.21712671220302582, "step": 988 }, { "epoch": 0.9658203125, "grad_norm": 0.4826786518096924, "learning_rate": 7.0657507360157025e-06, "loss": 0.6192560791969299, "step": 989 }, { "epoch": 0.966796875, "grad_norm": 0.3611379861831665, "learning_rate": 6.869479882237488e-06, "loss": 0.5660407543182373, "step": 990 }, { "epoch": 0.9677734375, "grad_norm": 0.44197750091552734, "learning_rate": 6.673209028459273e-06, "loss": 0.8223164081573486, "step": 991 }, { "epoch": 0.96875, "grad_norm": 0.45650866627693176, "learning_rate": 6.476938174681061e-06, "loss": 0.5810177326202393, "step": 992 }, { "epoch": 0.9697265625, "grad_norm": 0.6275922060012817, "learning_rate": 6.2806673209028455e-06, "loss": 0.46302127838134766, "step": 993 }, { "epoch": 0.970703125, "grad_norm": 0.29163289070129395, "learning_rate": 6.084396467124633e-06, "loss": 0.49744415283203125, "step": 994 }, { "epoch": 0.9716796875, "grad_norm": 0.4289768934249878, "learning_rate": 5.888125613346419e-06, "loss": 0.39710360765457153, "step": 995 }, { "epoch": 0.97265625, "grad_norm": 0.43311089277267456, "learning_rate": 5.6918547595682045e-06, "loss": 0.4934995174407959, "step": 996 }, { "epoch": 0.9736328125, "grad_norm": 0.4249640703201294, "learning_rate": 5.49558390578999e-06, "loss": 0.6822129487991333, "step": 997 }, { "epoch": 0.974609375, "grad_norm": 0.4080635607242584, "learning_rate": 5.299313052011776e-06, "loss": 0.2851019501686096, "step": 998 }, { "epoch": 0.9755859375, "grad_norm": 0.3082174062728882, "learning_rate": 5.103042198233563e-06, "loss": 0.8851650357246399, "step": 999 }, { "epoch": 0.9765625, "grad_norm": 0.5285578370094299, "learning_rate": 4.906771344455348e-06, "loss": 0.5684286952018738, "step": 1000 }, { "epoch": 0.9775390625, "grad_norm": 0.37052616477012634, "learning_rate": 4.710500490677135e-06, "loss": 0.8170924782752991, "step": 1001 }, { "epoch": 0.978515625, "grad_norm": 0.46926191449165344, "learning_rate": 4.514229636898921e-06, "loss": 0.665911853313446, "step": 1002 }, { "epoch": 0.9794921875, "grad_norm": 0.38110095262527466, "learning_rate": 4.3179587831207065e-06, "loss": 0.9365942478179932, "step": 1003 }, { "epoch": 0.98046875, "grad_norm": 0.3803754150867462, "learning_rate": 4.121687929342493e-06, "loss": 0.756361722946167, "step": 1004 }, { "epoch": 0.9814453125, "grad_norm": 0.6576887965202332, "learning_rate": 3.925417075564279e-06, "loss": 0.6846331357955933, "step": 1005 }, { "epoch": 0.982421875, "grad_norm": 0.6425113081932068, "learning_rate": 3.729146221786065e-06, "loss": 0.7665562629699707, "step": 1006 }, { "epoch": 0.9833984375, "grad_norm": 0.28858375549316406, "learning_rate": 3.5328753680078512e-06, "loss": 0.2748746871948242, "step": 1007 }, { "epoch": 0.984375, "grad_norm": 0.38693365454673767, "learning_rate": 3.3366045142296366e-06, "loss": 0.6602081060409546, "step": 1008 }, { "epoch": 0.9853515625, "grad_norm": 0.39297735691070557, "learning_rate": 3.1403336604514228e-06, "loss": 0.43784576654434204, "step": 1009 }, { "epoch": 0.986328125, "grad_norm": 0.4182215929031372, "learning_rate": 2.9440628066732094e-06, "loss": 0.7852948307991028, "step": 1010 }, { "epoch": 0.9873046875, "grad_norm": 0.4079328775405884, "learning_rate": 2.747791952894995e-06, "loss": 0.5413305759429932, "step": 1011 }, { "epoch": 0.98828125, "grad_norm": 0.41826963424682617, "learning_rate": 2.5515210991167813e-06, "loss": 0.449452668428421, "step": 1012 }, { "epoch": 0.9892578125, "grad_norm": 0.31969836354255676, "learning_rate": 2.3552502453385675e-06, "loss": 0.26595592498779297, "step": 1013 }, { "epoch": 0.990234375, "grad_norm": 0.466192364692688, "learning_rate": 2.1589793915603533e-06, "loss": 0.6175995469093323, "step": 1014 }, { "epoch": 0.9912109375, "grad_norm": 0.4734349846839905, "learning_rate": 1.9627085377821394e-06, "loss": 0.6440984010696411, "step": 1015 }, { "epoch": 0.9921875, "grad_norm": 0.4446095824241638, "learning_rate": 1.7664376840039256e-06, "loss": 0.5738557577133179, "step": 1016 }, { "epoch": 0.9931640625, "grad_norm": 0.24098840355873108, "learning_rate": 1.5701668302257114e-06, "loss": 0.6320365071296692, "step": 1017 }, { "epoch": 0.994140625, "grad_norm": 0.5342791676521301, "learning_rate": 1.3738959764474976e-06, "loss": 0.9431695938110352, "step": 1018 }, { "epoch": 0.9951171875, "grad_norm": 0.31406712532043457, "learning_rate": 1.1776251226692837e-06, "loss": 0.6406105160713196, "step": 1019 }, { "epoch": 0.99609375, "grad_norm": 0.5162865519523621, "learning_rate": 9.813542688910697e-07, "loss": 0.7935853004455566, "step": 1020 }, { "epoch": 0.9970703125, "grad_norm": 0.4624859690666199, "learning_rate": 7.850834151128557e-07, "loss": 0.9667851328849792, "step": 1021 }, { "epoch": 0.998046875, "grad_norm": 0.43549951910972595, "learning_rate": 5.888125613346419e-07, "loss": 0.73248291015625, "step": 1022 }, { "epoch": 0.9990234375, "grad_norm": 0.6080308556556702, "learning_rate": 3.9254170755642785e-07, "loss": 0.5045021772384644, "step": 1023 }, { "epoch": 1.0, "grad_norm": 0.3927266299724579, "learning_rate": 1.9627085377821392e-07, "loss": 0.37262263894081116, "step": 1024 } ], "logging_steps": 1, "max_steps": 1024, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.871410239702333e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }