{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1860, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016148566814695196, "grad_norm": 0.5020289421081543, "learning_rate": 8.999999999999999e-05, "loss": 2.439466857910156, "step": 10 }, { "epoch": 0.03229713362939039, "grad_norm": 0.6233699321746826, "learning_rate": 0.00019, "loss": 2.4973506927490234, "step": 20 }, { "epoch": 0.04844570044408559, "grad_norm": 0.6460151076316833, "learning_rate": 0.00029, "loss": 2.579629898071289, "step": 30 }, { "epoch": 0.06459426725878079, "grad_norm": 0.5584802031517029, "learning_rate": 0.00039000000000000005, "loss": 2.5269956588745117, "step": 40 }, { "epoch": 0.08074283407347597, "grad_norm": 0.5267783403396606, "learning_rate": 0.00049, "loss": 2.5723337173461913, "step": 50 }, { "epoch": 0.09689140088817118, "grad_norm": 0.492374062538147, "learning_rate": 0.00059, "loss": 2.6084869384765623, "step": 60 }, { "epoch": 0.11303996770286637, "grad_norm": 0.533662736415863, "learning_rate": 0.00069, "loss": 2.527654838562012, "step": 70 }, { "epoch": 0.12918853451756157, "grad_norm": 0.568081796169281, "learning_rate": 0.00079, "loss": 2.584039497375488, "step": 80 }, { "epoch": 0.14533710133225677, "grad_norm": 0.5420770049095154, "learning_rate": 0.0008900000000000001, "loss": 2.5998212814331056, "step": 90 }, { "epoch": 0.16148566814695195, "grad_norm": 0.5972040295600891, "learning_rate": 0.00099, "loss": 2.61820011138916, "step": 100 }, { "epoch": 0.17763423496164715, "grad_norm": 0.622533917427063, "learning_rate": 0.0009999354806331361, "loss": 2.5819944381713866, "step": 110 }, { "epoch": 0.19378280177634236, "grad_norm": 0.6911935210227966, "learning_rate": 0.0009997124721002689, "loss": 2.5768102645874023, "step": 120 }, { "epoch": 0.20993136859103753, "grad_norm": 0.7095156908035278, "learning_rate": 0.000999330248902402, "loss": 2.6130306243896486, "step": 130 }, { "epoch": 0.22607993540573273, "grad_norm": 0.5791682600975037, "learning_rate": 0.0009987889328206437, "loss": 2.562555503845215, "step": 140 }, { "epoch": 0.24222850222042794, "grad_norm": 0.5518380403518677, "learning_rate": 0.0009980886963250907, "loss": 2.518760108947754, "step": 150 }, { "epoch": 0.25837706903512314, "grad_norm": 0.4897823631763458, "learning_rate": 0.000997229762519879, "loss": 2.6402866363525392, "step": 160 }, { "epoch": 0.2745256358498183, "grad_norm": 0.5050747394561768, "learning_rate": 0.0009962124050720978, "loss": 2.684323310852051, "step": 170 }, { "epoch": 0.29067420266451355, "grad_norm": 0.468423068523407, "learning_rate": 0.0009950369481245985, "loss": 2.6152185440063476, "step": 180 }, { "epoch": 0.3068227694792087, "grad_norm": 0.5091232657432556, "learning_rate": 0.0009937037661927161, "loss": 2.531853675842285, "step": 190 }, { "epoch": 0.3229713362939039, "grad_norm": 0.4922482967376709, "learning_rate": 0.0009922132840449458, "loss": 2.6094560623168945, "step": 200 }, { "epoch": 0.33911990310859913, "grad_norm": 0.5051465034484863, "learning_rate": 0.0009905659765676053, "loss": 2.559980583190918, "step": 210 }, { "epoch": 0.3552684699232943, "grad_norm": 0.4865105450153351, "learning_rate": 0.0009887623686135306, "loss": 2.508647346496582, "step": 220 }, { "epoch": 0.3714170367379895, "grad_norm": 0.5287356376647949, "learning_rate": 0.0009868030348348512, "loss": 2.6150222778320313, "step": 230 }, { "epoch": 0.3875656035526847, "grad_norm": 0.4643533229827881, "learning_rate": 0.0009846885994998983, "loss": 2.6150381088256838, "step": 240 }, { "epoch": 0.4037141703673799, "grad_norm": 0.46244189143180847, "learning_rate": 0.0009824197362943063, "loss": 2.5374935150146483, "step": 250 }, { "epoch": 0.41986273718207506, "grad_norm": 0.480276882648468, "learning_rate": 0.000979997168106366, "loss": 2.5654741287231446, "step": 260 }, { "epoch": 0.4360113039967703, "grad_norm": 0.5552269816398621, "learning_rate": 0.0009774216667967062, "loss": 2.6036794662475584, "step": 270 }, { "epoch": 0.45215987081146547, "grad_norm": 0.511289656162262, "learning_rate": 0.000974694052952366, "loss": 2.610031318664551, "step": 280 }, { "epoch": 0.4683084376261607, "grad_norm": 0.5037236213684082, "learning_rate": 0.000971815195625348, "loss": 2.516169548034668, "step": 290 }, { "epoch": 0.4844570044408559, "grad_norm": 0.5199276804924011, "learning_rate": 0.000968786012055726, "loss": 2.5069480895996095, "step": 300 }, { "epoch": 0.500605571255551, "grad_norm": 0.5875343680381775, "learning_rate": 0.0009656074673794017, "loss": 2.639841651916504, "step": 310 }, { "epoch": 0.5167541380702463, "grad_norm": 0.5489600896835327, "learning_rate": 0.0009622805743205998, "loss": 2.5628652572631836, "step": 320 }, { "epoch": 0.5329027048849415, "grad_norm": 0.4753468334674835, "learning_rate": 0.0009588063928692012, "loss": 2.5956233978271483, "step": 330 }, { "epoch": 0.5490512716996366, "grad_norm": 0.5152420997619629, "learning_rate": 0.0009551860299430173, "loss": 2.597700500488281, "step": 340 }, { "epoch": 0.5651998385143319, "grad_norm": 0.4520896375179291, "learning_rate": 0.0009514206390351116, "loss": 2.586415481567383, "step": 350 }, { "epoch": 0.5813484053290271, "grad_norm": 0.5123590230941772, "learning_rate": 0.0009475114198462837, "loss": 2.555033302307129, "step": 360 }, { "epoch": 0.5974969721437222, "grad_norm": 0.6008352637290955, "learning_rate": 0.0009434596179028271, "loss": 2.6199901580810545, "step": 370 }, { "epoch": 0.6136455389584174, "grad_norm": 0.4769132435321808, "learning_rate": 0.0009392665241596914, "loss": 2.5420787811279295, "step": 380 }, { "epoch": 0.6297941057731127, "grad_norm": 0.4474424421787262, "learning_rate": 0.0009349334745891666, "loss": 2.549270820617676, "step": 390 }, { "epoch": 0.6459426725878078, "grad_norm": 0.5046530365943909, "learning_rate": 0.0009304618497552253, "loss": 2.540346145629883, "step": 400 }, { "epoch": 0.662091239402503, "grad_norm": 0.5442773699760437, "learning_rate": 0.0009258530743736586, "loss": 2.550925636291504, "step": 410 }, { "epoch": 0.6782398062171983, "grad_norm": 0.45155641436576843, "learning_rate": 0.0009211086168581433, "loss": 2.5896928787231444, "step": 420 }, { "epoch": 0.6943883730318934, "grad_norm": 0.48448678851127625, "learning_rate": 0.0009162299888523867, "loss": 2.568522834777832, "step": 430 }, { "epoch": 0.7105369398465886, "grad_norm": 0.4634808897972107, "learning_rate": 0.0009112187447484979, "loss": 2.543706512451172, "step": 440 }, { "epoch": 0.7266855066612838, "grad_norm": 0.520962655544281, "learning_rate": 0.0009060764811917397, "loss": 2.4791580200195313, "step": 450 }, { "epoch": 0.742834073475979, "grad_norm": 0.495394766330719, "learning_rate": 0.0009008048365718167, "loss": 2.6086082458496094, "step": 460 }, { "epoch": 0.7589826402906742, "grad_norm": 0.4325544834136963, "learning_rate": 0.0008954054905008639, "loss": 2.5405605316162108, "step": 470 }, { "epoch": 0.7751312071053694, "grad_norm": 0.4992341697216034, "learning_rate": 0.0008898801632783013, "loss": 2.6021982192993165, "step": 480 }, { "epoch": 0.7912797739200645, "grad_norm": 0.5032821893692017, "learning_rate": 0.0008842306153427246, "loss": 2.5671119689941406, "step": 490 }, { "epoch": 0.8074283407347598, "grad_norm": 0.44175952672958374, "learning_rate": 0.000878458646711008, "loss": 2.5145410537719726, "step": 500 }, { "epoch": 0.823576907549455, "grad_norm": 0.48530301451683044, "learning_rate": 0.0008725660964047959, "loss": 2.4978832244873046, "step": 510 }, { "epoch": 0.8397254743641501, "grad_norm": 0.4604915976524353, "learning_rate": 0.0008665548418645672, "loss": 2.5596445083618162, "step": 520 }, { "epoch": 0.8558740411788454, "grad_norm": 0.45781826972961426, "learning_rate": 0.0008604267983514594, "loss": 2.6085268020629884, "step": 530 }, { "epoch": 0.8720226079935406, "grad_norm": 0.48819592595100403, "learning_rate": 0.000854183918337043, "loss": 2.5747554779052733, "step": 540 }, { "epoch": 0.8881711748082358, "grad_norm": 0.4929693043231964, "learning_rate": 0.0008478281908812387, "loss": 2.543058395385742, "step": 550 }, { "epoch": 0.9043197416229309, "grad_norm": 0.4913038909435272, "learning_rate": 0.0008413616409985779, "loss": 2.5399295806884767, "step": 560 }, { "epoch": 0.9204683084376262, "grad_norm": 0.47384563088417053, "learning_rate": 0.0008347863290130087, "loss": 2.5927974700927736, "step": 570 }, { "epoch": 0.9366168752523214, "grad_norm": 0.4775764048099518, "learning_rate": 0.0008281043499014498, "loss": 2.5593168258666994, "step": 580 }, { "epoch": 0.9527654420670165, "grad_norm": 0.5058761239051819, "learning_rate": 0.0008213178326263049, "loss": 2.552435111999512, "step": 590 }, { "epoch": 0.9689140088817118, "grad_norm": 0.4922596216201782, "learning_rate": 0.0008144289394571484, "loss": 2.5472679138183594, "step": 600 }, { "epoch": 0.985062575696407, "grad_norm": 0.49022358655929565, "learning_rate": 0.0008074398652817998, "loss": 2.5109460830688475, "step": 610 }, { "epoch": 1.0, "grad_norm": 1.6876914501190186, "learning_rate": 0.0008003528369070043, "loss": 2.465944290161133, "step": 620 }, { "epoch": 1.0161485668146952, "grad_norm": 0.4119199812412262, "learning_rate": 0.0007931701123489439, "loss": 2.520769500732422, "step": 630 }, { "epoch": 1.0322971336293905, "grad_norm": 0.4147922992706299, "learning_rate": 0.000785893980113806, "loss": 2.5548782348632812, "step": 640 }, { "epoch": 1.0484457004440857, "grad_norm": 0.5313045382499695, "learning_rate": 0.0007785267584686366, "loss": 2.6331764221191407, "step": 650 }, { "epoch": 1.0645942672587807, "grad_norm": 0.45284605026245117, "learning_rate": 0.00077107079470271, "loss": 2.519462013244629, "step": 660 }, { "epoch": 1.080742834073476, "grad_norm": 0.5042719841003418, "learning_rate": 0.0007635284643796545, "loss": 2.4921422958374024, "step": 670 }, { "epoch": 1.0968914008881712, "grad_norm": 0.4403098225593567, "learning_rate": 0.0007559021705805671, "loss": 2.454839897155762, "step": 680 }, { "epoch": 1.1130399677028664, "grad_norm": 0.4963165819644928, "learning_rate": 0.0007481943431383622, "loss": 2.5821470260620116, "step": 690 }, { "epoch": 1.1291885345175616, "grad_norm": 0.4509197175502777, "learning_rate": 0.000740407437863596, "loss": 2.4973094940185545, "step": 700 }, { "epoch": 1.1453371013322569, "grad_norm": 0.49439796805381775, "learning_rate": 0.0007325439357620147, "loss": 2.6476114273071287, "step": 710 }, { "epoch": 1.1614856681469519, "grad_norm": 0.5243302583694458, "learning_rate": 0.0007246063422440747, "loss": 2.4758913040161135, "step": 720 }, { "epoch": 1.177634234961647, "grad_norm": 0.6043158769607544, "learning_rate": 0.0007165971863266878, "loss": 2.5820987701416014, "step": 730 }, { "epoch": 1.1937828017763423, "grad_norm": 0.44207850098609924, "learning_rate": 0.0007085190198274438, "loss": 2.4599235534667967, "step": 740 }, { "epoch": 1.2099313685910376, "grad_norm": 0.4254566431045532, "learning_rate": 0.0007003744165515704, "loss": 2.4942739486694334, "step": 750 }, { "epoch": 1.2260799354057328, "grad_norm": 0.41949278116226196, "learning_rate": 0.0006921659714718863, "loss": 2.505445098876953, "step": 760 }, { "epoch": 1.242228502220428, "grad_norm": 0.4123310148715973, "learning_rate": 0.0006838962999020094, "loss": 2.5693735122680663, "step": 770 }, { "epoch": 1.2583770690351233, "grad_norm": 0.4186009168624878, "learning_rate": 0.0006755680366630865, "loss": 2.4493398666381836, "step": 780 }, { "epoch": 1.2745256358498183, "grad_norm": 0.444654643535614, "learning_rate": 0.0006671838352443049, "loss": 2.5728691101074217, "step": 790 }, { "epoch": 1.2906742026645135, "grad_norm": 0.40202823281288147, "learning_rate": 0.0006587463669574584, "loss": 2.526685333251953, "step": 800 }, { "epoch": 1.3068227694792087, "grad_norm": 0.4873361885547638, "learning_rate": 0.0006502583200858335, "loss": 2.5635454177856447, "step": 810 }, { "epoch": 1.322971336293904, "grad_norm": 0.5394927859306335, "learning_rate": 0.0006417223990276883, "loss": 2.6018707275390627, "step": 820 }, { "epoch": 1.3391199031085992, "grad_norm": 0.5254472494125366, "learning_rate": 0.0006331413234345977, "loss": 2.5202842712402345, "step": 830 }, { "epoch": 1.3552684699232942, "grad_norm": 0.4611901342868805, "learning_rate": 0.0006245178273449383, "loss": 2.527310371398926, "step": 840 }, { "epoch": 1.3714170367379894, "grad_norm": 0.46280530095100403, "learning_rate": 0.0006158546583127886, "loss": 2.5010074615478515, "step": 850 }, { "epoch": 1.3875656035526847, "grad_norm": 0.47044530510902405, "learning_rate": 0.0006071545765325253, "loss": 2.658343505859375, "step": 860 }, { "epoch": 1.4037141703673799, "grad_norm": 0.5649057030677795, "learning_rate": 0.0005984203539593897, "loss": 2.514650344848633, "step": 870 }, { "epoch": 1.4198627371820751, "grad_norm": 0.5100232362747192, "learning_rate": 0.0005896547734263077, "loss": 2.4994720458984374, "step": 880 }, { "epoch": 1.4360113039967703, "grad_norm": 0.4990105926990509, "learning_rate": 0.0005808606277572453, "loss": 2.489163398742676, "step": 890 }, { "epoch": 1.4521598708114656, "grad_norm": 0.473297655582428, "learning_rate": 0.0005720407188773791, "loss": 2.534769630432129, "step": 900 }, { "epoch": 1.4683084376261606, "grad_norm": 0.5112643837928772, "learning_rate": 0.000563197856920368, "loss": 2.5247997283935546, "step": 910 }, { "epoch": 1.4844570044408558, "grad_norm": 0.4357326924800873, "learning_rate": 0.0005543348593330093, "loss": 2.508163642883301, "step": 920 }, { "epoch": 1.500605571255551, "grad_norm": 0.47014695405960083, "learning_rate": 0.0005454545499775651, "loss": 2.6127824783325195, "step": 930 }, { "epoch": 1.5167541380702463, "grad_norm": 0.4659437835216522, "learning_rate": 0.0005365597582320436, "loss": 2.4793100357055664, "step": 940 }, { "epoch": 1.5329027048849415, "grad_norm": 0.5408269166946411, "learning_rate": 0.0005276533180887248, "loss": 2.45506591796875, "step": 950 }, { "epoch": 1.5490512716996365, "grad_norm": 0.5870039463043213, "learning_rate": 0.000518738067251214, "loss": 2.480586814880371, "step": 960 }, { "epoch": 1.565199838514332, "grad_norm": 0.48131951689720154, "learning_rate": 0.0005098168462303141, "loss": 2.589830207824707, "step": 970 }, { "epoch": 1.581348405329027, "grad_norm": 0.4618188440799713, "learning_rate": 0.0005008924974390041, "loss": 2.5488056182861327, "step": 980 }, { "epoch": 1.5974969721437222, "grad_norm": 0.49082285165786743, "learning_rate": 0.0004919678642868092, "loss": 2.4781982421875, "step": 990 }, { "epoch": 1.6136455389584174, "grad_norm": 0.47744420170783997, "learning_rate": 0.0004830457902738558, "loss": 2.517325210571289, "step": 1000 }, { "epoch": 1.6297941057731127, "grad_norm": 0.507945716381073, "learning_rate": 0.0004741291180848961, "loss": 2.5076452255249024, "step": 1010 }, { "epoch": 1.645942672587808, "grad_norm": 0.42816001176834106, "learning_rate": 0.000465220688683594, "loss": 2.671817398071289, "step": 1020 }, { "epoch": 1.662091239402503, "grad_norm": 0.4258963167667389, "learning_rate": 0.00045632334040735764, "loss": 2.533784103393555, "step": 1030 }, { "epoch": 1.6782398062171984, "grad_norm": 0.494028240442276, "learning_rate": 0.00044743990806300917, "loss": 2.514291000366211, "step": 1040 }, { "epoch": 1.6943883730318934, "grad_norm": 0.4230322539806366, "learning_rate": 0.00043857322202358066, "loss": 2.5531696319580077, "step": 1050 }, { "epoch": 1.7105369398465886, "grad_norm": 0.5738111734390259, "learning_rate": 0.00042972610732652105, "loss": 2.5059112548828124, "step": 1060 }, { "epoch": 1.7266855066612838, "grad_norm": 0.5092839002609253, "learning_rate": 0.0004209013827736042, "loss": 2.5219795227050783, "step": 1070 }, { "epoch": 1.7428340734759789, "grad_norm": 0.4812857508659363, "learning_rate": 0.00041210186003282274, "loss": 2.5235408782958983, "step": 1080 }, { "epoch": 1.7589826402906743, "grad_norm": 0.47096627950668335, "learning_rate": 0.000403330342742556, "loss": 2.574551582336426, "step": 1090 }, { "epoch": 1.7751312071053693, "grad_norm": 0.4319113790988922, "learning_rate": 0.0003945896256182949, "loss": 2.610904502868652, "step": 1100 }, { "epoch": 1.7912797739200645, "grad_norm": 0.44047966599464417, "learning_rate": 0.0003858824935622115, "loss": 2.5323257446289062, "step": 1110 }, { "epoch": 1.8074283407347598, "grad_norm": 0.548047661781311, "learning_rate": 0.00037721172077585287, "loss": 2.5165468215942384, "step": 1120 }, { "epoch": 1.823576907549455, "grad_norm": 0.46526339650154114, "learning_rate": 0.00036858006987624723, "loss": 2.502303886413574, "step": 1130 }, { "epoch": 1.8397254743641502, "grad_norm": 0.4635223150253296, "learning_rate": 0.0003599902910156984, "loss": 2.5442089080810546, "step": 1140 }, { "epoch": 1.8558740411788452, "grad_norm": 0.5317935347557068, "learning_rate": 0.0003514451210055527, "loss": 2.600077247619629, "step": 1150 }, { "epoch": 1.8720226079935407, "grad_norm": 0.5463606119155884, "learning_rate": 0.00034294728244421756, "loss": 2.574476623535156, "step": 1160 }, { "epoch": 1.8881711748082357, "grad_norm": 0.4650241732597351, "learning_rate": 0.00033449948284970617, "loss": 2.500654411315918, "step": 1170 }, { "epoch": 1.904319741622931, "grad_norm": 0.40839987993240356, "learning_rate": 0.00032610441379698937, "loss": 2.4712839126586914, "step": 1180 }, { "epoch": 1.9204683084376262, "grad_norm": 0.49943020939826965, "learning_rate": 0.0003177647500604252, "loss": 2.5296091079711913, "step": 1190 }, { "epoch": 1.9366168752523214, "grad_norm": 0.4655015468597412, "learning_rate": 0.00030948314876154306, "loss": 2.5075130462646484, "step": 1200 }, { "epoch": 1.9527654420670166, "grad_norm": 0.45307499170303345, "learning_rate": 0.00030126224852245056, "loss": 2.464124298095703, "step": 1210 }, { "epoch": 1.9689140088817116, "grad_norm": 0.5530602931976318, "learning_rate": 0.0002931046686251365, "loss": 2.553061866760254, "step": 1220 }, { "epoch": 1.985062575696407, "grad_norm": 0.46495500206947327, "learning_rate": 0.0002850130081769334, "loss": 2.486197853088379, "step": 1230 }, { "epoch": 2.0, "grad_norm": 1.9545954465866089, "learning_rate": 0.00027698984528241036, "loss": 2.4371877670288087, "step": 1240 }, { "epoch": 2.016148566814695, "grad_norm": 0.48022302985191345, "learning_rate": 0.00026903773622195636, "loss": 2.512773895263672, "step": 1250 }, { "epoch": 2.0322971336293905, "grad_norm": 0.433242529630661, "learning_rate": 0.00026115921463731694, "loss": 2.5300994873046876, "step": 1260 }, { "epoch": 2.0484457004440855, "grad_norm": 0.47226834297180176, "learning_rate": 0.0002533567907243446, "loss": 2.421502113342285, "step": 1270 }, { "epoch": 2.064594267258781, "grad_norm": 0.5992064476013184, "learning_rate": 0.00024563295043321783, "loss": 2.5453359603881838, "step": 1280 }, { "epoch": 2.080742834073476, "grad_norm": 0.4261581599712372, "learning_rate": 0.0002379901546763879, "loss": 2.495037841796875, "step": 1290 }, { "epoch": 2.0968914008881714, "grad_norm": 0.4328082203865051, "learning_rate": 0.00023043083854449987, "loss": 2.464985466003418, "step": 1300 }, { "epoch": 2.1130399677028664, "grad_norm": 0.4510248303413391, "learning_rate": 0.00022295741053054296, "loss": 2.5308864593505858, "step": 1310 }, { "epoch": 2.1291885345175614, "grad_norm": 0.49293237924575806, "learning_rate": 0.00021557225176247353, "loss": 2.5278299331665037, "step": 1320 }, { "epoch": 2.145337101332257, "grad_norm": 0.4132377505302429, "learning_rate": 0.0002082777152445589, "loss": 2.398031234741211, "step": 1330 }, { "epoch": 2.161485668146952, "grad_norm": 0.5191354751586914, "learning_rate": 0.00020107612510768014, "loss": 2.5248828887939454, "step": 1340 }, { "epoch": 2.1776342349616473, "grad_norm": 0.5512005686759949, "learning_rate": 0.00019396977586883475, "loss": 2.4451154708862304, "step": 1350 }, { "epoch": 2.1937828017763423, "grad_norm": 0.42351678013801575, "learning_rate": 0.00018696093170007493, "loss": 2.573942756652832, "step": 1360 }, { "epoch": 2.2099313685910373, "grad_norm": 0.45575806498527527, "learning_rate": 0.00018005182570711366, "loss": 2.5537620544433595, "step": 1370 }, { "epoch": 2.226079935405733, "grad_norm": 0.46785682439804077, "learning_rate": 0.0001732446592178295, "loss": 2.491817092895508, "step": 1380 }, { "epoch": 2.242228502220428, "grad_norm": 0.45513054728507996, "learning_rate": 0.00016654160108089594, "loss": 2.5171764373779295, "step": 1390 }, { "epoch": 2.2583770690351233, "grad_norm": 0.43288710713386536, "learning_rate": 0.00015994478697475885, "loss": 2.5143251419067383, "step": 1400 }, { "epoch": 2.2745256358498183, "grad_norm": 0.4459301233291626, "learning_rate": 0.00015345631872718213, "loss": 2.5065849304199217, "step": 1410 }, { "epoch": 2.2906742026645137, "grad_norm": 0.501880407333374, "learning_rate": 0.00014707826364557985, "loss": 2.4833837509155274, "step": 1420 }, { "epoch": 2.3068227694792087, "grad_norm": 0.4559042453765869, "learning_rate": 0.00014081265385834557, "loss": 2.5120367050170898, "step": 1430 }, { "epoch": 2.3229713362939037, "grad_norm": 0.48048946261405945, "learning_rate": 0.000134661485667391, "loss": 2.4629817962646485, "step": 1440 }, { "epoch": 2.339119903108599, "grad_norm": 0.46413764357566833, "learning_rate": 0.0001286267189120986, "loss": 2.4572961807250975, "step": 1450 }, { "epoch": 2.355268469923294, "grad_norm": 0.4971129298210144, "learning_rate": 0.000122710276344893, "loss": 2.5448049545288085, "step": 1460 }, { "epoch": 2.3714170367379896, "grad_norm": 0.45018401741981506, "learning_rate": 0.00011691404301862746, "loss": 2.551463317871094, "step": 1470 }, { "epoch": 2.3875656035526847, "grad_norm": 0.49596303701400757, "learning_rate": 0.00011123986568598249, "loss": 2.5440711975097656, "step": 1480 }, { "epoch": 2.4037141703673797, "grad_norm": 0.47923141717910767, "learning_rate": 0.00010568955221106713, "loss": 2.45603084564209, "step": 1490 }, { "epoch": 2.419862737182075, "grad_norm": 0.4507387578487396, "learning_rate": 0.0001002648709934108, "loss": 2.458144187927246, "step": 1500 }, { "epoch": 2.43601130399677, "grad_norm": 0.46995073556900024, "learning_rate": 9.496755040452915e-05, "loss": 2.4886669158935546, "step": 1510 }, { "epoch": 2.4521598708114656, "grad_norm": 0.4773581326007843, "learning_rate": 8.979927823724321e-05, "loss": 2.548818016052246, "step": 1520 }, { "epoch": 2.4683084376261606, "grad_norm": 0.49100548028945923, "learning_rate": 8.476170116792736e-05, "loss": 2.5328250885009767, "step": 1530 }, { "epoch": 2.484457004440856, "grad_norm": 0.5164358615875244, "learning_rate": 7.985642423185718e-05, "loss": 2.4474281311035155, "step": 1540 }, { "epoch": 2.500605571255551, "grad_norm": 0.48450803756713867, "learning_rate": 7.508501031182585e-05, "loss": 2.470208168029785, "step": 1550 }, { "epoch": 2.5167541380702465, "grad_norm": 0.4779358208179474, "learning_rate": 7.044897964018949e-05, "loss": 2.443818283081055, "step": 1560 }, { "epoch": 2.5329027048849415, "grad_norm": 0.42919352650642395, "learning_rate": 6.594980931450223e-05, "loss": 2.4840776443481447, "step": 1570 }, { "epoch": 2.5490512716996365, "grad_norm": 0.4021783769130707, "learning_rate": 6.158893282689454e-05, "loss": 2.450935173034668, "step": 1580 }, { "epoch": 2.565199838514332, "grad_norm": 0.5504807829856873, "learning_rate": 5.7367739607344093e-05, "loss": 2.477644348144531, "step": 1590 }, { "epoch": 2.581348405329027, "grad_norm": 0.3885093331336975, "learning_rate": 5.328757458098665e-05, "loss": 2.449785041809082, "step": 1600 }, { "epoch": 2.5974969721437224, "grad_norm": 0.5313092470169067, "learning_rate": 4.934973773960572e-05, "loss": 2.564461898803711, "step": 1610 }, { "epoch": 2.6136455389584174, "grad_norm": 0.5081238150596619, "learning_rate": 4.5555483727438896e-05, "loss": 2.5144027709960937, "step": 1620 }, { "epoch": 2.6297941057731125, "grad_norm": 0.45351824164390564, "learning_rate": 4.190602144143207e-05, "loss": 2.5690656661987306, "step": 1630 }, { "epoch": 2.645942672587808, "grad_norm": 0.49749699234962463, "learning_rate": 3.840251364607045e-05, "loss": 2.5524566650390623, "step": 1640 }, { "epoch": 2.662091239402503, "grad_norm": 0.4736417233943939, "learning_rate": 3.50460766029066e-05, "loss": 2.437306022644043, "step": 1650 }, { "epoch": 2.6782398062171984, "grad_norm": 0.54044508934021, "learning_rate": 3.183777971490576e-05, "loss": 2.4329130172729494, "step": 1660 }, { "epoch": 2.6943883730318934, "grad_norm": 0.5381774306297302, "learning_rate": 2.8778645185720487e-05, "loss": 2.5384393692016602, "step": 1670 }, { "epoch": 2.7105369398465884, "grad_norm": 0.4624033570289612, "learning_rate": 2.5869647694003962e-05, "loss": 2.5221799850463866, "step": 1680 }, { "epoch": 2.726685506661284, "grad_norm": 0.49552807211875916, "learning_rate": 2.3111714082864887e-05, "loss": 2.519091987609863, "step": 1690 }, { "epoch": 2.742834073475979, "grad_norm": 0.4731680154800415, "learning_rate": 2.0505723064563886e-05, "loss": 2.4723621368408204, "step": 1700 }, { "epoch": 2.7589826402906743, "grad_norm": 0.44819337129592896, "learning_rate": 1.8052504940544613e-05, "loss": 2.5209144592285155, "step": 1710 }, { "epoch": 2.7751312071053693, "grad_norm": 0.4628264307975769, "learning_rate": 1.575284133688909e-05, "loss": 2.502878189086914, "step": 1720 }, { "epoch": 2.7912797739200643, "grad_norm": 0.4762296676635742, "learning_rate": 1.3607464955282257e-05, "loss": 2.4997129440307617, "step": 1730 }, { "epoch": 2.8074283407347598, "grad_norm": 0.4384547173976898, "learning_rate": 1.1617059339563806e-05, "loss": 2.6152523040771483, "step": 1740 }, { "epoch": 2.8235769075494552, "grad_norm": 0.4384756088256836, "learning_rate": 9.782258657942467e-06, "loss": 2.4833805084228517, "step": 1750 }, { "epoch": 2.8397254743641502, "grad_norm": 0.4431445300579071, "learning_rate": 8.103647500942112e-06, "loss": 2.500337028503418, "step": 1760 }, { "epoch": 2.8558740411788452, "grad_norm": 0.4854304790496826, "learning_rate": 6.581760695143934e-06, "loss": 2.4700483322143554, "step": 1770 }, { "epoch": 2.8720226079935407, "grad_norm": 0.4667441248893738, "learning_rate": 5.217083132783907e-06, "loss": 2.4867990493774412, "step": 1780 }, { "epoch": 2.8881711748082357, "grad_norm": 0.49439942836761475, "learning_rate": 4.010049617260203e-06, "loss": 2.515974426269531, "step": 1790 }, { "epoch": 2.904319741622931, "grad_norm": 0.43787845969200134, "learning_rate": 2.961044724599016e-06, "loss": 2.5289798736572267, "step": 1800 }, { "epoch": 2.920468308437626, "grad_norm": 0.4790705144405365, "learning_rate": 2.0704026809241215e-06, "loss": 2.4601634979248046, "step": 1810 }, { "epoch": 2.936616875252321, "grad_norm": 0.4470031261444092, "learning_rate": 1.338407255968288e-06, "loss": 2.4816938400268556, "step": 1820 }, { "epoch": 2.9527654420670166, "grad_norm": 0.4893916845321655, "learning_rate": 7.652916726604287e-07, "loss": 2.529372978210449, "step": 1830 }, { "epoch": 2.9689140088817116, "grad_norm": 0.431090384721756, "learning_rate": 3.5123853281793237e-07, "loss": 2.551029586791992, "step": 1840 }, { "epoch": 2.985062575696407, "grad_norm": 0.4225512742996216, "learning_rate": 9.637975896759077e-08, "loss": 2.5007448196411133, "step": 1850 }, { "epoch": 3.0, "grad_norm": 1.7053178548812866, "learning_rate": 7.965523131092667e-10, "loss": 2.677412414550781, "step": 1860 } ], "logging_steps": 10, "max_steps": 1860, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.131504576233472e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }