| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 1860, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.016148566814695196, |
| "grad_norm": 0.5020289421081543, |
| "learning_rate": 8.999999999999999e-05, |
| "loss": 2.439466857910156, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03229713362939039, |
| "grad_norm": 0.6233699321746826, |
| "learning_rate": 0.00019, |
| "loss": 2.4973506927490234, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04844570044408559, |
| "grad_norm": 0.6460151076316833, |
| "learning_rate": 0.00029, |
| "loss": 2.579629898071289, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06459426725878079, |
| "grad_norm": 0.5584802031517029, |
| "learning_rate": 0.00039000000000000005, |
| "loss": 2.5269956588745117, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.08074283407347597, |
| "grad_norm": 0.5267783403396606, |
| "learning_rate": 0.00049, |
| "loss": 2.5723337173461913, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.09689140088817118, |
| "grad_norm": 0.492374062538147, |
| "learning_rate": 0.00059, |
| "loss": 2.6084869384765623, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.11303996770286637, |
| "grad_norm": 0.533662736415863, |
| "learning_rate": 0.00069, |
| "loss": 2.527654838562012, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.12918853451756157, |
| "grad_norm": 0.568081796169281, |
| "learning_rate": 0.00079, |
| "loss": 2.584039497375488, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.14533710133225677, |
| "grad_norm": 0.5420770049095154, |
| "learning_rate": 0.0008900000000000001, |
| "loss": 2.5998212814331056, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.16148566814695195, |
| "grad_norm": 0.5972040295600891, |
| "learning_rate": 0.00099, |
| "loss": 2.61820011138916, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17763423496164715, |
| "grad_norm": 0.622533917427063, |
| "learning_rate": 0.0009999354806331361, |
| "loss": 2.5819944381713866, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.19378280177634236, |
| "grad_norm": 0.6911935210227966, |
| "learning_rate": 0.0009997124721002689, |
| "loss": 2.5768102645874023, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.20993136859103753, |
| "grad_norm": 0.7095156908035278, |
| "learning_rate": 0.000999330248902402, |
| "loss": 2.6130306243896486, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.22607993540573273, |
| "grad_norm": 0.5791682600975037, |
| "learning_rate": 0.0009987889328206437, |
| "loss": 2.562555503845215, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.24222850222042794, |
| "grad_norm": 0.5518380403518677, |
| "learning_rate": 0.0009980886963250907, |
| "loss": 2.518760108947754, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.25837706903512314, |
| "grad_norm": 0.4897823631763458, |
| "learning_rate": 0.000997229762519879, |
| "loss": 2.6402866363525392, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.2745256358498183, |
| "grad_norm": 0.5050747394561768, |
| "learning_rate": 0.0009962124050720978, |
| "loss": 2.684323310852051, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.29067420266451355, |
| "grad_norm": 0.468423068523407, |
| "learning_rate": 0.0009950369481245985, |
| "loss": 2.6152185440063476, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.3068227694792087, |
| "grad_norm": 0.5091232657432556, |
| "learning_rate": 0.0009937037661927161, |
| "loss": 2.531853675842285, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.3229713362939039, |
| "grad_norm": 0.4922482967376709, |
| "learning_rate": 0.0009922132840449458, |
| "loss": 2.6094560623168945, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.33911990310859913, |
| "grad_norm": 0.5051465034484863, |
| "learning_rate": 0.0009905659765676053, |
| "loss": 2.559980583190918, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.3552684699232943, |
| "grad_norm": 0.4865105450153351, |
| "learning_rate": 0.0009887623686135306, |
| "loss": 2.508647346496582, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.3714170367379895, |
| "grad_norm": 0.5287356376647949, |
| "learning_rate": 0.0009868030348348512, |
| "loss": 2.6150222778320313, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.3875656035526847, |
| "grad_norm": 0.4643533229827881, |
| "learning_rate": 0.0009846885994998983, |
| "loss": 2.6150381088256838, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.4037141703673799, |
| "grad_norm": 0.46244189143180847, |
| "learning_rate": 0.0009824197362943063, |
| "loss": 2.5374935150146483, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.41986273718207506, |
| "grad_norm": 0.480276882648468, |
| "learning_rate": 0.000979997168106366, |
| "loss": 2.5654741287231446, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.4360113039967703, |
| "grad_norm": 0.5552269816398621, |
| "learning_rate": 0.0009774216667967062, |
| "loss": 2.6036794662475584, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.45215987081146547, |
| "grad_norm": 0.511289656162262, |
| "learning_rate": 0.000974694052952366, |
| "loss": 2.610031318664551, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.4683084376261607, |
| "grad_norm": 0.5037236213684082, |
| "learning_rate": 0.000971815195625348, |
| "loss": 2.516169548034668, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.4844570044408559, |
| "grad_norm": 0.5199276804924011, |
| "learning_rate": 0.000968786012055726, |
| "loss": 2.5069480895996095, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.500605571255551, |
| "grad_norm": 0.5875343680381775, |
| "learning_rate": 0.0009656074673794017, |
| "loss": 2.639841651916504, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.5167541380702463, |
| "grad_norm": 0.5489600896835327, |
| "learning_rate": 0.0009622805743205998, |
| "loss": 2.5628652572631836, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.5329027048849415, |
| "grad_norm": 0.4753468334674835, |
| "learning_rate": 0.0009588063928692012, |
| "loss": 2.5956233978271483, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5490512716996366, |
| "grad_norm": 0.5152420997619629, |
| "learning_rate": 0.0009551860299430173, |
| "loss": 2.597700500488281, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5651998385143319, |
| "grad_norm": 0.4520896375179291, |
| "learning_rate": 0.0009514206390351116, |
| "loss": 2.586415481567383, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5813484053290271, |
| "grad_norm": 0.5123590230941772, |
| "learning_rate": 0.0009475114198462837, |
| "loss": 2.555033302307129, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5974969721437222, |
| "grad_norm": 0.6008352637290955, |
| "learning_rate": 0.0009434596179028271, |
| "loss": 2.6199901580810545, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.6136455389584174, |
| "grad_norm": 0.4769132435321808, |
| "learning_rate": 0.0009392665241596914, |
| "loss": 2.5420787811279295, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.6297941057731127, |
| "grad_norm": 0.4474424421787262, |
| "learning_rate": 0.0009349334745891666, |
| "loss": 2.549270820617676, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6459426725878078, |
| "grad_norm": 0.5046530365943909, |
| "learning_rate": 0.0009304618497552253, |
| "loss": 2.540346145629883, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.662091239402503, |
| "grad_norm": 0.5442773699760437, |
| "learning_rate": 0.0009258530743736586, |
| "loss": 2.550925636291504, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.6782398062171983, |
| "grad_norm": 0.45155641436576843, |
| "learning_rate": 0.0009211086168581433, |
| "loss": 2.5896928787231444, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6943883730318934, |
| "grad_norm": 0.48448678851127625, |
| "learning_rate": 0.0009162299888523867, |
| "loss": 2.568522834777832, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.7105369398465886, |
| "grad_norm": 0.4634808897972107, |
| "learning_rate": 0.0009112187447484979, |
| "loss": 2.543706512451172, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.7266855066612838, |
| "grad_norm": 0.520962655544281, |
| "learning_rate": 0.0009060764811917397, |
| "loss": 2.4791580200195313, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.742834073475979, |
| "grad_norm": 0.495394766330719, |
| "learning_rate": 0.0009008048365718167, |
| "loss": 2.6086082458496094, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.7589826402906742, |
| "grad_norm": 0.4325544834136963, |
| "learning_rate": 0.0008954054905008639, |
| "loss": 2.5405605316162108, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.7751312071053694, |
| "grad_norm": 0.4992341697216034, |
| "learning_rate": 0.0008898801632783013, |
| "loss": 2.6021982192993165, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.7912797739200645, |
| "grad_norm": 0.5032821893692017, |
| "learning_rate": 0.0008842306153427246, |
| "loss": 2.5671119689941406, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.8074283407347598, |
| "grad_norm": 0.44175952672958374, |
| "learning_rate": 0.000878458646711008, |
| "loss": 2.5145410537719726, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.823576907549455, |
| "grad_norm": 0.48530301451683044, |
| "learning_rate": 0.0008725660964047959, |
| "loss": 2.4978832244873046, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.8397254743641501, |
| "grad_norm": 0.4604915976524353, |
| "learning_rate": 0.0008665548418645672, |
| "loss": 2.5596445083618162, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.8558740411788454, |
| "grad_norm": 0.45781826972961426, |
| "learning_rate": 0.0008604267983514594, |
| "loss": 2.6085268020629884, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.8720226079935406, |
| "grad_norm": 0.48819592595100403, |
| "learning_rate": 0.000854183918337043, |
| "loss": 2.5747554779052733, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.8881711748082358, |
| "grad_norm": 0.4929693043231964, |
| "learning_rate": 0.0008478281908812387, |
| "loss": 2.543058395385742, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.9043197416229309, |
| "grad_norm": 0.4913038909435272, |
| "learning_rate": 0.0008413616409985779, |
| "loss": 2.5399295806884767, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.9204683084376262, |
| "grad_norm": 0.47384563088417053, |
| "learning_rate": 0.0008347863290130087, |
| "loss": 2.5927974700927736, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.9366168752523214, |
| "grad_norm": 0.4775764048099518, |
| "learning_rate": 0.0008281043499014498, |
| "loss": 2.5593168258666994, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.9527654420670165, |
| "grad_norm": 0.5058761239051819, |
| "learning_rate": 0.0008213178326263049, |
| "loss": 2.552435111999512, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.9689140088817118, |
| "grad_norm": 0.4922596216201782, |
| "learning_rate": 0.0008144289394571484, |
| "loss": 2.5472679138183594, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.985062575696407, |
| "grad_norm": 0.49022358655929565, |
| "learning_rate": 0.0008074398652817998, |
| "loss": 2.5109460830688475, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.6876914501190186, |
| "learning_rate": 0.0008003528369070043, |
| "loss": 2.465944290161133, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.0161485668146952, |
| "grad_norm": 0.4119199812412262, |
| "learning_rate": 0.0007931701123489439, |
| "loss": 2.520769500732422, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.0322971336293905, |
| "grad_norm": 0.4147922992706299, |
| "learning_rate": 0.000785893980113806, |
| "loss": 2.5548782348632812, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.0484457004440857, |
| "grad_norm": 0.5313045382499695, |
| "learning_rate": 0.0007785267584686366, |
| "loss": 2.6331764221191407, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.0645942672587807, |
| "grad_norm": 0.45284605026245117, |
| "learning_rate": 0.00077107079470271, |
| "loss": 2.519462013244629, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.080742834073476, |
| "grad_norm": 0.5042719841003418, |
| "learning_rate": 0.0007635284643796545, |
| "loss": 2.4921422958374024, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.0968914008881712, |
| "grad_norm": 0.4403098225593567, |
| "learning_rate": 0.0007559021705805671, |
| "loss": 2.454839897155762, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.1130399677028664, |
| "grad_norm": 0.4963165819644928, |
| "learning_rate": 0.0007481943431383622, |
| "loss": 2.5821470260620116, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.1291885345175616, |
| "grad_norm": 0.4509197175502777, |
| "learning_rate": 0.000740407437863596, |
| "loss": 2.4973094940185545, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.1453371013322569, |
| "grad_norm": 0.49439796805381775, |
| "learning_rate": 0.0007325439357620147, |
| "loss": 2.6476114273071287, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.1614856681469519, |
| "grad_norm": 0.5243302583694458, |
| "learning_rate": 0.0007246063422440747, |
| "loss": 2.4758913040161135, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.177634234961647, |
| "grad_norm": 0.6043158769607544, |
| "learning_rate": 0.0007165971863266878, |
| "loss": 2.5820987701416014, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.1937828017763423, |
| "grad_norm": 0.44207850098609924, |
| "learning_rate": 0.0007085190198274438, |
| "loss": 2.4599235534667967, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.2099313685910376, |
| "grad_norm": 0.4254566431045532, |
| "learning_rate": 0.0007003744165515704, |
| "loss": 2.4942739486694334, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.2260799354057328, |
| "grad_norm": 0.41949278116226196, |
| "learning_rate": 0.0006921659714718863, |
| "loss": 2.505445098876953, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.242228502220428, |
| "grad_norm": 0.4123310148715973, |
| "learning_rate": 0.0006838962999020094, |
| "loss": 2.5693735122680663, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.2583770690351233, |
| "grad_norm": 0.4186009168624878, |
| "learning_rate": 0.0006755680366630865, |
| "loss": 2.4493398666381836, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.2745256358498183, |
| "grad_norm": 0.444654643535614, |
| "learning_rate": 0.0006671838352443049, |
| "loss": 2.5728691101074217, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.2906742026645135, |
| "grad_norm": 0.40202823281288147, |
| "learning_rate": 0.0006587463669574584, |
| "loss": 2.526685333251953, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.3068227694792087, |
| "grad_norm": 0.4873361885547638, |
| "learning_rate": 0.0006502583200858335, |
| "loss": 2.5635454177856447, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.322971336293904, |
| "grad_norm": 0.5394927859306335, |
| "learning_rate": 0.0006417223990276883, |
| "loss": 2.6018707275390627, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.3391199031085992, |
| "grad_norm": 0.5254472494125366, |
| "learning_rate": 0.0006331413234345977, |
| "loss": 2.5202842712402345, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.3552684699232942, |
| "grad_norm": 0.4611901342868805, |
| "learning_rate": 0.0006245178273449383, |
| "loss": 2.527310371398926, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.3714170367379894, |
| "grad_norm": 0.46280530095100403, |
| "learning_rate": 0.0006158546583127886, |
| "loss": 2.5010074615478515, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.3875656035526847, |
| "grad_norm": 0.47044530510902405, |
| "learning_rate": 0.0006071545765325253, |
| "loss": 2.658343505859375, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.4037141703673799, |
| "grad_norm": 0.5649057030677795, |
| "learning_rate": 0.0005984203539593897, |
| "loss": 2.514650344848633, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.4198627371820751, |
| "grad_norm": 0.5100232362747192, |
| "learning_rate": 0.0005896547734263077, |
| "loss": 2.4994720458984374, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.4360113039967703, |
| "grad_norm": 0.4990105926990509, |
| "learning_rate": 0.0005808606277572453, |
| "loss": 2.489163398742676, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.4521598708114656, |
| "grad_norm": 0.473297655582428, |
| "learning_rate": 0.0005720407188773791, |
| "loss": 2.534769630432129, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.4683084376261606, |
| "grad_norm": 0.5112643837928772, |
| "learning_rate": 0.000563197856920368, |
| "loss": 2.5247997283935546, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.4844570044408558, |
| "grad_norm": 0.4357326924800873, |
| "learning_rate": 0.0005543348593330093, |
| "loss": 2.508163642883301, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.500605571255551, |
| "grad_norm": 0.47014695405960083, |
| "learning_rate": 0.0005454545499775651, |
| "loss": 2.6127824783325195, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.5167541380702463, |
| "grad_norm": 0.4659437835216522, |
| "learning_rate": 0.0005365597582320436, |
| "loss": 2.4793100357055664, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.5329027048849415, |
| "grad_norm": 0.5408269166946411, |
| "learning_rate": 0.0005276533180887248, |
| "loss": 2.45506591796875, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.5490512716996365, |
| "grad_norm": 0.5870039463043213, |
| "learning_rate": 0.000518738067251214, |
| "loss": 2.480586814880371, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.565199838514332, |
| "grad_norm": 0.48131951689720154, |
| "learning_rate": 0.0005098168462303141, |
| "loss": 2.589830207824707, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.581348405329027, |
| "grad_norm": 0.4618188440799713, |
| "learning_rate": 0.0005008924974390041, |
| "loss": 2.5488056182861327, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.5974969721437222, |
| "grad_norm": 0.49082285165786743, |
| "learning_rate": 0.0004919678642868092, |
| "loss": 2.4781982421875, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.6136455389584174, |
| "grad_norm": 0.47744420170783997, |
| "learning_rate": 0.0004830457902738558, |
| "loss": 2.517325210571289, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.6297941057731127, |
| "grad_norm": 0.507945716381073, |
| "learning_rate": 0.0004741291180848961, |
| "loss": 2.5076452255249024, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.645942672587808, |
| "grad_norm": 0.42816001176834106, |
| "learning_rate": 0.000465220688683594, |
| "loss": 2.671817398071289, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.662091239402503, |
| "grad_norm": 0.4258963167667389, |
| "learning_rate": 0.00045632334040735764, |
| "loss": 2.533784103393555, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.6782398062171984, |
| "grad_norm": 0.494028240442276, |
| "learning_rate": 0.00044743990806300917, |
| "loss": 2.514291000366211, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.6943883730318934, |
| "grad_norm": 0.4230322539806366, |
| "learning_rate": 0.00043857322202358066, |
| "loss": 2.5531696319580077, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.7105369398465886, |
| "grad_norm": 0.5738111734390259, |
| "learning_rate": 0.00042972610732652105, |
| "loss": 2.5059112548828124, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.7266855066612838, |
| "grad_norm": 0.5092839002609253, |
| "learning_rate": 0.0004209013827736042, |
| "loss": 2.5219795227050783, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.7428340734759789, |
| "grad_norm": 0.4812857508659363, |
| "learning_rate": 0.00041210186003282274, |
| "loss": 2.5235408782958983, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.7589826402906743, |
| "grad_norm": 0.47096627950668335, |
| "learning_rate": 0.000403330342742556, |
| "loss": 2.574551582336426, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.7751312071053693, |
| "grad_norm": 0.4319113790988922, |
| "learning_rate": 0.0003945896256182949, |
| "loss": 2.610904502868652, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.7912797739200645, |
| "grad_norm": 0.44047966599464417, |
| "learning_rate": 0.0003858824935622115, |
| "loss": 2.5323257446289062, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.8074283407347598, |
| "grad_norm": 0.548047661781311, |
| "learning_rate": 0.00037721172077585287, |
| "loss": 2.5165468215942384, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.823576907549455, |
| "grad_norm": 0.46526339650154114, |
| "learning_rate": 0.00036858006987624723, |
| "loss": 2.502303886413574, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.8397254743641502, |
| "grad_norm": 0.4635223150253296, |
| "learning_rate": 0.0003599902910156984, |
| "loss": 2.5442089080810546, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.8558740411788452, |
| "grad_norm": 0.5317935347557068, |
| "learning_rate": 0.0003514451210055527, |
| "loss": 2.600077247619629, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.8720226079935407, |
| "grad_norm": 0.5463606119155884, |
| "learning_rate": 0.00034294728244421756, |
| "loss": 2.574476623535156, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.8881711748082357, |
| "grad_norm": 0.4650241732597351, |
| "learning_rate": 0.00033449948284970617, |
| "loss": 2.500654411315918, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.904319741622931, |
| "grad_norm": 0.40839987993240356, |
| "learning_rate": 0.00032610441379698937, |
| "loss": 2.4712839126586914, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.9204683084376262, |
| "grad_norm": 0.49943020939826965, |
| "learning_rate": 0.0003177647500604252, |
| "loss": 2.5296091079711913, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.9366168752523214, |
| "grad_norm": 0.4655015468597412, |
| "learning_rate": 0.00030948314876154306, |
| "loss": 2.5075130462646484, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.9527654420670166, |
| "grad_norm": 0.45307499170303345, |
| "learning_rate": 0.00030126224852245056, |
| "loss": 2.464124298095703, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.9689140088817116, |
| "grad_norm": 0.5530602931976318, |
| "learning_rate": 0.0002931046686251365, |
| "loss": 2.553061866760254, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.985062575696407, |
| "grad_norm": 0.46495500206947327, |
| "learning_rate": 0.0002850130081769334, |
| "loss": 2.486197853088379, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.9545954465866089, |
| "learning_rate": 0.00027698984528241036, |
| "loss": 2.4371877670288087, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.016148566814695, |
| "grad_norm": 0.48022302985191345, |
| "learning_rate": 0.00026903773622195636, |
| "loss": 2.512773895263672, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.0322971336293905, |
| "grad_norm": 0.433242529630661, |
| "learning_rate": 0.00026115921463731694, |
| "loss": 2.5300994873046876, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.0484457004440855, |
| "grad_norm": 0.47226834297180176, |
| "learning_rate": 0.0002533567907243446, |
| "loss": 2.421502113342285, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.064594267258781, |
| "grad_norm": 0.5992064476013184, |
| "learning_rate": 0.00024563295043321783, |
| "loss": 2.5453359603881838, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.080742834073476, |
| "grad_norm": 0.4261581599712372, |
| "learning_rate": 0.0002379901546763879, |
| "loss": 2.495037841796875, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.0968914008881714, |
| "grad_norm": 0.4328082203865051, |
| "learning_rate": 0.00023043083854449987, |
| "loss": 2.464985466003418, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.1130399677028664, |
| "grad_norm": 0.4510248303413391, |
| "learning_rate": 0.00022295741053054296, |
| "loss": 2.5308864593505858, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.1291885345175614, |
| "grad_norm": 0.49293237924575806, |
| "learning_rate": 0.00021557225176247353, |
| "loss": 2.5278299331665037, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.145337101332257, |
| "grad_norm": 0.4132377505302429, |
| "learning_rate": 0.0002082777152445589, |
| "loss": 2.398031234741211, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.161485668146952, |
| "grad_norm": 0.5191354751586914, |
| "learning_rate": 0.00020107612510768014, |
| "loss": 2.5248828887939454, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.1776342349616473, |
| "grad_norm": 0.5512005686759949, |
| "learning_rate": 0.00019396977586883475, |
| "loss": 2.4451154708862304, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.1937828017763423, |
| "grad_norm": 0.42351678013801575, |
| "learning_rate": 0.00018696093170007493, |
| "loss": 2.573942756652832, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.2099313685910373, |
| "grad_norm": 0.45575806498527527, |
| "learning_rate": 0.00018005182570711366, |
| "loss": 2.5537620544433595, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.226079935405733, |
| "grad_norm": 0.46785682439804077, |
| "learning_rate": 0.0001732446592178295, |
| "loss": 2.491817092895508, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.242228502220428, |
| "grad_norm": 0.45513054728507996, |
| "learning_rate": 0.00016654160108089594, |
| "loss": 2.5171764373779295, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.2583770690351233, |
| "grad_norm": 0.43288710713386536, |
| "learning_rate": 0.00015994478697475885, |
| "loss": 2.5143251419067383, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.2745256358498183, |
| "grad_norm": 0.4459301233291626, |
| "learning_rate": 0.00015345631872718213, |
| "loss": 2.5065849304199217, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.2906742026645137, |
| "grad_norm": 0.501880407333374, |
| "learning_rate": 0.00014707826364557985, |
| "loss": 2.4833837509155274, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.3068227694792087, |
| "grad_norm": 0.4559042453765869, |
| "learning_rate": 0.00014081265385834557, |
| "loss": 2.5120367050170898, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.3229713362939037, |
| "grad_norm": 0.48048946261405945, |
| "learning_rate": 0.000134661485667391, |
| "loss": 2.4629817962646485, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.339119903108599, |
| "grad_norm": 0.46413764357566833, |
| "learning_rate": 0.0001286267189120986, |
| "loss": 2.4572961807250975, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.355268469923294, |
| "grad_norm": 0.4971129298210144, |
| "learning_rate": 0.000122710276344893, |
| "loss": 2.5448049545288085, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.3714170367379896, |
| "grad_norm": 0.45018401741981506, |
| "learning_rate": 0.00011691404301862746, |
| "loss": 2.551463317871094, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.3875656035526847, |
| "grad_norm": 0.49596303701400757, |
| "learning_rate": 0.00011123986568598249, |
| "loss": 2.5440711975097656, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.4037141703673797, |
| "grad_norm": 0.47923141717910767, |
| "learning_rate": 0.00010568955221106713, |
| "loss": 2.45603084564209, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.419862737182075, |
| "grad_norm": 0.4507387578487396, |
| "learning_rate": 0.0001002648709934108, |
| "loss": 2.458144187927246, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.43601130399677, |
| "grad_norm": 0.46995073556900024, |
| "learning_rate": 9.496755040452915e-05, |
| "loss": 2.4886669158935546, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.4521598708114656, |
| "grad_norm": 0.4773581326007843, |
| "learning_rate": 8.979927823724321e-05, |
| "loss": 2.548818016052246, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.4683084376261606, |
| "grad_norm": 0.49100548028945923, |
| "learning_rate": 8.476170116792736e-05, |
| "loss": 2.5328250885009767, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.484457004440856, |
| "grad_norm": 0.5164358615875244, |
| "learning_rate": 7.985642423185718e-05, |
| "loss": 2.4474281311035155, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.500605571255551, |
| "grad_norm": 0.48450803756713867, |
| "learning_rate": 7.508501031182585e-05, |
| "loss": 2.470208168029785, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.5167541380702465, |
| "grad_norm": 0.4779358208179474, |
| "learning_rate": 7.044897964018949e-05, |
| "loss": 2.443818283081055, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.5329027048849415, |
| "grad_norm": 0.42919352650642395, |
| "learning_rate": 6.594980931450223e-05, |
| "loss": 2.4840776443481447, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.5490512716996365, |
| "grad_norm": 0.4021783769130707, |
| "learning_rate": 6.158893282689454e-05, |
| "loss": 2.450935173034668, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.565199838514332, |
| "grad_norm": 0.5504807829856873, |
| "learning_rate": 5.7367739607344093e-05, |
| "loss": 2.477644348144531, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.581348405329027, |
| "grad_norm": 0.3885093331336975, |
| "learning_rate": 5.328757458098665e-05, |
| "loss": 2.449785041809082, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.5974969721437224, |
| "grad_norm": 0.5313092470169067, |
| "learning_rate": 4.934973773960572e-05, |
| "loss": 2.564461898803711, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.6136455389584174, |
| "grad_norm": 0.5081238150596619, |
| "learning_rate": 4.5555483727438896e-05, |
| "loss": 2.5144027709960937, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.6297941057731125, |
| "grad_norm": 0.45351824164390564, |
| "learning_rate": 4.190602144143207e-05, |
| "loss": 2.5690656661987306, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.645942672587808, |
| "grad_norm": 0.49749699234962463, |
| "learning_rate": 3.840251364607045e-05, |
| "loss": 2.5524566650390623, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.662091239402503, |
| "grad_norm": 0.4736417233943939, |
| "learning_rate": 3.50460766029066e-05, |
| "loss": 2.437306022644043, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.6782398062171984, |
| "grad_norm": 0.54044508934021, |
| "learning_rate": 3.183777971490576e-05, |
| "loss": 2.4329130172729494, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.6943883730318934, |
| "grad_norm": 0.5381774306297302, |
| "learning_rate": 2.8778645185720487e-05, |
| "loss": 2.5384393692016602, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.7105369398465884, |
| "grad_norm": 0.4624033570289612, |
| "learning_rate": 2.5869647694003962e-05, |
| "loss": 2.5221799850463866, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.726685506661284, |
| "grad_norm": 0.49552807211875916, |
| "learning_rate": 2.3111714082864887e-05, |
| "loss": 2.519091987609863, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.742834073475979, |
| "grad_norm": 0.4731680154800415, |
| "learning_rate": 2.0505723064563886e-05, |
| "loss": 2.4723621368408204, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.7589826402906743, |
| "grad_norm": 0.44819337129592896, |
| "learning_rate": 1.8052504940544613e-05, |
| "loss": 2.5209144592285155, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.7751312071053693, |
| "grad_norm": 0.4628264307975769, |
| "learning_rate": 1.575284133688909e-05, |
| "loss": 2.502878189086914, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.7912797739200643, |
| "grad_norm": 0.4762296676635742, |
| "learning_rate": 1.3607464955282257e-05, |
| "loss": 2.4997129440307617, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.8074283407347598, |
| "grad_norm": 0.4384547173976898, |
| "learning_rate": 1.1617059339563806e-05, |
| "loss": 2.6152523040771483, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.8235769075494552, |
| "grad_norm": 0.4384756088256836, |
| "learning_rate": 9.782258657942467e-06, |
| "loss": 2.4833805084228517, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.8397254743641502, |
| "grad_norm": 0.4431445300579071, |
| "learning_rate": 8.103647500942112e-06, |
| "loss": 2.500337028503418, |
| "step": 1760 |
| }, |
| { |
| "epoch": 2.8558740411788452, |
| "grad_norm": 0.4854304790496826, |
| "learning_rate": 6.581760695143934e-06, |
| "loss": 2.4700483322143554, |
| "step": 1770 |
| }, |
| { |
| "epoch": 2.8720226079935407, |
| "grad_norm": 0.4667441248893738, |
| "learning_rate": 5.217083132783907e-06, |
| "loss": 2.4867990493774412, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.8881711748082357, |
| "grad_norm": 0.49439942836761475, |
| "learning_rate": 4.010049617260203e-06, |
| "loss": 2.515974426269531, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.904319741622931, |
| "grad_norm": 0.43787845969200134, |
| "learning_rate": 2.961044724599016e-06, |
| "loss": 2.5289798736572267, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.920468308437626, |
| "grad_norm": 0.4790705144405365, |
| "learning_rate": 2.0704026809241215e-06, |
| "loss": 2.4601634979248046, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.936616875252321, |
| "grad_norm": 0.4470031261444092, |
| "learning_rate": 1.338407255968288e-06, |
| "loss": 2.4816938400268556, |
| "step": 1820 |
| }, |
| { |
| "epoch": 2.9527654420670166, |
| "grad_norm": 0.4893916845321655, |
| "learning_rate": 7.652916726604287e-07, |
| "loss": 2.529372978210449, |
| "step": 1830 |
| }, |
| { |
| "epoch": 2.9689140088817116, |
| "grad_norm": 0.431090384721756, |
| "learning_rate": 3.5123853281793237e-07, |
| "loss": 2.551029586791992, |
| "step": 1840 |
| }, |
| { |
| "epoch": 2.985062575696407, |
| "grad_norm": 0.4225512742996216, |
| "learning_rate": 9.637975896759077e-08, |
| "loss": 2.5007448196411133, |
| "step": 1850 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 1.7053178548812866, |
| "learning_rate": 7.965523131092667e-10, |
| "loss": 2.677412414550781, |
| "step": 1860 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1860, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.131504576233472e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|