{ "best_global_step": 700, "best_metric": 0.44259119033813477, "best_model_checkpoint": "/data/taoyong/LabOS/QWEN-36/checkpoints/qwen3.6-35b-a3b-lora-lf/checkpoint-700", "epoch": 10.0, "eval_steps": 100, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 1.3030611276626587, "learning_rate": 3.6e-06, "loss": 1.1145790100097657, "step": 10 }, { "epoch": 0.08, "grad_norm": 1.540786623954773, "learning_rate": 7.6e-06, "loss": 1.2167404174804688, "step": 20 }, { "epoch": 0.12, "grad_norm": 1.0591915845870972, "learning_rate": 1.16e-05, "loss": 1.0437713623046876, "step": 30 }, { "epoch": 0.16, "grad_norm": 0.6695119142532349, "learning_rate": 1.56e-05, "loss": 0.9282869338989258, "step": 40 }, { "epoch": 0.2, "grad_norm": 0.7912387847900391, "learning_rate": 1.9600000000000002e-05, "loss": 0.8799624443054199, "step": 50 }, { "epoch": 0.24, "grad_norm": 0.7810359001159668, "learning_rate": 2.36e-05, "loss": 0.7062759399414062, "step": 60 }, { "epoch": 0.28, "grad_norm": 0.7185921669006348, "learning_rate": 2.7600000000000003e-05, "loss": 0.7228042602539062, "step": 70 }, { "epoch": 0.32, "grad_norm": 0.7974339723587036, "learning_rate": 3.16e-05, "loss": 0.6257906913757324, "step": 80 }, { "epoch": 0.36, "grad_norm": 0.7850703597068787, "learning_rate": 3.56e-05, "loss": 0.5399329185485839, "step": 90 }, { "epoch": 0.4, "grad_norm": 0.7295215129852295, "learning_rate": 3.960000000000001e-05, "loss": 0.5184461116790772, "step": 100 }, { "epoch": 0.4, "eval_loss": 0.5476460456848145, "eval_runtime": 21.5181, "eval_samples_per_second": 18.589, "eval_steps_per_second": 3.114, "step": 100 }, { "epoch": 0.44, "grad_norm": 1.0682953596115112, "learning_rate": 4.36e-05, "loss": 0.5210700988769531, "step": 110 }, { "epoch": 0.48, "grad_norm": 0.9108087420463562, "learning_rate": 4.76e-05, "loss": 0.5155693531036377, "step": 120 }, { "epoch": 0.52, "grad_norm": 1.0037930011749268, "learning_rate": 5.16e-05, "loss": 0.45534143447875974, "step": 130 }, { "epoch": 0.56, "grad_norm": 0.9430785179138184, "learning_rate": 5.560000000000001e-05, "loss": 0.45524797439575193, "step": 140 }, { "epoch": 0.6, "grad_norm": 0.9689427614212036, "learning_rate": 5.96e-05, "loss": 0.47152209281921387, "step": 150 }, { "epoch": 0.64, "grad_norm": 0.7584393620491028, "learning_rate": 6.36e-05, "loss": 0.4532940864562988, "step": 160 }, { "epoch": 0.68, "grad_norm": 0.7581620216369629, "learning_rate": 6.76e-05, "loss": 0.48988704681396483, "step": 170 }, { "epoch": 0.72, "grad_norm": 0.9882776141166687, "learning_rate": 7.16e-05, "loss": 0.46865572929382326, "step": 180 }, { "epoch": 0.76, "grad_norm": 0.743236780166626, "learning_rate": 7.560000000000001e-05, "loss": 0.45577139854431153, "step": 190 }, { "epoch": 0.8, "grad_norm": 0.6103836894035339, "learning_rate": 7.960000000000001e-05, "loss": 0.4559042453765869, "step": 200 }, { "epoch": 0.8, "eval_loss": 0.485470175743103, "eval_runtime": 17.4199, "eval_samples_per_second": 22.962, "eval_steps_per_second": 3.846, "step": 200 }, { "epoch": 0.84, "grad_norm": 0.8245580792427063, "learning_rate": 8.36e-05, "loss": 0.45926451683044434, "step": 210 }, { "epoch": 0.88, "grad_norm": 0.6920369267463684, "learning_rate": 8.76e-05, "loss": 0.4545453548431396, "step": 220 }, { "epoch": 0.92, "grad_norm": 0.6936920881271362, "learning_rate": 9.16e-05, "loss": 0.47637343406677246, "step": 230 }, { "epoch": 0.96, "grad_norm": 0.6694210767745972, "learning_rate": 9.56e-05, "loss": 0.43120541572570803, "step": 240 }, { "epoch": 1.0, "grad_norm": 0.583095133304596, "learning_rate": 9.960000000000001e-05, "loss": 0.4153712272644043, "step": 250 }, { "epoch": 1.04, "grad_norm": 0.6926116943359375, "learning_rate": 9.999605221019081e-05, "loss": 0.44300012588500975, "step": 260 }, { "epoch": 1.08, "grad_norm": 0.761324405670166, "learning_rate": 9.998240632972073e-05, "loss": 0.462084436416626, "step": 270 }, { "epoch": 1.12, "grad_norm": 0.5191273093223572, "learning_rate": 9.995901628010196e-05, "loss": 0.39808471202850343, "step": 280 }, { "epoch": 1.16, "grad_norm": 0.8463711738586426, "learning_rate": 9.9925886621271e-05, "loss": 0.423044490814209, "step": 290 }, { "epoch": 1.2, "grad_norm": 0.8373249769210815, "learning_rate": 9.98830238119205e-05, "loss": 0.41622562408447267, "step": 300 }, { "epoch": 1.2, "eval_loss": 0.4695434272289276, "eval_runtime": 19.2419, "eval_samples_per_second": 20.788, "eval_steps_per_second": 3.482, "step": 300 }, { "epoch": 1.24, "grad_norm": 0.6290304064750671, "learning_rate": 9.983043620824005e-05, "loss": 0.4166346549987793, "step": 310 }, { "epoch": 1.28, "grad_norm": 0.6189863681793213, "learning_rate": 9.97681340622872e-05, "loss": 0.43734130859375, "step": 320 }, { "epoch": 1.32, "grad_norm": 0.5579029321670532, "learning_rate": 9.969612951998874e-05, "loss": 0.3747305631637573, "step": 330 }, { "epoch": 1.3599999999999999, "grad_norm": 1.1675549745559692, "learning_rate": 9.961443661877289e-05, "loss": 0.42578792572021484, "step": 340 }, { "epoch": 1.4, "grad_norm": 0.6578675508499146, "learning_rate": 9.952307128483256e-05, "loss": 0.39537777900695803, "step": 350 }, { "epoch": 1.44, "grad_norm": 0.8092941045761108, "learning_rate": 9.942205133002068e-05, "loss": 0.4084367275238037, "step": 360 }, { "epoch": 1.48, "grad_norm": 0.6226063370704651, "learning_rate": 9.931139644837754e-05, "loss": 0.3781426906585693, "step": 370 }, { "epoch": 1.52, "grad_norm": 0.7148721218109131, "learning_rate": 9.919112821229163e-05, "loss": 0.3952002048492432, "step": 380 }, { "epoch": 1.56, "grad_norm": 0.5743547081947327, "learning_rate": 9.906127006829384e-05, "loss": 0.4087832927703857, "step": 390 }, { "epoch": 1.6, "grad_norm": 0.6315461993217468, "learning_rate": 9.892184733248666e-05, "loss": 0.3861570119857788, "step": 400 }, { "epoch": 1.6, "eval_loss": 0.45406103134155273, "eval_runtime": 19.7154, "eval_samples_per_second": 20.289, "eval_steps_per_second": 3.398, "step": 400 }, { "epoch": 1.6400000000000001, "grad_norm": 0.6243694424629211, "learning_rate": 9.877288718560866e-05, "loss": 0.39033331871032717, "step": 410 }, { "epoch": 1.6800000000000002, "grad_norm": 0.6677294969558716, "learning_rate": 9.861441866773564e-05, "loss": 0.43663845062255857, "step": 420 }, { "epoch": 1.72, "grad_norm": 0.6460554599761963, "learning_rate": 9.844647267261916e-05, "loss": 0.43364706039428713, "step": 430 }, { "epoch": 1.76, "grad_norm": 0.570160984992981, "learning_rate": 9.82690819416637e-05, "loss": 0.409498929977417, "step": 440 }, { "epoch": 1.8, "grad_norm": 0.5696760416030884, "learning_rate": 9.808228105754376e-05, "loss": 0.4264820098876953, "step": 450 }, { "epoch": 1.8399999999999999, "grad_norm": 0.583260715007782, "learning_rate": 9.788610643746184e-05, "loss": 0.417040491104126, "step": 460 }, { "epoch": 1.88, "grad_norm": 0.6025984287261963, "learning_rate": 9.76805963260488e-05, "loss": 0.3749807357788086, "step": 470 }, { "epoch": 1.92, "grad_norm": 0.5953373312950134, "learning_rate": 9.746579078790807e-05, "loss": 0.4022481918334961, "step": 480 }, { "epoch": 1.96, "grad_norm": 0.4357820153236389, "learning_rate": 9.724173169980491e-05, "loss": 0.38319835662841795, "step": 490 }, { "epoch": 2.0, "grad_norm": 0.5152677297592163, "learning_rate": 9.700846274250251e-05, "loss": 0.4122174263000488, "step": 500 }, { "epoch": 2.0, "eval_loss": 0.44415727257728577, "eval_runtime": 18.9015, "eval_samples_per_second": 21.162, "eval_steps_per_second": 3.545, "step": 500 }, { "epoch": 2.04, "grad_norm": 0.38848409056663513, "learning_rate": 9.676602939224629e-05, "loss": 0.3524669408798218, "step": 510 }, { "epoch": 2.08, "grad_norm": 0.5285012125968933, "learning_rate": 9.651447891189825e-05, "loss": 0.3717231273651123, "step": 520 }, { "epoch": 2.12, "grad_norm": 0.6452465653419495, "learning_rate": 9.62538603417229e-05, "loss": 0.40065832138061525, "step": 530 }, { "epoch": 2.16, "grad_norm": 0.48196467757225037, "learning_rate": 9.598422448982696e-05, "loss": 0.33635973930358887, "step": 540 }, { "epoch": 2.2, "grad_norm": 0.563376247882843, "learning_rate": 9.570562392225396e-05, "loss": 0.3708656787872314, "step": 550 }, { "epoch": 2.24, "grad_norm": 0.6459429860115051, "learning_rate": 9.541811295273656e-05, "loss": 0.35284056663513186, "step": 560 }, { "epoch": 2.2800000000000002, "grad_norm": 0.5247339606285095, "learning_rate": 9.512174763210797e-05, "loss": 0.3429510831832886, "step": 570 }, { "epoch": 2.32, "grad_norm": 0.5456256866455078, "learning_rate": 9.481658573737465e-05, "loss": 0.36770102977752683, "step": 580 }, { "epoch": 2.36, "grad_norm": 0.5435087084770203, "learning_rate": 9.450268676045262e-05, "loss": 0.3684037208557129, "step": 590 }, { "epoch": 2.4, "grad_norm": 0.5584478974342346, "learning_rate": 9.418011189656941e-05, "loss": 0.3221792697906494, "step": 600 }, { "epoch": 2.4, "eval_loss": 0.44748273491859436, "eval_runtime": 18.8521, "eval_samples_per_second": 21.218, "eval_steps_per_second": 3.554, "step": 600 }, { "epoch": 2.44, "grad_norm": 0.7217129468917847, "learning_rate": 9.384892403233384e-05, "loss": 0.40174164772033694, "step": 610 }, { "epoch": 2.48, "grad_norm": 0.5068971514701843, "learning_rate": 9.35091877334763e-05, "loss": 0.3701002836227417, "step": 620 }, { "epoch": 2.52, "grad_norm": 0.4331487715244293, "learning_rate": 9.316096923226135e-05, "loss": 0.3759175777435303, "step": 630 }, { "epoch": 2.56, "grad_norm": 0.5161293148994446, "learning_rate": 9.28043364145758e-05, "loss": 0.3581662178039551, "step": 640 }, { "epoch": 2.6, "grad_norm": 0.709299623966217, "learning_rate": 9.24393588066941e-05, "loss": 0.35065665245056155, "step": 650 }, { "epoch": 2.64, "grad_norm": 0.6004891991615295, "learning_rate": 9.206610756172402e-05, "loss": 0.36879355907440187, "step": 660 }, { "epoch": 2.68, "grad_norm": 0.4662474989891052, "learning_rate": 9.168465544573536e-05, "loss": 0.3592060565948486, "step": 670 }, { "epoch": 2.7199999999999998, "grad_norm": 0.5826489329338074, "learning_rate": 9.129507682357394e-05, "loss": 0.36156315803527833, "step": 680 }, { "epoch": 2.76, "grad_norm": 0.48988744616508484, "learning_rate": 9.089744764436403e-05, "loss": 0.34445748329162595, "step": 690 }, { "epoch": 2.8, "grad_norm": 0.4443361163139343, "learning_rate": 9.049184542670199e-05, "loss": 0.3526463985443115, "step": 700 }, { "epoch": 2.8, "eval_loss": 0.44259119033813477, "eval_runtime": 16.8228, "eval_samples_per_second": 23.777, "eval_steps_per_second": 3.983, "step": 700 }, { "epoch": 2.84, "grad_norm": 0.5471161007881165, "learning_rate": 9.007834924354383e-05, "loss": 0.3458081245422363, "step": 710 }, { "epoch": 2.88, "grad_norm": 0.5264748930931091, "learning_rate": 8.965703970678974e-05, "loss": 0.3651163101196289, "step": 720 }, { "epoch": 2.92, "grad_norm": 0.48987507820129395, "learning_rate": 8.922799895156867e-05, "loss": 0.3218229293823242, "step": 730 }, { "epoch": 2.96, "grad_norm": 0.5640589594841003, "learning_rate": 8.879131062022598e-05, "loss": 0.3561582088470459, "step": 740 }, { "epoch": 3.0, "grad_norm": 0.7934619784355164, "learning_rate": 8.834705984601708e-05, "loss": 0.36128854751586914, "step": 750 }, { "epoch": 3.04, "grad_norm": 1.0869489908218384, "learning_rate": 8.789533323651066e-05, "loss": 0.31422438621521, "step": 760 }, { "epoch": 3.08, "grad_norm": 0.4695897102355957, "learning_rate": 8.74362188567043e-05, "loss": 0.29355826377868655, "step": 770 }, { "epoch": 3.12, "grad_norm": 0.5532680153846741, "learning_rate": 8.696980621185602e-05, "loss": 0.3185117721557617, "step": 780 }, { "epoch": 3.16, "grad_norm": 0.5760806202888489, "learning_rate": 8.649618623003508e-05, "loss": 0.28971233367919924, "step": 790 }, { "epoch": 3.2, "grad_norm": 0.5517900586128235, "learning_rate": 8.601545124439535e-05, "loss": 0.3055370092391968, "step": 800 }, { "epoch": 3.2, "eval_loss": 0.4529191255569458, "eval_runtime": 18.5382, "eval_samples_per_second": 21.577, "eval_steps_per_second": 3.614, "step": 800 }, { "epoch": 3.24, "grad_norm": 0.5356678366661072, "learning_rate": 8.552769497517482e-05, "loss": 0.28035550117492675, "step": 810 }, { "epoch": 3.2800000000000002, "grad_norm": 0.5985352993011475, "learning_rate": 8.503301251142459e-05, "loss": 0.3199602603912354, "step": 820 }, { "epoch": 3.32, "grad_norm": 0.5187913179397583, "learning_rate": 8.453150029247114e-05, "loss": 0.29444499015808107, "step": 830 }, { "epoch": 3.36, "grad_norm": 0.5703292489051819, "learning_rate": 8.402325608911526e-05, "loss": 0.30467259883880615, "step": 840 }, { "epoch": 3.4, "grad_norm": 0.9323157072067261, "learning_rate": 8.350837898457143e-05, "loss": 0.3117033004760742, "step": 850 }, { "epoch": 3.44, "grad_norm": 0.628546953201294, "learning_rate": 8.298696935515132e-05, "loss": 0.34261503219604494, "step": 860 }, { "epoch": 3.48, "grad_norm": 0.5379561185836792, "learning_rate": 8.245912885069531e-05, "loss": 0.3159458637237549, "step": 870 }, { "epoch": 3.52, "grad_norm": 0.6575730443000793, "learning_rate": 8.192496037475562e-05, "loss": 0.2982481002807617, "step": 880 }, { "epoch": 3.56, "grad_norm": 0.5830497145652771, "learning_rate": 8.138456806453503e-05, "loss": 0.3232215404510498, "step": 890 }, { "epoch": 3.6, "grad_norm": 0.5474710464477539, "learning_rate": 8.083805727058513e-05, "loss": 0.3305091381072998, "step": 900 }, { "epoch": 3.6, "eval_loss": 0.44760578870773315, "eval_runtime": 19.5159, "eval_samples_per_second": 20.496, "eval_steps_per_second": 3.433, "step": 900 }, { "epoch": 3.64, "grad_norm": 0.5096336007118225, "learning_rate": 8.028553453626808e-05, "loss": 0.35752732753753663, "step": 910 }, { "epoch": 3.68, "grad_norm": 0.5023341774940491, "learning_rate": 7.972710757698567e-05, "loss": 0.3292932271957397, "step": 920 }, { "epoch": 3.7199999999999998, "grad_norm": 0.5277951955795288, "learning_rate": 7.916288525918007e-05, "loss": 0.28986682891845705, "step": 930 }, { "epoch": 3.76, "grad_norm": 0.600412905216217, "learning_rate": 7.859297757911013e-05, "loss": 0.3027395725250244, "step": 940 }, { "epoch": 3.8, "grad_norm": 0.6396210193634033, "learning_rate": 7.801749564140724e-05, "loss": 0.3238774061203003, "step": 950 }, { "epoch": 3.84, "grad_norm": 0.628635585308075, "learning_rate": 7.743655163741543e-05, "loss": 0.34537086486816404, "step": 960 }, { "epoch": 3.88, "grad_norm": 0.49822649359703064, "learning_rate": 7.685025882331936e-05, "loss": 0.3292637825012207, "step": 970 }, { "epoch": 3.92, "grad_norm": 0.5356727242469788, "learning_rate": 7.62587314980648e-05, "loss": 0.32722015380859376, "step": 980 }, { "epoch": 3.96, "grad_norm": 0.6211317777633667, "learning_rate": 7.566208498107585e-05, "loss": 0.29880056381225584, "step": 990 }, { "epoch": 4.0, "grad_norm": 0.5336779356002808, "learning_rate": 7.506043558977321e-05, "loss": 0.2978524684906006, "step": 1000 }, { "epoch": 4.0, "eval_loss": 0.44613513350486755, "eval_runtime": 19.2382, "eval_samples_per_second": 20.792, "eval_steps_per_second": 3.483, "step": 1000 }, { "epoch": 4.04, "grad_norm": 0.6681120991706848, "learning_rate": 7.445390061689782e-05, "loss": 0.27530927658081056, "step": 1010 }, { "epoch": 4.08, "grad_norm": 0.6299528479576111, "learning_rate": 7.38425983076444e-05, "loss": 0.2517704486846924, "step": 1020 }, { "epoch": 4.12, "grad_norm": 0.5211061239242554, "learning_rate": 7.32266478366094e-05, "loss": 0.28200175762176516, "step": 1030 }, { "epoch": 4.16, "grad_norm": 0.5778363347053528, "learning_rate": 7.260616928455754e-05, "loss": 0.2569046258926392, "step": 1040 }, { "epoch": 4.2, "grad_norm": 0.6715266108512878, "learning_rate": 7.1981283615012e-05, "loss": 0.2665576696395874, "step": 1050 }, { "epoch": 4.24, "grad_norm": 0.6580007672309875, "learning_rate": 7.135211265067216e-05, "loss": 0.2635650634765625, "step": 1060 }, { "epoch": 4.28, "grad_norm": 0.6889304518699646, "learning_rate": 7.071877904966423e-05, "loss": 0.26842334270477297, "step": 1070 }, { "epoch": 4.32, "grad_norm": 0.5896309018135071, "learning_rate": 7.00814062816285e-05, "loss": 0.2633937358856201, "step": 1080 }, { "epoch": 4.36, "grad_norm": 0.6062363386154175, "learning_rate": 6.944011860364905e-05, "loss": 0.2895397186279297, "step": 1090 }, { "epoch": 4.4, "grad_norm": 0.6124110817909241, "learning_rate": 6.879504103602935e-05, "loss": 0.27405414581298826, "step": 1100 }, { "epoch": 4.4, "eval_loss": 0.46795058250427246, "eval_runtime": 17.2143, "eval_samples_per_second": 23.237, "eval_steps_per_second": 3.892, "step": 1100 }, { "epoch": 4.44, "grad_norm": 0.8100364208221436, "learning_rate": 6.814629933791931e-05, "loss": 0.2581511974334717, "step": 1110 }, { "epoch": 4.48, "grad_norm": 0.6187950372695923, "learning_rate": 6.749401998279846e-05, "loss": 0.2689012050628662, "step": 1120 }, { "epoch": 4.52, "grad_norm": 0.6595885157585144, "learning_rate": 6.683833013381941e-05, "loss": 0.27230424880981446, "step": 1130 }, { "epoch": 4.5600000000000005, "grad_norm": 0.6320788860321045, "learning_rate": 6.617935761901748e-05, "loss": 0.2903036594390869, "step": 1140 }, { "epoch": 4.6, "grad_norm": 0.6367589831352234, "learning_rate": 6.551723090639007e-05, "loss": 0.2551115989685059, "step": 1150 }, { "epoch": 4.64, "grad_norm": 0.5754795670509338, "learning_rate": 6.485207907885175e-05, "loss": 0.2783109188079834, "step": 1160 }, { "epoch": 4.68, "grad_norm": 0.6343188881874084, "learning_rate": 6.418403180906922e-05, "loss": 0.29131503105163575, "step": 1170 }, { "epoch": 4.72, "grad_norm": 0.6726956963539124, "learning_rate": 6.351321933418139e-05, "loss": 0.2730400085449219, "step": 1180 }, { "epoch": 4.76, "grad_norm": 0.5498913526535034, "learning_rate": 6.283977243040939e-05, "loss": 0.2572148323059082, "step": 1190 }, { "epoch": 4.8, "grad_norm": 0.6083167195320129, "learning_rate": 6.216382238756146e-05, "loss": 0.27444655895233155, "step": 1200 }, { "epoch": 4.8, "eval_loss": 0.466619610786438, "eval_runtime": 19.9505, "eval_samples_per_second": 20.05, "eval_steps_per_second": 3.358, "step": 1200 }, { "epoch": 4.84, "grad_norm": 0.5861450433731079, "learning_rate": 6.148550098343778e-05, "loss": 0.27054529190063475, "step": 1210 }, { "epoch": 4.88, "grad_norm": 0.7090939879417419, "learning_rate": 6.080494045814011e-05, "loss": 0.26785056591033934, "step": 1220 }, { "epoch": 4.92, "grad_norm": 0.5825073719024658, "learning_rate": 6.0122273488291304e-05, "loss": 0.26335647106170657, "step": 1230 }, { "epoch": 4.96, "grad_norm": 0.5506169199943542, "learning_rate": 5.943763316116977e-05, "loss": 0.2614041090011597, "step": 1240 }, { "epoch": 5.0, "grad_norm": 0.6169804930686951, "learning_rate": 5.875115294876381e-05, "loss": 0.24768717288970948, "step": 1250 }, { "epoch": 5.04, "grad_norm": 0.8200834393501282, "learning_rate": 5.806296668175104e-05, "loss": 0.21707432270050048, "step": 1260 }, { "epoch": 5.08, "grad_norm": 1.5680038928985596, "learning_rate": 5.737320852340775e-05, "loss": 0.2139519214630127, "step": 1270 }, { "epoch": 5.12, "grad_norm": 0.6845637559890747, "learning_rate": 5.668201294345363e-05, "loss": 0.20998594760894776, "step": 1280 }, { "epoch": 5.16, "grad_norm": 0.8293268084526062, "learning_rate": 5.598951469183649e-05, "loss": 0.23306002616882324, "step": 1290 }, { "epoch": 5.2, "grad_norm": 0.7228839993476868, "learning_rate": 5.52958487724626e-05, "loss": 0.2262401580810547, "step": 1300 }, { "epoch": 5.2, "eval_loss": 0.49972543120384216, "eval_runtime": 18.926, "eval_samples_per_second": 21.135, "eval_steps_per_second": 3.54, "step": 1300 }, { "epoch": 5.24, "grad_norm": 0.6243706345558167, "learning_rate": 5.4601150416877367e-05, "loss": 0.21100988388061523, "step": 1310 }, { "epoch": 5.28, "grad_norm": 1.0553343296051025, "learning_rate": 5.390555505790168e-05, "loss": 0.23542592525482178, "step": 1320 }, { "epoch": 5.32, "grad_norm": 0.6127402186393738, "learning_rate": 5.3209198303229027e-05, "loss": 0.2095633029937744, "step": 1330 }, { "epoch": 5.36, "grad_norm": 0.7463288903236389, "learning_rate": 5.2512215908988484e-05, "loss": 0.21693904399871827, "step": 1340 }, { "epoch": 5.4, "grad_norm": 0.8020226955413818, "learning_rate": 5.1814743753278795e-05, "loss": 0.2076347827911377, "step": 1350 }, { "epoch": 5.44, "grad_norm": 0.6652446389198303, "learning_rate": 5.111691780967869e-05, "loss": 0.22539749145507812, "step": 1360 }, { "epoch": 5.48, "grad_norm": 0.6378898620605469, "learning_rate": 5.041887412073854e-05, "loss": 0.2077547550201416, "step": 1370 }, { "epoch": 5.52, "grad_norm": 0.7381134033203125, "learning_rate": 4.97207487714586e-05, "loss": 0.21558783054351807, "step": 1380 }, { "epoch": 5.5600000000000005, "grad_norm": 0.6613102555274963, "learning_rate": 4.9022677862758945e-05, "loss": 0.21069679260253907, "step": 1390 }, { "epoch": 5.6, "grad_norm": 0.7527480721473694, "learning_rate": 4.832479748494643e-05, "loss": 0.21843309402465821, "step": 1400 }, { "epoch": 5.6, "eval_loss": 0.49576279520988464, "eval_runtime": 18.3368, "eval_samples_per_second": 21.814, "eval_steps_per_second": 3.654, "step": 1400 }, { "epoch": 5.64, "grad_norm": 0.5983570218086243, "learning_rate": 4.7627243691183453e-05, "loss": 0.22310276031494142, "step": 1410 }, { "epoch": 5.68, "grad_norm": 0.6202098727226257, "learning_rate": 4.693015247096423e-05, "loss": 0.22056117057800292, "step": 1420 }, { "epoch": 5.72, "grad_norm": 0.7730934023857117, "learning_rate": 4.623365972360337e-05, "loss": 0.2241537094116211, "step": 1430 }, { "epoch": 5.76, "grad_norm": 0.6262892484664917, "learning_rate": 4.553790123174197e-05, "loss": 0.21514451503753662, "step": 1440 }, { "epoch": 5.8, "grad_norm": 0.646507203578949, "learning_rate": 4.484301263487665e-05, "loss": 0.21031346321105956, "step": 1450 }, { "epoch": 5.84, "grad_norm": 0.8227706551551819, "learning_rate": 4.414912940291613e-05, "loss": 0.2312474489212036, "step": 1460 }, { "epoch": 5.88, "grad_norm": 0.6932390332221985, "learning_rate": 4.345638680977139e-05, "loss": 0.22380952835083007, "step": 1470 }, { "epoch": 5.92, "grad_norm": 0.7352316379547119, "learning_rate": 4.276491990698355e-05, "loss": 0.22706894874572753, "step": 1480 }, { "epoch": 5.96, "grad_norm": 0.6953718066215515, "learning_rate": 4.2074863497395377e-05, "loss": 0.2103546142578125, "step": 1490 }, { "epoch": 6.0, "grad_norm": 0.661618709564209, "learning_rate": 4.1386352108871174e-05, "loss": 0.2276217222213745, "step": 1500 }, { "epoch": 6.0, "eval_loss": 0.4966464042663574, "eval_runtime": 17.2948, "eval_samples_per_second": 23.128, "eval_steps_per_second": 3.874, "step": 1500 }, { "epoch": 6.04, "grad_norm": 0.8837434649467468, "learning_rate": 4.069951996807034e-05, "loss": 0.16540236473083497, "step": 1510 }, { "epoch": 6.08, "grad_norm": 1.3857215642929077, "learning_rate": 4.001450097427966e-05, "loss": 0.1638352394104004, "step": 1520 }, { "epoch": 6.12, "grad_norm": 0.8306711912155151, "learning_rate": 3.9331428673309204e-05, "loss": 0.1719011664390564, "step": 1530 }, { "epoch": 6.16, "grad_norm": 0.8509021997451782, "learning_rate": 3.865043623145751e-05, "loss": 0.1651092290878296, "step": 1540 }, { "epoch": 6.2, "grad_norm": 0.7507994174957275, "learning_rate": 3.797165640955041e-05, "loss": 0.1746900796890259, "step": 1550 }, { "epoch": 6.24, "grad_norm": 0.740626335144043, "learning_rate": 3.729522153705916e-05, "loss": 0.16637682914733887, "step": 1560 }, { "epoch": 6.28, "grad_norm": 0.6479809880256653, "learning_rate": 3.662126348630237e-05, "loss": 0.1709848165512085, "step": 1570 }, { "epoch": 6.32, "grad_norm": 0.6932395100593567, "learning_rate": 3.594991364673745e-05, "loss": 0.18107957839965821, "step": 1580 }, { "epoch": 6.36, "grad_norm": 0.8027141690254211, "learning_rate": 3.528130289934583e-05, "loss": 0.16225044727325438, "step": 1590 }, { "epoch": 6.4, "grad_norm": 0.5781376957893372, "learning_rate": 3.461556159111748e-05, "loss": 0.17544152736663818, "step": 1600 }, { "epoch": 6.4, "eval_loss": 0.5342507362365723, "eval_runtime": 19.471, "eval_samples_per_second": 20.543, "eval_steps_per_second": 3.441, "step": 1600 }, { "epoch": 6.44, "grad_norm": 0.7642867565155029, "learning_rate": 3.3952819509639534e-05, "loss": 0.17091144323349, "step": 1610 }, { "epoch": 6.48, "grad_norm": 0.7651257514953613, "learning_rate": 3.329320585779393e-05, "loss": 0.17765278816223146, "step": 1620 }, { "epoch": 6.52, "grad_norm": 0.6956056356430054, "learning_rate": 3.263684922856905e-05, "loss": 0.16475566625595092, "step": 1630 }, { "epoch": 6.5600000000000005, "grad_norm": 0.7344402074813843, "learning_rate": 3.1983877579990274e-05, "loss": 0.172060227394104, "step": 1640 }, { "epoch": 6.6, "grad_norm": 0.7196578979492188, "learning_rate": 3.1334418210174263e-05, "loss": 0.16673840284347535, "step": 1650 }, { "epoch": 6.64, "grad_norm": 0.7540257573127747, "learning_rate": 3.0688597732512e-05, "loss": 0.17414634227752684, "step": 1660 }, { "epoch": 6.68, "grad_norm": 0.5103999972343445, "learning_rate": 3.0046542050985237e-05, "loss": 0.1620783567428589, "step": 1670 }, { "epoch": 6.72, "grad_norm": 0.8846920132637024, "learning_rate": 2.940837633562127e-05, "loss": 0.17428462505340575, "step": 1680 }, { "epoch": 6.76, "grad_norm": 0.8017328381538391, "learning_rate": 2.877422499809072e-05, "loss": 0.19050977230072022, "step": 1690 }, { "epoch": 6.8, "grad_norm": 0.8515416383743286, "learning_rate": 2.8144211667453368e-05, "loss": 0.16926174163818358, "step": 1700 }, { "epoch": 6.8, "eval_loss": 0.5441356301307678, "eval_runtime": 17.5836, "eval_samples_per_second": 22.749, "eval_steps_per_second": 3.81, "step": 1700 }, { "epoch": 6.84, "grad_norm": 0.7547643184661865, "learning_rate": 2.75184591660563e-05, "loss": 0.1793771743774414, "step": 1710 }, { "epoch": 6.88, "grad_norm": 0.7164461016654968, "learning_rate": 2.6897089485589583e-05, "loss": 0.1647491931915283, "step": 1720 }, { "epoch": 6.92, "grad_norm": 1.1592035293579102, "learning_rate": 2.6280223763303546e-05, "loss": 0.17397019863128663, "step": 1730 }, { "epoch": 6.96, "grad_norm": 0.9889470934867859, "learning_rate": 2.5667982258393014e-05, "loss": 0.17107686996459961, "step": 1740 }, { "epoch": 7.0, "grad_norm": 0.7448652982711792, "learning_rate": 2.506048432855247e-05, "loss": 0.1730511426925659, "step": 1750 }, { "epoch": 7.04, "grad_norm": 0.6695497632026672, "learning_rate": 2.4457848406707013e-05, "loss": 0.13950222730636597, "step": 1760 }, { "epoch": 7.08, "grad_norm": 0.7200675010681152, "learning_rate": 2.3860191977923672e-05, "loss": 0.1326605796813965, "step": 1770 }, { "epoch": 7.12, "grad_norm": 0.6615055799484253, "learning_rate": 2.326763155650744e-05, "loss": 0.1265331983566284, "step": 1780 }, { "epoch": 7.16, "grad_norm": 0.8998573422431946, "learning_rate": 2.2680282663286552e-05, "loss": 0.12731509208679198, "step": 1790 }, { "epoch": 7.2, "grad_norm": 0.808588981628418, "learning_rate": 2.209825980309151e-05, "loss": 0.13114826679229735, "step": 1800 }, { "epoch": 7.2, "eval_loss": 0.5847110748291016, "eval_runtime": 18.9921, "eval_samples_per_second": 21.061, "eval_steps_per_second": 3.528, "step": 1800 }, { "epoch": 7.24, "grad_norm": 0.951817512512207, "learning_rate": 2.152167644243213e-05, "loss": 0.12906957864761354, "step": 1810 }, { "epoch": 7.28, "grad_norm": 0.8695458173751831, "learning_rate": 2.095064498737701e-05, "loss": 0.133590030670166, "step": 1820 }, { "epoch": 7.32, "grad_norm": 0.7357354760169983, "learning_rate": 2.0385276761639765e-05, "loss": 0.13653848171234131, "step": 1830 }, { "epoch": 7.36, "grad_norm": 0.7873698472976685, "learning_rate": 1.9825681984876172e-05, "loss": 0.12472724914550781, "step": 1840 }, { "epoch": 7.4, "grad_norm": 0.873921811580658, "learning_rate": 1.9271969751196776e-05, "loss": 0.13255125284194946, "step": 1850 }, { "epoch": 7.44, "grad_norm": 0.7591536045074463, "learning_rate": 1.8724248007898647e-05, "loss": 0.13693161010742189, "step": 1860 }, { "epoch": 7.48, "grad_norm": 1.0509488582611084, "learning_rate": 1.8182623534420907e-05, "loss": 0.13425672054290771, "step": 1870 }, { "epoch": 7.52, "grad_norm": 0.8472399711608887, "learning_rate": 1.76472019215278e-05, "loss": 0.13668575286865234, "step": 1880 }, { "epoch": 7.5600000000000005, "grad_norm": 0.911901593208313, "learning_rate": 1.7118087550723633e-05, "loss": 0.1317702889442444, "step": 1890 }, { "epoch": 7.6, "grad_norm": 0.9731144309043884, "learning_rate": 1.659538357390341e-05, "loss": 0.14458621740341188, "step": 1900 }, { "epoch": 7.6, "eval_loss": 0.5830516219139099, "eval_runtime": 18.7747, "eval_samples_per_second": 21.305, "eval_steps_per_second": 3.569, "step": 1900 }, { "epoch": 7.64, "grad_norm": 0.5515460968017578, "learning_rate": 1.60791918932431e-05, "loss": 0.13126691579818725, "step": 1910 }, { "epoch": 7.68, "grad_norm": 0.7286776304244995, "learning_rate": 1.556961314133359e-05, "loss": 0.12600460052490234, "step": 1920 }, { "epoch": 7.72, "grad_norm": 0.95229572057724, "learning_rate": 1.5066746661562253e-05, "loss": 0.12453792095184327, "step": 1930 }, { "epoch": 7.76, "grad_norm": 0.7712796330451965, "learning_rate": 1.4570690488745687e-05, "loss": 0.14839541912078857, "step": 1940 }, { "epoch": 7.8, "grad_norm": 0.8011840581893921, "learning_rate": 1.4081541330017705e-05, "loss": 0.1321096420288086, "step": 1950 }, { "epoch": 7.84, "grad_norm": 0.936607301235199, "learning_rate": 1.3599394545975951e-05, "loss": 0.1317069411277771, "step": 1960 }, { "epoch": 7.88, "grad_norm": 0.9034994840621948, "learning_rate": 1.312434413209131e-05, "loss": 0.13362932205200195, "step": 1970 }, { "epoch": 7.92, "grad_norm": 0.9586318731307983, "learning_rate": 1.2656482700383237e-05, "loss": 0.12677763700485228, "step": 1980 }, { "epoch": 7.96, "grad_norm": 0.9358674883842468, "learning_rate": 1.219590146136485e-05, "loss": 0.1382434129714966, "step": 1990 }, { "epoch": 8.0, "grad_norm": 0.8410677313804626, "learning_rate": 1.1742690206261292e-05, "loss": 0.12519369125366211, "step": 2000 }, { "epoch": 8.0, "eval_loss": 0.5840195417404175, "eval_runtime": 18.625, "eval_samples_per_second": 21.477, "eval_steps_per_second": 3.597, "step": 2000 }, { "epoch": 8.04, "grad_norm": 0.6319883465766907, "learning_rate": 1.129693728950474e-05, "loss": 0.10409053564071655, "step": 2010 }, { "epoch": 8.08, "grad_norm": 0.7751646041870117, "learning_rate": 1.0858729611509516e-05, "loss": 0.10310100317001343, "step": 2020 }, { "epoch": 8.12, "grad_norm": 0.9277542233467102, "learning_rate": 1.0428152601730718e-05, "loss": 0.09960774183273316, "step": 2030 }, { "epoch": 8.16, "grad_norm": 0.8381429314613342, "learning_rate": 1.0005290202009531e-05, "loss": 0.09982571601867676, "step": 2040 }, { "epoch": 8.2, "grad_norm": 0.7726228833198547, "learning_rate": 9.590224850208646e-06, "loss": 0.11322143077850341, "step": 2050 }, { "epoch": 8.24, "grad_norm": 0.7724836468696594, "learning_rate": 9.183037464140804e-06, "loss": 0.10006082057952881, "step": 2060 }, { "epoch": 8.28, "grad_norm": 1.0587371587753296, "learning_rate": 8.783807425793721e-06, "loss": 0.11560235023498536, "step": 2070 }, { "epoch": 8.32, "grad_norm": 0.8337858319282532, "learning_rate": 8.392612565854375e-06, "loss": 0.10931503772735596, "step": 2080 }, { "epoch": 8.36, "grad_norm": 0.805338978767395, "learning_rate": 8.009529148535855e-06, "loss": 0.10900030136108399, "step": 2090 }, { "epoch": 8.4, "grad_norm": 0.7612441182136536, "learning_rate": 7.63463185670939e-06, "loss": 0.1069128155708313, "step": 2100 }, { "epoch": 8.4, "eval_loss": 0.6247864961624146, "eval_runtime": 18.281, "eval_samples_per_second": 21.881, "eval_steps_per_second": 3.665, "step": 2100 }, { "epoch": 8.44, "grad_norm": 0.8081948757171631, "learning_rate": 7.267993777344856e-06, "loss": 0.09856721758842468, "step": 2110 }, { "epoch": 8.48, "grad_norm": 0.7861329913139343, "learning_rate": 6.909686387262254e-06, "loss": 0.10609345436096192, "step": 2120 }, { "epoch": 8.52, "grad_norm": 0.7145861387252808, "learning_rate": 6.559779539197231e-06, "loss": 0.105103600025177, "step": 2130 }, { "epoch": 8.56, "grad_norm": 0.7359808683395386, "learning_rate": 6.21834144818314e-06, "loss": 0.10853493213653564, "step": 2140 }, { "epoch": 8.6, "grad_norm": 0.8519245982170105, "learning_rate": 5.885438678252342e-06, "loss": 0.11464111804962158, "step": 2150 }, { "epoch": 8.64, "grad_norm": 0.8307661414146423, "learning_rate": 5.5611361294594325e-06, "loss": 0.10765299797058106, "step": 2160 }, { "epoch": 8.68, "grad_norm": 0.8340169787406921, "learning_rate": 5.245497025228874e-06, "loss": 0.10699164867401123, "step": 2170 }, { "epoch": 8.72, "grad_norm": 0.7895165085792542, "learning_rate": 4.938582900029437e-06, "loss": 0.10728691816329956, "step": 2180 }, { "epoch": 8.76, "grad_norm": 0.7967789769172668, "learning_rate": 4.640453587377957e-06, "loss": 0.11177785396575927, "step": 2190 }, { "epoch": 8.8, "grad_norm": 0.8613453507423401, "learning_rate": 4.351167208174639e-06, "loss": 0.11041848659515381, "step": 2200 }, { "epoch": 8.8, "eval_loss": 0.6235533356666565, "eval_runtime": 19.0901, "eval_samples_per_second": 20.953, "eval_steps_per_second": 3.51, "step": 2200 }, { "epoch": 8.84, "grad_norm": 0.6587359309196472, "learning_rate": 4.0707801593723e-06, "loss": 0.1085782766342163, "step": 2210 }, { "epoch": 8.88, "grad_norm": 0.7126621603965759, "learning_rate": 3.799347102981665e-06, "loss": 0.11138873100280762, "step": 2220 }, { "epoch": 8.92, "grad_norm": 0.7560760974884033, "learning_rate": 3.536920955414885e-06, "loss": 0.10770895481109619, "step": 2230 }, { "epoch": 8.96, "grad_norm": 0.95421302318573, "learning_rate": 3.2835528771693992e-06, "loss": 0.11167995929718018, "step": 2240 }, { "epoch": 9.0, "grad_norm": 0.9774760007858276, "learning_rate": 3.039292262854088e-06, "loss": 0.11738998889923095, "step": 2250 }, { "epoch": 9.04, "grad_norm": 0.7680178880691528, "learning_rate": 2.804186731559677e-06, "loss": 0.10072145462036133, "step": 2260 }, { "epoch": 9.08, "grad_norm": 0.8222008943557739, "learning_rate": 2.5782821175753422e-06, "loss": 0.09228388667106628, "step": 2270 }, { "epoch": 9.12, "grad_norm": 0.8610215783119202, "learning_rate": 2.361622461453178e-06, "loss": 0.09626876711845397, "step": 2280 }, { "epoch": 9.16, "grad_norm": 0.7807718515396118, "learning_rate": 2.154250001422431e-06, "loss": 0.0960278868675232, "step": 2290 }, { "epoch": 9.2, "grad_norm": 0.8036084175109863, "learning_rate": 1.956205165155078e-06, "loss": 0.0941778838634491, "step": 2300 }, { "epoch": 9.2, "eval_loss": 0.6419874429702759, "eval_runtime": 19.9334, "eval_samples_per_second": 20.067, "eval_steps_per_second": 3.361, "step": 2300 }, { "epoch": 9.24, "grad_norm": 0.7480472326278687, "learning_rate": 1.7675265618843362e-06, "loss": 0.09725146293640137, "step": 2310 }, { "epoch": 9.28, "grad_norm": 0.8559448719024658, "learning_rate": 1.5882509748777808e-06, "loss": 0.09353782534599304, "step": 2320 }, { "epoch": 9.32, "grad_norm": 0.6416171193122864, "learning_rate": 1.4184133542663014e-06, "loss": 0.09848537445068359, "step": 2330 }, { "epoch": 9.36, "grad_norm": 0.7388947606086731, "learning_rate": 1.258046810230562e-06, "loss": 0.10164464712142944, "step": 2340 }, { "epoch": 9.4, "grad_norm": 0.8187626600265503, "learning_rate": 1.1071826065460588e-06, "loss": 0.0934177041053772, "step": 2350 }, { "epoch": 9.44, "grad_norm": 0.865635871887207, "learning_rate": 9.65850154488218e-07, "loss": 0.1012031078338623, "step": 2360 }, { "epoch": 9.48, "grad_norm": 0.8829763531684875, "learning_rate": 8.340770070986214e-07, "loss": 0.09371918439865112, "step": 2370 }, { "epoch": 9.52, "grad_norm": 0.7734853625297546, "learning_rate": 7.11888853813436e-07, "loss": 0.09450345039367676, "step": 2380 }, { "epoch": 9.56, "grad_norm": 0.7692961096763611, "learning_rate": 5.993095154552431e-07, "loss": 0.09499152898788452, "step": 2390 }, { "epoch": 9.6, "grad_norm": 1.1678398847579956, "learning_rate": 4.963609395891299e-07, "loss": 0.10716021060943604, "step": 2400 }, { "epoch": 9.6, "eval_loss": 0.6402375102043152, "eval_runtime": 18.9858, "eval_samples_per_second": 21.068, "eval_steps_per_second": 3.529, "step": 2400 }, { "epoch": 9.64, "grad_norm": 0.7258604764938354, "learning_rate": 4.030631962439302e-07, "loss": 0.09596163630485535, "step": 2410 }, { "epoch": 9.68, "grad_norm": 0.8662357330322266, "learning_rate": 3.1943447399958027e-07, "loss": 0.09645589590072631, "step": 2420 }, { "epoch": 9.72, "grad_norm": 0.8258174061775208, "learning_rate": 2.4549107644117885e-07, "loss": 0.09415926933288574, "step": 2430 }, { "epoch": 9.76, "grad_norm": 0.911540150642395, "learning_rate": 1.8124741898058462e-07, "loss": 0.10026730298995971, "step": 2440 }, { "epoch": 9.8, "grad_norm": 0.8336577415466309, "learning_rate": 1.267160260461253e-07, "loss": 0.09711679220199584, "step": 2450 }, { "epoch": 9.84, "grad_norm": 0.7324675917625427, "learning_rate": 8.190752864088436e-08, "loss": 0.09345818758010864, "step": 2460 }, { "epoch": 9.88, "grad_norm": 0.9261553287506104, "learning_rate": 4.683066227023081e-08, "loss": 0.102751624584198, "step": 2470 }, { "epoch": 9.92, "grad_norm": 0.9403973817825317, "learning_rate": 2.1492265238748366e-08, "loss": 0.0988599717617035, "step": 2480 }, { "epoch": 9.96, "grad_norm": 0.7062044739723206, "learning_rate": 5.897277317157279e-09, "loss": 0.09828301668167114, "step": 2490 }, { "epoch": 10.0, "grad_norm": 0.7819132804870605, "learning_rate": 4.873877924582715e-11, "loss": 0.0937616467475891, "step": 2500 }, { "epoch": 10.0, "eval_loss": 0.6409608721733093, "eval_runtime": 17.8761, "eval_samples_per_second": 22.376, "eval_steps_per_second": 3.748, "step": 2500 }, { "epoch": 10.0, "step": 2500, "total_flos": 3.634151342457697e+19, "train_loss": 0.2690703985452652, "train_runtime": 10014.7733, "train_samples_per_second": 5.991, "train_steps_per_second": 0.25 } ], "logging_steps": 10, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.634151342457697e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }