{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.5074024226110363,
  "eval_steps": 500,
  "global_step": 1050,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007178106774338268,
      "grad_norm": 0.65234375,
      "learning_rate": 0.0001,
      "loss": 6.8185,
      "step": 5
    },
    {
      "epoch": 0.014356213548676536,
      "grad_norm": 0.69921875,
      "learning_rate": 0.0001,
      "loss": 5.3587,
      "step": 10
    },
    {
      "epoch": 0.021534320323014805,
      "grad_norm": 0.984375,
      "learning_rate": 0.0001,
      "loss": 3.9044,
      "step": 15
    },
    {
      "epoch": 0.028712427097353072,
      "grad_norm": 0.84765625,
      "learning_rate": 0.0001,
      "loss": 2.4036,
      "step": 20
    },
    {
      "epoch": 0.03589053387169134,
      "grad_norm": 0.63671875,
      "learning_rate": 0.0001,
      "loss": 1.5506,
      "step": 25
    },
    {
      "epoch": 0.04306864064602961,
      "grad_norm": 0.44921875,
      "learning_rate": 0.0001,
      "loss": 0.8859,
      "step": 30
    },
    {
      "epoch": 0.05024674742036788,
      "grad_norm": 0.259765625,
      "learning_rate": 0.0001,
      "loss": 0.3927,
      "step": 35
    },
    {
      "epoch": 0.057424854194706144,
      "grad_norm": 0.11669921875,
      "learning_rate": 0.0001,
      "loss": 0.1452,
      "step": 40
    },
    {
      "epoch": 0.06460296096904442,
      "grad_norm": 0.10400390625,
      "learning_rate": 0.0001,
      "loss": 0.0693,
      "step": 45
    },
    {
      "epoch": 0.07178106774338268,
      "grad_norm": 0.040283203125,
      "learning_rate": 0.0001,
      "loss": 0.0279,
      "step": 50
    },
    {
      "epoch": 0.07895917451772096,
      "grad_norm": 0.46484375,
      "learning_rate": 0.0001,
      "loss": 1.6299,
      "step": 55
    },
    {
      "epoch": 0.08613728129205922,
      "grad_norm": 0.201171875,
      "learning_rate": 0.0001,
      "loss": 0.9721,
      "step": 60
    },
    {
      "epoch": 0.09331538806639748,
      "grad_norm": 0.1953125,
      "learning_rate": 0.0001,
      "loss": 0.8273,
      "step": 65
    },
    {
      "epoch": 0.10049349484073576,
      "grad_norm": 0.1259765625,
      "learning_rate": 0.0001,
      "loss": 0.6694,
      "step": 70
    },
    {
      "epoch": 0.10767160161507403,
      "grad_norm": 0.1171875,
      "learning_rate": 0.0001,
      "loss": 0.5689,
      "step": 75
    },
    {
      "epoch": 0.11484970838941229,
      "grad_norm": 0.1357421875,
      "learning_rate": 0.0001,
      "loss": 0.35,
      "step": 80
    },
    {
      "epoch": 0.12202781516375057,
      "grad_norm": 0.06640625,
      "learning_rate": 0.0001,
      "loss": 0.1548,
      "step": 85
    },
    {
      "epoch": 0.12920592193808883,
      "grad_norm": 0.0791015625,
      "learning_rate": 0.0001,
      "loss": 0.0625,
      "step": 90
    },
    {
      "epoch": 0.1363840287124271,
      "grad_norm": 0.0284423828125,
      "learning_rate": 0.0001,
      "loss": 0.0345,
      "step": 95
    },
    {
      "epoch": 0.14356213548676536,
      "grad_norm": 0.0654296875,
      "learning_rate": 0.0001,
      "loss": 0.0194,
      "step": 100
    },
    {
      "epoch": 0.15074024226110364,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.0001,
      "loss": 1.1732,
      "step": 105
    },
    {
      "epoch": 0.1579183490354419,
      "grad_norm": 0.2099609375,
      "learning_rate": 0.0001,
      "loss": 0.87,
      "step": 110
    },
    {
      "epoch": 0.16509645580978016,
      "grad_norm": 0.1298828125,
      "learning_rate": 0.0001,
      "loss": 0.7213,
      "step": 115
    },
    {
      "epoch": 0.17227456258411844,
      "grad_norm": 0.158203125,
      "learning_rate": 0.0001,
      "loss": 0.5522,
      "step": 120
    },
    {
      "epoch": 0.17945266935845672,
      "grad_norm": 0.1015625,
      "learning_rate": 0.0001,
      "loss": 0.4513,
      "step": 125
    },
    {
      "epoch": 0.18663077613279497,
      "grad_norm": 0.1064453125,
      "learning_rate": 0.0001,
      "loss": 0.2306,
      "step": 130
    },
    {
      "epoch": 0.19380888290713325,
      "grad_norm": 0.06591796875,
      "learning_rate": 0.0001,
      "loss": 0.0997,
      "step": 135
    },
    {
      "epoch": 0.20098698968147152,
      "grad_norm": 0.060546875,
      "learning_rate": 0.0001,
      "loss": 0.0362,
      "step": 140
    },
    {
      "epoch": 0.20816509645580977,
      "grad_norm": 0.037109375,
      "learning_rate": 0.0001,
      "loss": 0.0274,
      "step": 145
    },
    {
      "epoch": 0.21534320323014805,
      "grad_norm": 0.0234375,
      "learning_rate": 0.0001,
      "loss": 0.0054,
      "step": 150
    },
    {
      "epoch": 0.22252131000448633,
      "grad_norm": 0.337890625,
      "learning_rate": 0.0001,
      "loss": 1.0624,
      "step": 155
    },
    {
      "epoch": 0.22969941677882458,
      "grad_norm": 0.1787109375,
      "learning_rate": 0.0001,
      "loss": 0.829,
      "step": 160
    },
    {
      "epoch": 0.23687752355316286,
      "grad_norm": 0.15234375,
      "learning_rate": 0.0001,
      "loss": 0.6497,
      "step": 165
    },
    {
      "epoch": 0.24405563032750113,
      "grad_norm": 0.1142578125,
      "learning_rate": 0.0001,
      "loss": 0.5721,
      "step": 170
    },
    {
      "epoch": 0.2512337371018394,
      "grad_norm": 0.1533203125,
      "learning_rate": 0.0001,
      "loss": 0.4299,
      "step": 175
    },
    {
      "epoch": 0.25841184387617766,
      "grad_norm": 0.11962890625,
      "learning_rate": 0.0001,
      "loss": 0.2842,
      "step": 180
    },
    {
      "epoch": 0.26558995065051594,
      "grad_norm": 0.049560546875,
      "learning_rate": 0.0001,
      "loss": 0.1096,
      "step": 185
    },
    {
      "epoch": 0.2727680574248542,
      "grad_norm": 0.072265625,
      "learning_rate": 0.0001,
      "loss": 0.0362,
      "step": 190
    },
    {
      "epoch": 0.27994616419919244,
      "grad_norm": 0.0634765625,
      "learning_rate": 0.0001,
      "loss": 0.0188,
      "step": 195
    },
    {
      "epoch": 0.2871242709735307,
      "grad_norm": 0.0167236328125,
      "learning_rate": 0.0001,
      "loss": 0.0077,
      "step": 200
    },
    {
      "epoch": 0.294302377747869,
      "grad_norm": 0.2109375,
      "learning_rate": 0.0001,
      "loss": 1.0719,
      "step": 205
    },
    {
      "epoch": 0.30148048452220727,
      "grad_norm": 0.1669921875,
      "learning_rate": 0.0001,
      "loss": 0.79,
      "step": 210
    },
    {
      "epoch": 0.30865859129654555,
      "grad_norm": 0.1328125,
      "learning_rate": 0.0001,
      "loss": 0.6307,
      "step": 215
    },
    {
      "epoch": 0.3158366980708838,
      "grad_norm": 0.126953125,
      "learning_rate": 0.0001,
      "loss": 0.5041,
      "step": 220
    },
    {
      "epoch": 0.32301480484522205,
      "grad_norm": 0.1748046875,
      "learning_rate": 0.0001,
      "loss": 0.4389,
      "step": 225
    },
    {
      "epoch": 0.3301929116195603,
      "grad_norm": 0.1181640625,
      "learning_rate": 0.0001,
      "loss": 0.2337,
      "step": 230
    },
    {
      "epoch": 0.3373710183938986,
      "grad_norm": 0.08154296875,
      "learning_rate": 0.0001,
      "loss": 0.1152,
      "step": 235
    },
    {
      "epoch": 0.3445491251682369,
      "grad_norm": 0.038818359375,
      "learning_rate": 0.0001,
      "loss": 0.0224,
      "step": 240
    },
    {
      "epoch": 0.35172723194257516,
      "grad_norm": 0.0703125,
      "learning_rate": 0.0001,
      "loss": 0.0363,
      "step": 245
    },
    {
      "epoch": 0.35890533871691344,
      "grad_norm": 0.0400390625,
      "learning_rate": 0.0001,
      "loss": 0.0073,
      "step": 250
    },
    {
      "epoch": 0.36608344549125166,
      "grad_norm": 0.1650390625,
      "learning_rate": 0.0001,
      "loss": 1.0824,
      "step": 255
    },
    {
      "epoch": 0.37326155226558994,
      "grad_norm": 0.12890625,
      "learning_rate": 0.0001,
      "loss": 0.8525,
      "step": 260
    },
    {
      "epoch": 0.3804396590399282,
      "grad_norm": 0.1484375,
      "learning_rate": 0.0001,
      "loss": 0.6736,
      "step": 265
    },
    {
      "epoch": 0.3876177658142665,
      "grad_norm": 0.16015625,
      "learning_rate": 0.0001,
      "loss": 0.5694,
      "step": 270
    },
    {
      "epoch": 0.39479587258860477,
      "grad_norm": 0.146484375,
      "learning_rate": 0.0001,
      "loss": 0.4329,
      "step": 275
    },
    {
      "epoch": 0.40197397936294305,
      "grad_norm": 0.095703125,
      "learning_rate": 0.0001,
      "loss": 0.2051,
      "step": 280
    },
    {
      "epoch": 0.40915208613728127,
      "grad_norm": 0.130859375,
      "learning_rate": 0.0001,
      "loss": 0.1067,
      "step": 285
    },
    {
      "epoch": 0.41633019291161955,
      "grad_norm": 0.10302734375,
      "learning_rate": 0.0001,
      "loss": 0.0365,
      "step": 290
    },
    {
      "epoch": 0.4235082996859578,
      "grad_norm": 0.05126953125,
      "learning_rate": 0.0001,
      "loss": 0.0252,
      "step": 295
    },
    {
      "epoch": 0.4306864064602961,
      "grad_norm": 0.0029449462890625,
      "learning_rate": 0.0001,
      "loss": 0.0046,
      "step": 300
    },
    {
      "epoch": 0.4378645132346344,
      "grad_norm": 0.2177734375,
      "learning_rate": 0.0001,
      "loss": 1.0461,
      "step": 305
    },
    {
      "epoch": 0.44504262000897266,
      "grad_norm": 0.1474609375,
      "learning_rate": 0.0001,
      "loss": 0.7834,
      "step": 310
    },
    {
      "epoch": 0.4522207267833109,
      "grad_norm": 0.11669921875,
      "learning_rate": 0.0001,
      "loss": 0.6162,
      "step": 315
    },
    {
      "epoch": 0.45939883355764916,
      "grad_norm": 0.1123046875,
      "learning_rate": 0.0001,
      "loss": 0.4886,
      "step": 320
    },
    {
      "epoch": 0.46657694033198743,
      "grad_norm": 0.11962890625,
      "learning_rate": 0.0001,
      "loss": 0.3858,
      "step": 325
    },
    {
      "epoch": 0.4737550471063257,
      "grad_norm": 0.09521484375,
      "learning_rate": 0.0001,
      "loss": 0.2249,
      "step": 330
    },
    {
      "epoch": 0.480933153880664,
      "grad_norm": 0.061279296875,
      "learning_rate": 0.0001,
      "loss": 0.0778,
      "step": 335
    },
    {
      "epoch": 0.48811126065500227,
      "grad_norm": 0.04931640625,
      "learning_rate": 0.0001,
      "loss": 0.0258,
      "step": 340
    },
    {
      "epoch": 0.4952893674293405,
      "grad_norm": 0.0283203125,
      "learning_rate": 0.0001,
      "loss": 0.0245,
      "step": 345
    },
    {
      "epoch": 0.5024674742036788,
      "grad_norm": 0.0218505859375,
      "learning_rate": 0.0001,
      "loss": 0.0108,
      "step": 350
    },
    {
      "epoch": 0.509645580978017,
      "grad_norm": 0.2060546875,
      "learning_rate": 0.0001,
      "loss": 1.1229,
      "step": 355
    },
    {
      "epoch": 0.5168236877523553,
      "grad_norm": 0.130859375,
      "learning_rate": 0.0001,
      "loss": 0.7767,
      "step": 360
    },
    {
      "epoch": 0.5240017945266936,
      "grad_norm": 0.1162109375,
      "learning_rate": 0.0001,
      "loss": 0.6151,
      "step": 365
    },
    {
      "epoch": 0.5311799013010319,
      "grad_norm": 0.11767578125,
      "learning_rate": 0.0001,
      "loss": 0.4997,
      "step": 370
    },
    {
      "epoch": 0.5383580080753702,
      "grad_norm": 0.1181640625,
      "learning_rate": 0.0001,
      "loss": 0.3645,
      "step": 375
    },
    {
      "epoch": 0.5455361148497084,
      "grad_norm": 0.09228515625,
      "learning_rate": 0.0001,
      "loss": 0.2487,
      "step": 380
    },
    {
      "epoch": 0.5527142216240467,
      "grad_norm": 0.043212890625,
      "learning_rate": 0.0001,
      "loss": 0.1116,
      "step": 385
    },
    {
      "epoch": 0.5598923283983849,
      "grad_norm": 0.0262451171875,
      "learning_rate": 0.0001,
      "loss": 0.0278,
      "step": 390
    },
    {
      "epoch": 0.5670704351727232,
      "grad_norm": 0.048583984375,
      "learning_rate": 0.0001,
      "loss": 0.0104,
      "step": 395
    },
    {
      "epoch": 0.5742485419470614,
      "grad_norm": 0.0458984375,
      "learning_rate": 0.0001,
      "loss": 0.0104,
      "step": 400
    },
    {
      "epoch": 0.5814266487213997,
      "grad_norm": 0.1953125,
      "learning_rate": 0.0001,
      "loss": 0.9303,
      "step": 405
    },
    {
      "epoch": 0.588604755495738,
      "grad_norm": 0.1513671875,
      "learning_rate": 0.0001,
      "loss": 0.766,
      "step": 410
    },
    {
      "epoch": 0.5957828622700763,
      "grad_norm": 0.130859375,
      "learning_rate": 0.0001,
      "loss": 0.5917,
      "step": 415
    },
    {
      "epoch": 0.6029609690444145,
      "grad_norm": 0.10595703125,
      "learning_rate": 0.0001,
      "loss": 0.5611,
      "step": 420
    },
    {
      "epoch": 0.6101390758187528,
      "grad_norm": 0.1220703125,
      "learning_rate": 0.0001,
      "loss": 0.3833,
      "step": 425
    },
    {
      "epoch": 0.6173171825930911,
      "grad_norm": 0.11865234375,
      "learning_rate": 0.0001,
      "loss": 0.2563,
      "step": 430
    },
    {
      "epoch": 0.6244952893674294,
      "grad_norm": 0.07568359375,
      "learning_rate": 0.0001,
      "loss": 0.1056,
      "step": 435
    },
    {
      "epoch": 0.6316733961417677,
      "grad_norm": 0.087890625,
      "learning_rate": 0.0001,
      "loss": 0.0343,
      "step": 440
    },
    {
      "epoch": 0.6388515029161059,
      "grad_norm": 0.038818359375,
      "learning_rate": 0.0001,
      "loss": 0.0113,
      "step": 445
    },
    {
      "epoch": 0.6460296096904441,
      "grad_norm": 0.0194091796875,
      "learning_rate": 0.0001,
      "loss": 0.0062,
      "step": 450
    },
    {
      "epoch": 0.6532077164647824,
      "grad_norm": 0.18359375,
      "learning_rate": 0.0001,
      "loss": 0.894,
      "step": 455
    },
    {
      "epoch": 0.6603858232391207,
      "grad_norm": 0.158203125,
      "learning_rate": 0.0001,
      "loss": 0.7454,
      "step": 460
    },
    {
      "epoch": 0.6675639300134589,
      "grad_norm": 0.123046875,
      "learning_rate": 0.0001,
      "loss": 0.5539,
      "step": 465
    },
    {
      "epoch": 0.6747420367877972,
      "grad_norm": 0.1357421875,
      "learning_rate": 0.0001,
      "loss": 0.5263,
      "step": 470
    },
    {
      "epoch": 0.6819201435621355,
      "grad_norm": 0.09521484375,
      "learning_rate": 0.0001,
      "loss": 0.3882,
      "step": 475
    },
    {
      "epoch": 0.6890982503364738,
      "grad_norm": 0.07958984375,
      "learning_rate": 0.0001,
      "loss": 0.2243,
      "step": 480
    },
    {
      "epoch": 0.696276357110812,
      "grad_norm": 0.08154296875,
      "learning_rate": 0.0001,
      "loss": 0.0728,
      "step": 485
    },
    {
      "epoch": 0.7034544638851503,
      "grad_norm": 0.048583984375,
      "learning_rate": 0.0001,
      "loss": 0.0205,
      "step": 490
    },
    {
      "epoch": 0.7106325706594886,
      "grad_norm": 0.06103515625,
      "learning_rate": 0.0001,
      "loss": 0.0179,
      "step": 495
    },
    {
      "epoch": 0.7178106774338269,
      "grad_norm": 0.031494140625,
      "learning_rate": 0.0001,
      "loss": 0.0072,
      "step": 500
    },
    {
      "epoch": 0.7249887842081651,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.0001,
      "loss": 0.9516,
      "step": 505
    },
    {
      "epoch": 0.7321668909825033,
      "grad_norm": 0.1240234375,
      "learning_rate": 0.0001,
      "loss": 0.6854,
      "step": 510
    },
    {
      "epoch": 0.7393449977568416,
      "grad_norm": 0.1552734375,
      "learning_rate": 0.0001,
      "loss": 0.5769,
      "step": 515
    },
    {
      "epoch": 0.7465231045311799,
      "grad_norm": 0.11962890625,
      "learning_rate": 0.0001,
      "loss": 0.4634,
      "step": 520
    },
    {
      "epoch": 0.7537012113055181,
      "grad_norm": 0.11962890625,
      "learning_rate": 0.0001,
      "loss": 0.3856,
      "step": 525
    },
    {
      "epoch": 0.7608793180798564,
      "grad_norm": 0.10791015625,
      "learning_rate": 0.0001,
      "loss": 0.2155,
      "step": 530
    },
    {
      "epoch": 0.7680574248541947,
      "grad_norm": 0.0634765625,
      "learning_rate": 0.0001,
      "loss": 0.0857,
      "step": 535
    },
    {
      "epoch": 0.775235531628533,
      "grad_norm": 0.07861328125,
      "learning_rate": 0.0001,
      "loss": 0.0233,
      "step": 540
    },
    {
      "epoch": 0.7824136384028713,
      "grad_norm": 0.0281982421875,
      "learning_rate": 0.0001,
      "loss": 0.013,
      "step": 545
    },
    {
      "epoch": 0.7895917451772095,
      "grad_norm": 0.016845703125,
      "learning_rate": 0.0001,
      "loss": 0.0061,
      "step": 550
    },
    {
      "epoch": 0.7967698519515478,
      "grad_norm": 0.1796875,
      "learning_rate": 0.0001,
      "loss": 0.8853,
      "step": 555
    },
    {
      "epoch": 0.8039479587258861,
      "grad_norm": 0.154296875,
      "learning_rate": 0.0001,
      "loss": 0.726,
      "step": 560
    },
    {
      "epoch": 0.8111260655002244,
      "grad_norm": 0.1328125,
      "learning_rate": 0.0001,
      "loss": 0.62,
      "step": 565
    },
    {
      "epoch": 0.8183041722745625,
      "grad_norm": 0.126953125,
      "learning_rate": 0.0001,
      "loss": 0.5036,
      "step": 570
    },
    {
      "epoch": 0.8254822790489008,
      "grad_norm": 0.1279296875,
      "learning_rate": 0.0001,
      "loss": 0.4053,
      "step": 575
    },
    {
      "epoch": 0.8326603858232391,
      "grad_norm": 0.1142578125,
      "learning_rate": 0.0001,
      "loss": 0.2355,
      "step": 580
    },
    {
      "epoch": 0.8398384925975774,
      "grad_norm": 0.045654296875,
      "learning_rate": 0.0001,
      "loss": 0.0751,
      "step": 585
    },
    {
      "epoch": 0.8470165993719156,
      "grad_norm": 0.109375,
      "learning_rate": 0.0001,
      "loss": 0.0226,
      "step": 590
    },
    {
      "epoch": 0.8541947061462539,
      "grad_norm": 0.027587890625,
      "learning_rate": 0.0001,
      "loss": 0.0055,
      "step": 595
    },
    {
      "epoch": 0.8613728129205922,
      "grad_norm": 0.05712890625,
      "learning_rate": 0.0001,
      "loss": 0.0052,
      "step": 600
    },
    {
      "epoch": 0.8685509196949305,
      "grad_norm": 0.1943359375,
      "learning_rate": 0.0001,
      "loss": 0.9366,
      "step": 605
    },
    {
      "epoch": 0.8757290264692688,
      "grad_norm": 0.1484375,
      "learning_rate": 0.0001,
      "loss": 0.7429,
      "step": 610
    },
    {
      "epoch": 0.882907133243607,
      "grad_norm": 0.14453125,
      "learning_rate": 0.0001,
      "loss": 0.564,
      "step": 615
    },
    {
      "epoch": 0.8900852400179453,
      "grad_norm": 0.140625,
      "learning_rate": 0.0001,
      "loss": 0.5045,
      "step": 620
    },
    {
      "epoch": 0.8972633467922836,
      "grad_norm": 0.1259765625,
      "learning_rate": 0.0001,
      "loss": 0.3997,
      "step": 625
    },
    {
      "epoch": 0.9044414535666218,
      "grad_norm": 0.0830078125,
      "learning_rate": 0.0001,
      "loss": 0.1856,
      "step": 630
    },
    {
      "epoch": 0.91161956034096,
      "grad_norm": 0.06298828125,
      "learning_rate": 0.0001,
      "loss": 0.0583,
      "step": 635
    },
    {
      "epoch": 0.9187976671152983,
      "grad_norm": 0.033935546875,
      "learning_rate": 0.0001,
      "loss": 0.0274,
      "step": 640
    },
    {
      "epoch": 0.9259757738896366,
      "grad_norm": 0.03271484375,
      "learning_rate": 0.0001,
      "loss": 0.0078,
      "step": 645
    },
    {
      "epoch": 0.9331538806639749,
      "grad_norm": 0.0244140625,
      "learning_rate": 0.0001,
      "loss": 0.003,
      "step": 650
    },
    {
      "epoch": 0.9403319874383131,
      "grad_norm": 0.220703125,
      "learning_rate": 0.0001,
      "loss": 0.9234,
      "step": 655
    },
    {
      "epoch": 0.9475100942126514,
      "grad_norm": 0.1494140625,
      "learning_rate": 0.0001,
      "loss": 0.7145,
      "step": 660
    },
    {
      "epoch": 0.9546882009869897,
      "grad_norm": 0.138671875,
      "learning_rate": 0.0001,
      "loss": 0.5764,
      "step": 665
    },
    {
      "epoch": 0.961866307761328,
      "grad_norm": 0.1298828125,
      "learning_rate": 0.0001,
      "loss": 0.4568,
      "step": 670
    },
    {
      "epoch": 0.9690444145356663,
      "grad_norm": 0.10400390625,
      "learning_rate": 0.0001,
      "loss": 0.2681,
      "step": 675
    },
    {
      "epoch": 0.9762225213100045,
      "grad_norm": 0.080078125,
      "learning_rate": 0.0001,
      "loss": 0.1399,
      "step": 680
    },
    {
      "epoch": 0.9834006280843428,
      "grad_norm": 0.068359375,
      "learning_rate": 0.0001,
      "loss": 0.0375,
      "step": 685
    },
    {
      "epoch": 0.990578734858681,
      "grad_norm": 0.040283203125,
      "learning_rate": 0.0001,
      "loss": 0.0108,
      "step": 690
    },
    {
      "epoch": 0.9977568416330193,
      "grad_norm": 0.022216796875,
      "learning_rate": 0.0001,
      "loss": 0.0082,
      "step": 695
    },
    {
      "epoch": 1.0049349484073575,
      "grad_norm": 0.193359375,
      "learning_rate": 0.0001,
      "loss": 0.6031,
      "step": 700
    },
    {
      "epoch": 1.012113055181696,
      "grad_norm": 0.1640625,
      "learning_rate": 0.0001,
      "loss": 0.7291,
      "step": 705
    },
    {
      "epoch": 1.019291161956034,
      "grad_norm": 0.1708984375,
      "learning_rate": 0.0001,
      "loss": 0.5393,
      "step": 710
    },
    {
      "epoch": 1.0264692687303723,
      "grad_norm": 0.1416015625,
      "learning_rate": 0.0001,
      "loss": 0.413,
      "step": 715
    },
    {
      "epoch": 1.0336473755047106,
      "grad_norm": 0.11669921875,
      "learning_rate": 0.0001,
      "loss": 0.3693,
      "step": 720
    },
    {
      "epoch": 1.0408254822790488,
      "grad_norm": 0.123046875,
      "learning_rate": 0.0001,
      "loss": 0.2104,
      "step": 725
    },
    {
      "epoch": 1.0480035890533872,
      "grad_norm": 0.055908203125,
      "learning_rate": 0.0001,
      "loss": 0.0834,
      "step": 730
    },
    {
      "epoch": 1.0551816958277254,
      "grad_norm": 0.0546875,
      "learning_rate": 0.0001,
      "loss": 0.0144,
      "step": 735
    },
    {
      "epoch": 1.0623598026020638,
      "grad_norm": 0.11181640625,
      "learning_rate": 0.0001,
      "loss": 0.0119,
      "step": 740
    },
    {
      "epoch": 1.069537909376402,
      "grad_norm": 0.0034332275390625,
      "learning_rate": 0.0001,
      "loss": 0.0023,
      "step": 745
    },
    {
      "epoch": 1.0767160161507403,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.0001,
      "loss": 0.5662,
      "step": 750
    },
    {
      "epoch": 1.0838941229250785,
      "grad_norm": 0.2177734375,
      "learning_rate": 0.0001,
      "loss": 0.7079,
      "step": 755
    },
    {
      "epoch": 1.0910722296994169,
      "grad_norm": 0.1904296875,
      "learning_rate": 0.0001,
      "loss": 0.5619,
      "step": 760
    },
    {
      "epoch": 1.098250336473755,
      "grad_norm": 0.12890625,
      "learning_rate": 0.0001,
      "loss": 0.4236,
      "step": 765
    },
    {
      "epoch": 1.1054284432480934,
      "grad_norm": 0.11328125,
      "learning_rate": 0.0001,
      "loss": 0.3422,
      "step": 770
    },
    {
      "epoch": 1.1126065500224316,
      "grad_norm": 0.11181640625,
      "learning_rate": 0.0001,
      "loss": 0.2757,
      "step": 775
    },
    {
      "epoch": 1.1197846567967698,
      "grad_norm": 0.1103515625,
      "learning_rate": 0.0001,
      "loss": 0.101,
      "step": 780
    },
    {
      "epoch": 1.1269627635711081,
      "grad_norm": 0.0615234375,
      "learning_rate": 0.0001,
      "loss": 0.0292,
      "step": 785
    },
    {
      "epoch": 1.1341408703454463,
      "grad_norm": 0.01123046875,
      "learning_rate": 0.0001,
      "loss": 0.0117,
      "step": 790
    },
    {
      "epoch": 1.1413189771197847,
      "grad_norm": 0.0311279296875,
      "learning_rate": 0.0001,
      "loss": 0.0068,
      "step": 795
    },
    {
      "epoch": 1.1484970838941229,
      "grad_norm": 0.2236328125,
      "learning_rate": 0.0001,
      "loss": 0.5275,
      "step": 800
    },
    {
      "epoch": 1.1556751906684612,
      "grad_norm": 0.2060546875,
      "learning_rate": 0.0001,
      "loss": 0.7151,
      "step": 805
    },
    {
      "epoch": 1.1628532974427994,
      "grad_norm": 0.1708984375,
      "learning_rate": 0.0001,
      "loss": 0.5625,
      "step": 810
    },
    {
      "epoch": 1.1700314042171378,
      "grad_norm": 0.1708984375,
      "learning_rate": 0.0001,
      "loss": 0.4765,
      "step": 815
    },
    {
      "epoch": 1.177209510991476,
      "grad_norm": 0.1875,
      "learning_rate": 0.0001,
      "loss": 0.3728,
      "step": 820
    },
    {
      "epoch": 1.1843876177658144,
      "grad_norm": 0.11181640625,
      "learning_rate": 0.0001,
      "loss": 0.2169,
      "step": 825
    },
    {
      "epoch": 1.1915657245401525,
      "grad_norm": 0.0888671875,
      "learning_rate": 0.0001,
      "loss": 0.09,
      "step": 830
    },
    {
      "epoch": 1.198743831314491,
      "grad_norm": 0.06396484375,
      "learning_rate": 0.0001,
      "loss": 0.0261,
      "step": 835
    },
    {
      "epoch": 1.205921938088829,
      "grad_norm": 0.06591796875,
      "learning_rate": 0.0001,
      "loss": 0.0169,
      "step": 840
    },
    {
      "epoch": 1.2131000448631672,
      "grad_norm": 0.01409912109375,
      "learning_rate": 0.0001,
      "loss": 0.0041,
      "step": 845
    },
    {
      "epoch": 1.2202781516375056,
      "grad_norm": 0.2265625,
      "learning_rate": 0.0001,
      "loss": 0.5508,
      "step": 850
    },
    {
      "epoch": 1.2274562584118438,
      "grad_norm": 0.255859375,
      "learning_rate": 0.0001,
      "loss": 0.7281,
      "step": 855
    },
    {
      "epoch": 1.2346343651861822,
      "grad_norm": 0.212890625,
      "learning_rate": 0.0001,
      "loss": 0.499,
      "step": 860
    },
    {
      "epoch": 1.2418124719605204,
      "grad_norm": 0.1767578125,
      "learning_rate": 0.0001,
      "loss": 0.5054,
      "step": 865
    },
    {
      "epoch": 1.2489905787348587,
      "grad_norm": 0.1513671875,
      "learning_rate": 0.0001,
      "loss": 0.3918,
      "step": 870
    },
    {
      "epoch": 1.256168685509197,
      "grad_norm": 0.1318359375,
      "learning_rate": 0.0001,
      "loss": 0.2211,
      "step": 875
    },
    {
      "epoch": 1.263346792283535,
      "grad_norm": 0.053955078125,
      "learning_rate": 0.0001,
      "loss": 0.099,
      "step": 880
    },
    {
      "epoch": 1.2705248990578735,
      "grad_norm": 0.0263671875,
      "learning_rate": 0.0001,
      "loss": 0.0239,
      "step": 885
    },
    {
      "epoch": 1.2777030058322119,
      "grad_norm": 0.055908203125,
      "learning_rate": 0.0001,
      "loss": 0.0203,
      "step": 890
    },
    {
      "epoch": 1.28488111260655,
      "grad_norm": 0.0172119140625,
      "learning_rate": 0.0001,
      "loss": 0.0053,
      "step": 895
    },
    {
      "epoch": 1.2920592193808882,
      "grad_norm": 0.1943359375,
      "learning_rate": 0.0001,
      "loss": 0.4856,
      "step": 900
    },
    {
      "epoch": 1.2992373261552266,
      "grad_norm": 0.2138671875,
      "learning_rate": 0.0001,
      "loss": 0.7204,
      "step": 905
    },
    {
      "epoch": 1.3064154329295647,
      "grad_norm": 0.19140625,
      "learning_rate": 0.0001,
      "loss": 0.5374,
      "step": 910
    },
    {
      "epoch": 1.3135935397039031,
      "grad_norm": 0.216796875,
      "learning_rate": 0.0001,
      "loss": 0.48,
      "step": 915
    },
    {
      "epoch": 1.3207716464782413,
      "grad_norm": 0.19921875,
      "learning_rate": 0.0001,
      "loss": 0.3897,
      "step": 920
    },
    {
      "epoch": 1.3279497532525797,
      "grad_norm": 0.10205078125,
      "learning_rate": 0.0001,
      "loss": 0.2242,
      "step": 925
    },
    {
      "epoch": 1.3351278600269179,
      "grad_norm": 0.08251953125,
      "learning_rate": 0.0001,
      "loss": 0.1292,
      "step": 930
    },
    {
      "epoch": 1.3423059668012562,
      "grad_norm": 0.068359375,
      "learning_rate": 0.0001,
      "loss": 0.0242,
      "step": 935
    },
    {
      "epoch": 1.3494840735755944,
      "grad_norm": 0.08544921875,
      "learning_rate": 0.0001,
      "loss": 0.0092,
      "step": 940
    },
    {
      "epoch": 1.3566621803499328,
      "grad_norm": 0.004241943359375,
      "learning_rate": 0.0001,
      "loss": 0.0039,
      "step": 945
    },
    {
      "epoch": 1.363840287124271,
      "grad_norm": 0.25390625,
      "learning_rate": 0.0001,
      "loss": 0.5465,
      "step": 950
    },
    {
      "epoch": 1.3710183938986091,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.0001,
      "loss": 0.6114,
      "step": 955
    },
    {
      "epoch": 1.3781965006729475,
      "grad_norm": 0.244140625,
      "learning_rate": 0.0001,
      "loss": 0.5226,
      "step": 960
    },
    {
      "epoch": 1.385374607447286,
      "grad_norm": 0.205078125,
      "learning_rate": 0.0001,
      "loss": 0.4234,
      "step": 965
    },
    {
      "epoch": 1.392552714221624,
      "grad_norm": 0.130859375,
      "learning_rate": 0.0001,
      "loss": 0.3595,
      "step": 970
    },
    {
      "epoch": 1.3997308209959622,
      "grad_norm": 0.123046875,
      "learning_rate": 0.0001,
      "loss": 0.2464,
      "step": 975
    },
    {
      "epoch": 1.4069089277703006,
      "grad_norm": 0.11767578125,
      "learning_rate": 0.0001,
      "loss": 0.11,
      "step": 980
    },
    {
      "epoch": 1.4140870345446388,
      "grad_norm": 0.05322265625,
      "learning_rate": 0.0001,
      "loss": 0.0205,
      "step": 985
    },
    {
      "epoch": 1.4212651413189772,
      "grad_norm": 0.0206298828125,
      "learning_rate": 0.0001,
      "loss": 0.0102,
      "step": 990
    },
    {
      "epoch": 1.4284432480933154,
      "grad_norm": 0.0250244140625,
      "learning_rate": 0.0001,
      "loss": 0.0044,
      "step": 995
    },
    {
      "epoch": 1.4356213548676537,
      "grad_norm": 0.23046875,
      "learning_rate": 0.0001,
      "loss": 0.4827,
      "step": 1000
    },
    {
      "epoch": 1.442799461641992,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.0001,
      "loss": 0.6536,
      "step": 1005
    },
    {
      "epoch": 1.44997756841633,
      "grad_norm": 0.1953125,
      "learning_rate": 0.0001,
      "loss": 0.5993,
      "step": 1010
    },
    {
      "epoch": 1.4571556751906685,
      "grad_norm": 0.158203125,
      "learning_rate": 0.0001,
      "loss": 0.4176,
      "step": 1015
    },
    {
      "epoch": 1.4643337819650069,
      "grad_norm": 0.1689453125,
      "learning_rate": 0.0001,
      "loss": 0.307,
      "step": 1020
    },
    {
      "epoch": 1.471511888739345,
      "grad_norm": 0.1005859375,
      "learning_rate": 0.0001,
      "loss": 0.2381,
      "step": 1025
    },
    {
      "epoch": 1.4786899955136832,
      "grad_norm": 0.06396484375,
      "learning_rate": 0.0001,
      "loss": 0.084,
      "step": 1030
    },
    {
      "epoch": 1.4858681022880216,
      "grad_norm": 0.01153564453125,
      "learning_rate": 0.0001,
      "loss": 0.0165,
      "step": 1035
    },
    {
      "epoch": 1.4930462090623597,
      "grad_norm": 0.0283203125,
      "learning_rate": 0.0001,
      "loss": 0.0059,
      "step": 1040
    },
    {
      "epoch": 1.500224315836698,
      "grad_norm": 0.0380859375,
      "learning_rate": 0.0001,
      "loss": 0.0051,
      "step": 1045
    },
    {
      "epoch": 1.5074024226110363,
      "grad_norm": 0.296875,
      "learning_rate": 0.0001,
      "loss": 0.5321,
      "step": 1050
    }
  ],
  "logging_steps": 5,
  "max_steps": 1050,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 90,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.83809405232513e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}