{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.998875140607424,
  "eval_steps": 500,
  "global_step": 555,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008998875140607425,
      "grad_norm": 6.405796769298009,
      "learning_rate": 1.4285714285714286e-06,
      "loss": 1.0356,
      "step": 1
    },
    {
      "epoch": 0.01799775028121485,
      "grad_norm": 6.457071617226858,
      "learning_rate": 2.8571428571428573e-06,
      "loss": 1.0356,
      "step": 2
    },
    {
      "epoch": 0.02699662542182227,
      "grad_norm": 6.306094210313564,
      "learning_rate": 4.2857142857142855e-06,
      "loss": 1.0346,
      "step": 3
    },
    {
      "epoch": 0.0359955005624297,
      "grad_norm": 4.578449546704467,
      "learning_rate": 5.7142857142857145e-06,
      "loss": 0.9945,
      "step": 4
    },
    {
      "epoch": 0.04499437570303712,
      "grad_norm": 2.7068662098456318,
      "learning_rate": 7.1428571428571436e-06,
      "loss": 0.9512,
      "step": 5
    },
    {
      "epoch": 0.05399325084364454,
      "grad_norm": 2.4781853702406553,
      "learning_rate": 8.571428571428571e-06,
      "loss": 0.9632,
      "step": 6
    },
    {
      "epoch": 0.06299212598425197,
      "grad_norm": 4.778781193622344,
      "learning_rate": 1e-05,
      "loss": 0.9478,
      "step": 7
    },
    {
      "epoch": 0.0719910011248594,
      "grad_norm": 5.10213288638439,
      "learning_rate": 1.1428571428571429e-05,
      "loss": 0.9316,
      "step": 8
    },
    {
      "epoch": 0.08098987626546682,
      "grad_norm": 5.371042207893317,
      "learning_rate": 1.2857142857142859e-05,
      "loss": 0.9182,
      "step": 9
    },
    {
      "epoch": 0.08998875140607424,
      "grad_norm": 4.422548992141668,
      "learning_rate": 1.4285714285714287e-05,
      "loss": 0.9052,
      "step": 10
    },
    {
      "epoch": 0.09898762654668167,
      "grad_norm": 2.908057739308309,
      "learning_rate": 1.5714285714285715e-05,
      "loss": 0.8738,
      "step": 11
    },
    {
      "epoch": 0.10798650168728909,
      "grad_norm": 2.081438174306188,
      "learning_rate": 1.7142857142857142e-05,
      "loss": 0.8448,
      "step": 12
    },
    {
      "epoch": 0.11698537682789652,
      "grad_norm": 1.8625844126627613,
      "learning_rate": 1.8571428571428575e-05,
      "loss": 0.8214,
      "step": 13
    },
    {
      "epoch": 0.12598425196850394,
      "grad_norm": 1.4150826691986598,
      "learning_rate": 2e-05,
      "loss": 0.8118,
      "step": 14
    },
    {
      "epoch": 0.13498312710911137,
      "grad_norm": 1.3470926001768453,
      "learning_rate": 2.1428571428571428e-05,
      "loss": 0.7967,
      "step": 15
    },
    {
      "epoch": 0.1439820022497188,
      "grad_norm": 1.318138672473746,
      "learning_rate": 2.2857142857142858e-05,
      "loss": 0.7817,
      "step": 16
    },
    {
      "epoch": 0.1529808773903262,
      "grad_norm": 1.132045894481426,
      "learning_rate": 2.4285714285714285e-05,
      "loss": 0.7702,
      "step": 17
    },
    {
      "epoch": 0.16197975253093364,
      "grad_norm": 1.1216235493173696,
      "learning_rate": 2.5714285714285718e-05,
      "loss": 0.7625,
      "step": 18
    },
    {
      "epoch": 0.17097862767154107,
      "grad_norm": 1.2949162303101338,
      "learning_rate": 2.7142857142857148e-05,
      "loss": 0.7552,
      "step": 19
    },
    {
      "epoch": 0.17997750281214847,
      "grad_norm": 1.0018485261634034,
      "learning_rate": 2.8571428571428574e-05,
      "loss": 0.748,
      "step": 20
    },
    {
      "epoch": 0.1889763779527559,
      "grad_norm": 0.9642142358632533,
      "learning_rate": 3.0000000000000004e-05,
      "loss": 0.7319,
      "step": 21
    },
    {
      "epoch": 0.19797525309336333,
      "grad_norm": 1.3162346092752937,
      "learning_rate": 3.142857142857143e-05,
      "loss": 0.7376,
      "step": 22
    },
    {
      "epoch": 0.20697412823397077,
      "grad_norm": 1.2462908815710223,
      "learning_rate": 3.285714285714286e-05,
      "loss": 0.7225,
      "step": 23
    },
    {
      "epoch": 0.21597300337457817,
      "grad_norm": 1.1288729992725366,
      "learning_rate": 3.4285714285714284e-05,
      "loss": 0.7232,
      "step": 24
    },
    {
      "epoch": 0.2249718785151856,
      "grad_norm": 1.2886081610849456,
      "learning_rate": 3.571428571428572e-05,
      "loss": 0.7235,
      "step": 25
    },
    {
      "epoch": 0.23397075365579303,
      "grad_norm": 1.377579395059948,
      "learning_rate": 3.714285714285715e-05,
      "loss": 0.7136,
      "step": 26
    },
    {
      "epoch": 0.24296962879640044,
      "grad_norm": 1.0739406693404967,
      "learning_rate": 3.857142857142858e-05,
      "loss": 0.7138,
      "step": 27
    },
    {
      "epoch": 0.25196850393700787,
      "grad_norm": 1.2160719105328928,
      "learning_rate": 4e-05,
      "loss": 0.7106,
      "step": 28
    },
    {
      "epoch": 0.2609673790776153,
      "grad_norm": 1.3411926888307302,
      "learning_rate": 4.1428571428571437e-05,
      "loss": 0.7056,
      "step": 29
    },
    {
      "epoch": 0.26996625421822273,
      "grad_norm": 0.8344443792471369,
      "learning_rate": 4.2857142857142856e-05,
      "loss": 0.6906,
      "step": 30
    },
    {
      "epoch": 0.27896512935883017,
      "grad_norm": 1.6378121927259697,
      "learning_rate": 4.428571428571429e-05,
      "loss": 0.7022,
      "step": 31
    },
    {
      "epoch": 0.2879640044994376,
      "grad_norm": 1.312672534168789,
      "learning_rate": 4.5714285714285716e-05,
      "loss": 0.6925,
      "step": 32
    },
    {
      "epoch": 0.296962879640045,
      "grad_norm": 1.1997985746020943,
      "learning_rate": 4.714285714285715e-05,
      "loss": 0.6894,
      "step": 33
    },
    {
      "epoch": 0.3059617547806524,
      "grad_norm": 1.3374680408068491,
      "learning_rate": 4.857142857142857e-05,
      "loss": 0.6924,
      "step": 34
    },
    {
      "epoch": 0.31496062992125984,
      "grad_norm": 0.9866632538970782,
      "learning_rate": 5e-05,
      "loss": 0.6856,
      "step": 35
    },
    {
      "epoch": 0.32395950506186727,
      "grad_norm": 1.7617767807118945,
      "learning_rate": 5.1428571428571436e-05,
      "loss": 0.6882,
      "step": 36
    },
    {
      "epoch": 0.3329583802024747,
      "grad_norm": 1.2162422657164105,
      "learning_rate": 5.285714285714286e-05,
      "loss": 0.6827,
      "step": 37
    },
    {
      "epoch": 0.34195725534308213,
      "grad_norm": 1.4381817884560075,
      "learning_rate": 5.4285714285714295e-05,
      "loss": 0.6804,
      "step": 38
    },
    {
      "epoch": 0.35095613048368957,
      "grad_norm": 1.1994316767887803,
      "learning_rate": 5.5714285714285715e-05,
      "loss": 0.675,
      "step": 39
    },
    {
      "epoch": 0.35995500562429694,
      "grad_norm": 1.3859110222701834,
      "learning_rate": 5.714285714285715e-05,
      "loss": 0.6714,
      "step": 40
    },
    {
      "epoch": 0.3689538807649044,
      "grad_norm": 1.0456569888416478,
      "learning_rate": 5.8571428571428575e-05,
      "loss": 0.666,
      "step": 41
    },
    {
      "epoch": 0.3779527559055118,
      "grad_norm": 1.550645312960814,
      "learning_rate": 6.000000000000001e-05,
      "loss": 0.6716,
      "step": 42
    },
    {
      "epoch": 0.38695163104611924,
      "grad_norm": 1.7520944490642492,
      "learning_rate": 6.142857142857143e-05,
      "loss": 0.6668,
      "step": 43
    },
    {
      "epoch": 0.39595050618672667,
      "grad_norm": 1.220392434494151,
      "learning_rate": 6.285714285714286e-05,
      "loss": 0.6706,
      "step": 44
    },
    {
      "epoch": 0.4049493813273341,
      "grad_norm": 1.5640678258841063,
      "learning_rate": 6.428571428571429e-05,
      "loss": 0.6666,
      "step": 45
    },
    {
      "epoch": 0.41394825646794153,
      "grad_norm": 1.0339470330368512,
      "learning_rate": 6.571428571428571e-05,
      "loss": 0.6611,
      "step": 46
    },
    {
      "epoch": 0.4229471316085489,
      "grad_norm": 1.5630693263603568,
      "learning_rate": 6.714285714285715e-05,
      "loss": 0.6654,
      "step": 47
    },
    {
      "epoch": 0.43194600674915634,
      "grad_norm": 1.4317212538586248,
      "learning_rate": 6.857142857142857e-05,
      "loss": 0.666,
      "step": 48
    },
    {
      "epoch": 0.4409448818897638,
      "grad_norm": 1.2652376704046324,
      "learning_rate": 7.000000000000001e-05,
      "loss": 0.6525,
      "step": 49
    },
    {
      "epoch": 0.4499437570303712,
      "grad_norm": 1.9389193690579092,
      "learning_rate": 7.142857142857143e-05,
      "loss": 0.6706,
      "step": 50
    },
    {
      "epoch": 0.45894263217097864,
      "grad_norm": 1.2095199143374322,
      "learning_rate": 7.285714285714286e-05,
      "loss": 0.6578,
      "step": 51
    },
    {
      "epoch": 0.46794150731158607,
      "grad_norm": 2.1530205341425943,
      "learning_rate": 7.42857142857143e-05,
      "loss": 0.6659,
      "step": 52
    },
    {
      "epoch": 0.4769403824521935,
      "grad_norm": 1.3498447393418538,
      "learning_rate": 7.571428571428571e-05,
      "loss": 0.6555,
      "step": 53
    },
    {
      "epoch": 0.4859392575928009,
      "grad_norm": 1.7344034955388106,
      "learning_rate": 7.714285714285715e-05,
      "loss": 0.6655,
      "step": 54
    },
    {
      "epoch": 0.4949381327334083,
      "grad_norm": 2.165550267732371,
      "learning_rate": 7.857142857142858e-05,
      "loss": 0.6593,
      "step": 55
    },
    {
      "epoch": 0.5039370078740157,
      "grad_norm": 1.6288021163787398,
      "learning_rate": 8e-05,
      "loss": 0.6541,
      "step": 56
    },
    {
      "epoch": 0.5129358830146231,
      "grad_norm": 1.6669527952524703,
      "learning_rate": 7.999920726649282e-05,
      "loss": 0.6421,
      "step": 57
    },
    {
      "epoch": 0.5219347581552306,
      "grad_norm": 1.2703954706550948,
      "learning_rate": 7.999682909739257e-05,
      "loss": 0.6416,
      "step": 58
    },
    {
      "epoch": 0.530933633295838,
      "grad_norm": 1.6224554752837215,
      "learning_rate": 7.999286558696199e-05,
      "loss": 0.6427,
      "step": 59
    },
    {
      "epoch": 0.5399325084364455,
      "grad_norm": 1.3278739806750215,
      "learning_rate": 7.998731689230145e-05,
      "loss": 0.6535,
      "step": 60
    },
    {
      "epoch": 0.5489313835770528,
      "grad_norm": 1.9995409784489284,
      "learning_rate": 7.998018323334275e-05,
      "loss": 0.6496,
      "step": 61
    },
    {
      "epoch": 0.5579302587176603,
      "grad_norm": 1.5356642072108388,
      "learning_rate": 7.997146489284042e-05,
      "loss": 0.6437,
      "step": 62
    },
    {
      "epoch": 0.5669291338582677,
      "grad_norm": 1.256225547258712,
      "learning_rate": 7.996116221636049e-05,
      "loss": 0.6478,
      "step": 63
    },
    {
      "epoch": 0.5759280089988752,
      "grad_norm": 1.7177343714374778,
      "learning_rate": 7.994927561226682e-05,
      "loss": 0.641,
      "step": 64
    },
    {
      "epoch": 0.5849268841394826,
      "grad_norm": 1.8288469591109016,
      "learning_rate": 7.993580555170486e-05,
      "loss": 0.6483,
      "step": 65
    },
    {
      "epoch": 0.59392575928009,
      "grad_norm": 0.9174977020863746,
      "learning_rate": 7.992075256858302e-05,
      "loss": 0.639,
      "step": 66
    },
    {
      "epoch": 0.6029246344206974,
      "grad_norm": 2.3068233658832846,
      "learning_rate": 7.990411725955153e-05,
      "loss": 0.6469,
      "step": 67
    },
    {
      "epoch": 0.6119235095613048,
      "grad_norm": 1.331580694798539,
      "learning_rate": 7.98859002839787e-05,
      "loss": 0.6403,
      "step": 68
    },
    {
      "epoch": 0.6209223847019123,
      "grad_norm": 1.9556860563217953,
      "learning_rate": 7.986610236392491e-05,
      "loss": 0.6436,
      "step": 69
    },
    {
      "epoch": 0.6299212598425197,
      "grad_norm": 1.4030777265618954,
      "learning_rate": 7.984472428411388e-05,
      "loss": 0.6541,
      "step": 70
    },
    {
      "epoch": 0.6389201349831272,
      "grad_norm": 1.1120271583327745,
      "learning_rate": 7.98217668919016e-05,
      "loss": 0.6408,
      "step": 71
    },
    {
      "epoch": 0.6479190101237345,
      "grad_norm": 1.7515091640927072,
      "learning_rate": 7.97972310972428e-05,
      "loss": 0.6462,
      "step": 72
    },
    {
      "epoch": 0.6569178852643419,
      "grad_norm": 1.2154666389278914,
      "learning_rate": 7.977111787265479e-05,
      "loss": 0.6348,
      "step": 73
    },
    {
      "epoch": 0.6659167604049494,
      "grad_norm": 1.1948666759848607,
      "learning_rate": 7.9743428253179e-05,
      "loss": 0.6398,
      "step": 74
    },
    {
      "epoch": 0.6749156355455568,
      "grad_norm": 1.4465768500981744,
      "learning_rate": 7.971416333633984e-05,
      "loss": 0.633,
      "step": 75
    },
    {
      "epoch": 0.6839145106861643,
      "grad_norm": 0.7079358166882018,
      "learning_rate": 7.968332428210136e-05,
      "loss": 0.6344,
      "step": 76
    },
    {
      "epoch": 0.6929133858267716,
      "grad_norm": 1.2354926269354385,
      "learning_rate": 7.965091231282114e-05,
      "loss": 0.6316,
      "step": 77
    },
    {
      "epoch": 0.7019122609673791,
      "grad_norm": 0.8980787882545499,
      "learning_rate": 7.961692871320186e-05,
      "loss": 0.6384,
      "step": 78
    },
    {
      "epoch": 0.7109111361079865,
      "grad_norm": 1.327053450288205,
      "learning_rate": 7.958137483024044e-05,
      "loss": 0.6348,
      "step": 79
    },
    {
      "epoch": 0.7199100112485939,
      "grad_norm": 1.0872221047887007,
      "learning_rate": 7.95442520731746e-05,
      "loss": 0.6319,
      "step": 80
    },
    {
      "epoch": 0.7289088863892014,
      "grad_norm": 1.0406772088732106,
      "learning_rate": 7.9505561913427e-05,
      "loss": 0.6255,
      "step": 81
    },
    {
      "epoch": 0.7379077615298087,
      "grad_norm": 1.3043796341854046,
      "learning_rate": 7.946530588454695e-05,
      "loss": 0.6324,
      "step": 82
    },
    {
      "epoch": 0.7469066366704162,
      "grad_norm": 1.0289090852613956,
      "learning_rate": 7.942348558214958e-05,
      "loss": 0.6326,
      "step": 83
    },
    {
      "epoch": 0.7559055118110236,
      "grad_norm": 1.309994146626142,
      "learning_rate": 7.938010266385268e-05,
      "loss": 0.6362,
      "step": 84
    },
    {
      "epoch": 0.7649043869516311,
      "grad_norm": 0.9949679280573976,
      "learning_rate": 7.933515884921086e-05,
      "loss": 0.6274,
      "step": 85
    },
    {
      "epoch": 0.7739032620922385,
      "grad_norm": 1.3529279590081953,
      "learning_rate": 7.928865591964751e-05,
      "loss": 0.6356,
      "step": 86
    },
    {
      "epoch": 0.7829021372328459,
      "grad_norm": 1.5394293742781517,
      "learning_rate": 7.924059571838419e-05,
      "loss": 0.6239,
      "step": 87
    },
    {
      "epoch": 0.7919010123734533,
      "grad_norm": 0.8230315586506134,
      "learning_rate": 7.919098015036746e-05,
      "loss": 0.6254,
      "step": 88
    },
    {
      "epoch": 0.8008998875140607,
      "grad_norm": 2.5412889042558646,
      "learning_rate": 7.91398111821935e-05,
      "loss": 0.6386,
      "step": 89
    },
    {
      "epoch": 0.8098987626546682,
      "grad_norm": 1.6534609255338006,
      "learning_rate": 7.908709084203006e-05,
      "loss": 0.6433,
      "step": 90
    },
    {
      "epoch": 0.8188976377952756,
      "grad_norm": 2.115550172588045,
      "learning_rate": 7.903282121953619e-05,
      "loss": 0.6274,
      "step": 91
    },
    {
      "epoch": 0.8278965129358831,
      "grad_norm": 2.21745507570489,
      "learning_rate": 7.897700446577928e-05,
      "loss": 0.6395,
      "step": 92
    },
    {
      "epoch": 0.8368953880764904,
      "grad_norm": 1.1348223149712242,
      "learning_rate": 7.891964279314988e-05,
      "loss": 0.6304,
      "step": 93
    },
    {
      "epoch": 0.8458942632170978,
      "grad_norm": 1.1093790619140609,
      "learning_rate": 7.886073847527397e-05,
      "loss": 0.634,
      "step": 94
    },
    {
      "epoch": 0.8548931383577053,
      "grad_norm": 1.0247299216217371,
      "learning_rate": 7.88002938469229e-05,
      "loss": 0.6187,
      "step": 95
    },
    {
      "epoch": 0.8638920134983127,
      "grad_norm": 0.9690359150703725,
      "learning_rate": 7.873831130392077e-05,
      "loss": 0.6224,
      "step": 96
    },
    {
      "epoch": 0.8728908886389202,
      "grad_norm": 0.8479503109980445,
      "learning_rate": 7.867479330304951e-05,
      "loss": 0.6129,
      "step": 97
    },
    {
      "epoch": 0.8818897637795275,
      "grad_norm": 0.7677468105242354,
      "learning_rate": 7.860974236195151e-05,
      "loss": 0.6133,
      "step": 98
    },
    {
      "epoch": 0.890888638920135,
      "grad_norm": 1.0066107483900497,
      "learning_rate": 7.85431610590298e-05,
      "loss": 0.6212,
      "step": 99
    },
    {
      "epoch": 0.8998875140607424,
      "grad_norm": 0.8385258369213318,
      "learning_rate": 7.847505203334588e-05,
      "loss": 0.6162,
      "step": 100
    },
    {
      "epoch": 0.9088863892013498,
      "grad_norm": 0.8097143309117704,
      "learning_rate": 7.840541798451506e-05,
      "loss": 0.6105,
      "step": 101
    },
    {
      "epoch": 0.9178852643419573,
      "grad_norm": 0.6217223076576118,
      "learning_rate": 7.833426167259955e-05,
      "loss": 0.6091,
      "step": 102
    },
    {
      "epoch": 0.9268841394825647,
      "grad_norm": 0.8171222793279067,
      "learning_rate": 7.826158591799898e-05,
      "loss": 0.6124,
      "step": 103
    },
    {
      "epoch": 0.9358830146231721,
      "grad_norm": 0.825619425206217,
      "learning_rate": 7.818739360133863e-05,
      "loss": 0.6093,
      "step": 104
    },
    {
      "epoch": 0.9448818897637795,
      "grad_norm": 0.9916352114638577,
      "learning_rate": 7.811168766335531e-05,
      "loss": 0.6152,
      "step": 105
    },
    {
      "epoch": 0.953880764904387,
      "grad_norm": 1.0337582772307539,
      "learning_rate": 7.803447110478067e-05,
      "loss": 0.6183,
      "step": 106
    },
    {
      "epoch": 0.9628796400449944,
      "grad_norm": 0.9165715880682846,
      "learning_rate": 7.795574698622237e-05,
      "loss": 0.6176,
      "step": 107
    },
    {
      "epoch": 0.9718785151856018,
      "grad_norm": 0.6530649555546069,
      "learning_rate": 7.787551842804276e-05,
      "loss": 0.6099,
      "step": 108
    },
    {
      "epoch": 0.9808773903262092,
      "grad_norm": 0.7668074713592107,
      "learning_rate": 7.779378861023516e-05,
      "loss": 0.6063,
      "step": 109
    },
    {
      "epoch": 0.9898762654668166,
      "grad_norm": 0.715515594924498,
      "learning_rate": 7.77105607722978e-05,
      "loss": 0.6088,
      "step": 110
    },
    {
      "epoch": 0.9988751406074241,
      "grad_norm": 0.7194843087763119,
      "learning_rate": 7.762583821310548e-05,
      "loss": 0.6027,
      "step": 111
    },
    {
      "epoch": 1.0078740157480315,
      "grad_norm": 1.3373684763348388,
      "learning_rate": 7.753962429077881e-05,
      "loss": 1.109,
      "step": 112
    },
    {
      "epoch": 1.0168728908886389,
      "grad_norm": 1.2665705644041705,
      "learning_rate": 7.7451922422551e-05,
      "loss": 0.5896,
      "step": 113
    },
    {
      "epoch": 1.0258717660292462,
      "grad_norm": 0.7146916740896777,
      "learning_rate": 7.736273608463253e-05,
      "loss": 0.5776,
      "step": 114
    },
    {
      "epoch": 1.0348706411698538,
      "grad_norm": 0.8865840936771311,
      "learning_rate": 7.727206881207334e-05,
      "loss": 0.5907,
      "step": 115
    },
    {
      "epoch": 1.0438695163104612,
      "grad_norm": 0.7277897994412549,
      "learning_rate": 7.717992419862268e-05,
      "loss": 0.5873,
      "step": 116
    },
    {
      "epoch": 1.0528683914510686,
      "grad_norm": 0.6536923705270321,
      "learning_rate": 7.708630589658667e-05,
      "loss": 0.5835,
      "step": 117
    },
    {
      "epoch": 1.061867266591676,
      "grad_norm": 0.7162373408409907,
      "learning_rate": 7.699121761668355e-05,
      "loss": 0.5794,
      "step": 118
    },
    {
      "epoch": 1.0708661417322836,
      "grad_norm": 1.0254372509038927,
      "learning_rate": 7.689466312789661e-05,
      "loss": 0.582,
      "step": 119
    },
    {
      "epoch": 1.079865016872891,
      "grad_norm": 1.1137703756989226,
      "learning_rate": 7.679664625732478e-05,
      "loss": 0.5907,
      "step": 120
    },
    {
      "epoch": 1.0888638920134983,
      "grad_norm": 0.5744577610457379,
      "learning_rate": 7.669717089003094e-05,
      "loss": 0.5742,
      "step": 121
    },
    {
      "epoch": 1.0978627671541057,
      "grad_norm": 0.6451796147865981,
      "learning_rate": 7.659624096888792e-05,
      "loss": 0.5831,
      "step": 122
    },
    {
      "epoch": 1.106861642294713,
      "grad_norm": 0.6276129538814031,
      "learning_rate": 7.649386049442223e-05,
      "loss": 0.5827,
      "step": 123
    },
    {
      "epoch": 1.1158605174353207,
      "grad_norm": 0.7718645523573185,
      "learning_rate": 7.639003352465551e-05,
      "loss": 0.5745,
      "step": 124
    },
    {
      "epoch": 1.124859392575928,
      "grad_norm": 0.9206192827571739,
      "learning_rate": 7.628476417494368e-05,
      "loss": 0.5839,
      "step": 125
    },
    {
      "epoch": 1.1338582677165354,
      "grad_norm": 0.698663737317266,
      "learning_rate": 7.617805661781374e-05,
      "loss": 0.5754,
      "step": 126
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.5240414441327959,
      "learning_rate": 7.60699150827985e-05,
      "loss": 0.5772,
      "step": 127
    },
    {
      "epoch": 1.1518560179977504,
      "grad_norm": 0.5843162509749283,
      "learning_rate": 7.596034385626888e-05,
      "loss": 0.5727,
      "step": 128
    },
    {
      "epoch": 1.1608548931383578,
      "grad_norm": 0.7391156549586344,
      "learning_rate": 7.584934728126403e-05,
      "loss": 0.5736,
      "step": 129
    },
    {
      "epoch": 1.1698537682789651,
      "grad_norm": 0.6964429639639632,
      "learning_rate": 7.573692975731914e-05,
      "loss": 0.5677,
      "step": 130
    },
    {
      "epoch": 1.1788526434195725,
      "grad_norm": 0.5118585657523064,
      "learning_rate": 7.562309574029112e-05,
      "loss": 0.5796,
      "step": 131
    },
    {
      "epoch": 1.18785151856018,
      "grad_norm": 0.3947628953294446,
      "learning_rate": 7.550784974218195e-05,
      "loss": 0.5669,
      "step": 132
    },
    {
      "epoch": 1.1968503937007875,
      "grad_norm": 0.37395524397542534,
      "learning_rate": 7.539119633095983e-05,
      "loss": 0.5641,
      "step": 133
    },
    {
      "epoch": 1.2058492688413949,
      "grad_norm": 0.37361515124611633,
      "learning_rate": 7.527314013037815e-05,
      "loss": 0.5763,
      "step": 134
    },
    {
      "epoch": 1.2148481439820022,
      "grad_norm": 0.4839280159646048,
      "learning_rate": 7.515368581979224e-05,
      "loss": 0.5703,
      "step": 135
    },
    {
      "epoch": 1.2238470191226096,
      "grad_norm": 0.6174733585746707,
      "learning_rate": 7.503283813397379e-05,
      "loss": 0.5703,
      "step": 136
    },
    {
      "epoch": 1.232845894263217,
      "grad_norm": 0.7319038998136339,
      "learning_rate": 7.491060186292331e-05,
      "loss": 0.5715,
      "step": 137
    },
    {
      "epoch": 1.2418447694038246,
      "grad_norm": 0.7864687770694577,
      "learning_rate": 7.478698185168019e-05,
      "loss": 0.5714,
      "step": 138
    },
    {
      "epoch": 1.250843644544432,
      "grad_norm": 0.7693307388456105,
      "learning_rate": 7.466198300013066e-05,
      "loss": 0.5761,
      "step": 139
    },
    {
      "epoch": 1.2598425196850394,
      "grad_norm": 0.7329449732496208,
      "learning_rate": 7.453561026281366e-05,
      "loss": 0.5697,
      "step": 140
    },
    {
      "epoch": 1.2688413948256467,
      "grad_norm": 0.7219015663079241,
      "learning_rate": 7.440786864872433e-05,
      "loss": 0.5725,
      "step": 141
    },
    {
      "epoch": 1.277840269966254,
      "grad_norm": 0.7161519080047792,
      "learning_rate": 7.427876322111558e-05,
      "loss": 0.5737,
      "step": 142
    },
    {
      "epoch": 1.2868391451068617,
      "grad_norm": 0.7985855229347674,
      "learning_rate": 7.414829909729727e-05,
      "loss": 0.5724,
      "step": 143
    },
    {
      "epoch": 1.295838020247469,
      "grad_norm": 0.8867813272578328,
      "learning_rate": 7.40164814484336e-05,
      "loss": 0.5725,
      "step": 144
    },
    {
      "epoch": 1.3048368953880765,
      "grad_norm": 0.8113361068419839,
      "learning_rate": 7.388331549933787e-05,
      "loss": 0.5743,
      "step": 145
    },
    {
      "epoch": 1.3138357705286838,
      "grad_norm": 0.5476904687412559,
      "learning_rate": 7.37488065282656e-05,
      "loss": 0.5658,
      "step": 146
    },
    {
      "epoch": 1.3228346456692912,
      "grad_norm": 0.45189065402105816,
      "learning_rate": 7.361295986670522e-05,
      "loss": 0.5708,
      "step": 147
    },
    {
      "epoch": 1.3318335208098988,
      "grad_norm": 0.751469386998812,
      "learning_rate": 7.347578089916672e-05,
      "loss": 0.5629,
      "step": 148
    },
    {
      "epoch": 1.3408323959505062,
      "grad_norm": 0.7162670873895776,
      "learning_rate": 7.333727506296831e-05,
      "loss": 0.5711,
      "step": 149
    },
    {
      "epoch": 1.3498312710911136,
      "grad_norm": 0.5796332830646129,
      "learning_rate": 7.319744784802087e-05,
      "loss": 0.5701,
      "step": 150
    },
    {
      "epoch": 1.3588301462317212,
      "grad_norm": 0.5806837398044375,
      "learning_rate": 7.305630479661033e-05,
      "loss": 0.568,
      "step": 151
    },
    {
      "epoch": 1.3678290213723285,
      "grad_norm": 0.36656011961132856,
      "learning_rate": 7.291385150317796e-05,
      "loss": 0.5704,
      "step": 152
    },
    {
      "epoch": 1.376827896512936,
      "grad_norm": 0.4854553631469011,
      "learning_rate": 7.277009361409874e-05,
      "loss": 0.5626,
      "step": 153
    },
    {
      "epoch": 1.3858267716535433,
      "grad_norm": 0.7060268694492198,
      "learning_rate": 7.262503682745744e-05,
      "loss": 0.5682,
      "step": 154
    },
    {
      "epoch": 1.3948256467941507,
      "grad_norm": 0.6290468975159552,
      "learning_rate": 7.247868689282283e-05,
      "loss": 0.5652,
      "step": 155
    },
    {
      "epoch": 1.4038245219347583,
      "grad_norm": 0.5124920327952127,
      "learning_rate": 7.233104961101974e-05,
      "loss": 0.5702,
      "step": 156
    },
    {
      "epoch": 1.4128233970753656,
      "grad_norm": 0.435666421319146,
      "learning_rate": 7.21821308338992e-05,
      "loss": 0.5656,
      "step": 157
    },
    {
      "epoch": 1.421822272215973,
      "grad_norm": 0.31789150940846095,
      "learning_rate": 7.203193646410642e-05,
      "loss": 0.5625,
      "step": 158
    },
    {
      "epoch": 1.4308211473565804,
      "grad_norm": 0.44522630815692826,
      "learning_rate": 7.188047245484686e-05,
      "loss": 0.5622,
      "step": 159
    },
    {
      "epoch": 1.4398200224971878,
      "grad_norm": 0.5340918126651806,
      "learning_rate": 7.172774480965033e-05,
      "loss": 0.5663,
      "step": 160
    },
    {
      "epoch": 1.4488188976377954,
      "grad_norm": 0.5126590833097094,
      "learning_rate": 7.157375958213288e-05,
      "loss": 0.5604,
      "step": 161
    },
    {
      "epoch": 1.4578177727784027,
      "grad_norm": 0.5574211540917228,
      "learning_rate": 7.141852287575701e-05,
      "loss": 0.5644,
      "step": 162
    },
    {
      "epoch": 1.4668166479190101,
      "grad_norm": 0.49894825274633764,
      "learning_rate": 7.126204084358963e-05,
      "loss": 0.5543,
      "step": 163
    },
    {
      "epoch": 1.4758155230596175,
      "grad_norm": 0.383714620666216,
      "learning_rate": 7.110431968805825e-05,
      "loss": 0.5667,
      "step": 164
    },
    {
      "epoch": 1.4848143982002249,
      "grad_norm": 0.3679127273669197,
      "learning_rate": 7.094536566070514e-05,
      "loss": 0.5649,
      "step": 165
    },
    {
      "epoch": 1.4938132733408325,
      "grad_norm": 0.41052750978866387,
      "learning_rate": 7.078518506193945e-05,
      "loss": 0.5597,
      "step": 166
    },
    {
      "epoch": 1.5028121484814398,
      "grad_norm": 0.4161268377444414,
      "learning_rate": 7.062378424078758e-05,
      "loss": 0.5683,
      "step": 167
    },
    {
      "epoch": 1.5118110236220472,
      "grad_norm": 0.4143309696590428,
      "learning_rate": 7.046116959464149e-05,
      "loss": 0.5627,
      "step": 168
    },
    {
      "epoch": 1.5208098987626548,
      "grad_norm": 0.47622613149080295,
      "learning_rate": 7.02973475690051e-05,
      "loss": 0.5619,
      "step": 169
    },
    {
      "epoch": 1.529808773903262,
      "grad_norm": 0.5903824719429629,
      "learning_rate": 7.013232465723888e-05,
      "loss": 0.5685,
      "step": 170
    },
    {
      "epoch": 1.5388076490438696,
      "grad_norm": 0.6410524866967787,
      "learning_rate": 6.996610740030237e-05,
      "loss": 0.5649,
      "step": 171
    },
    {
      "epoch": 1.547806524184477,
      "grad_norm": 0.5120196634127667,
      "learning_rate": 6.979870238649506e-05,
      "loss": 0.5641,
      "step": 172
    },
    {
      "epoch": 1.5568053993250843,
      "grad_norm": 0.46968763374354905,
      "learning_rate": 6.963011625119514e-05,
      "loss": 0.5607,
      "step": 173
    },
    {
      "epoch": 1.565804274465692,
      "grad_norm": 0.7346438690156033,
      "learning_rate": 6.94603556765965e-05,
      "loss": 0.5632,
      "step": 174
    },
    {
      "epoch": 1.574803149606299,
      "grad_norm": 0.9133676726822815,
      "learning_rate": 6.928942739144394e-05,
      "loss": 0.5561,
      "step": 175
    },
    {
      "epoch": 1.5838020247469067,
      "grad_norm": 0.9699546226599123,
      "learning_rate": 6.911733817076638e-05,
      "loss": 0.5684,
      "step": 176
    },
    {
      "epoch": 1.592800899887514,
      "grad_norm": 0.8546409930461957,
      "learning_rate": 6.894409483560845e-05,
      "loss": 0.5636,
      "step": 177
    },
    {
      "epoch": 1.6017997750281214,
      "grad_norm": 0.5767270121981363,
      "learning_rate": 6.876970425275993e-05,
      "loss": 0.5636,
      "step": 178
    },
    {
      "epoch": 1.610798650168729,
      "grad_norm": 0.4603868453512875,
      "learning_rate": 6.859417333448376e-05,
      "loss": 0.5657,
      "step": 179
    },
    {
      "epoch": 1.6197975253093362,
      "grad_norm": 0.4657088093589826,
      "learning_rate": 6.841750903824196e-05,
      "loss": 0.553,
      "step": 180
    },
    {
      "epoch": 1.6287964004499438,
      "grad_norm": 0.4245830377495125,
      "learning_rate": 6.823971836641988e-05,
      "loss": 0.553,
      "step": 181
    },
    {
      "epoch": 1.6377952755905512,
      "grad_norm": 0.38313124117842345,
      "learning_rate": 6.806080836604868e-05,
      "loss": 0.5606,
      "step": 182
    },
    {
      "epoch": 1.6467941507311585,
      "grad_norm": 0.3951074598265005,
      "learning_rate": 6.788078612852596e-05,
      "loss": 0.5611,
      "step": 183
    },
    {
      "epoch": 1.6557930258717661,
      "grad_norm": 0.3324519467665917,
      "learning_rate": 6.769965878933468e-05,
      "loss": 0.5589,
      "step": 184
    },
    {
      "epoch": 1.6647919010123733,
      "grad_norm": 0.30525105111986356,
      "learning_rate": 6.751743352776041e-05,
      "loss": 0.5513,
      "step": 185
    },
    {
      "epoch": 1.6737907761529809,
      "grad_norm": 0.38317669421888395,
      "learning_rate": 6.733411756660668e-05,
      "loss": 0.561,
      "step": 186
    },
    {
      "epoch": 1.6827896512935883,
      "grad_norm": 0.36452922899831897,
      "learning_rate": 6.714971817190872e-05,
      "loss": 0.5565,
      "step": 187
    },
    {
      "epoch": 1.6917885264341956,
      "grad_norm": 0.45451136606739156,
      "learning_rate": 6.696424265264549e-05,
      "loss": 0.5554,
      "step": 188
    },
    {
      "epoch": 1.7007874015748032,
      "grad_norm": 0.5850188409579592,
      "learning_rate": 6.677769836044991e-05,
      "loss": 0.5603,
      "step": 189
    },
    {
      "epoch": 1.7097862767154106,
      "grad_norm": 0.6094742834131825,
      "learning_rate": 6.659009268931756e-05,
      "loss": 0.5574,
      "step": 190
    },
    {
      "epoch": 1.718785151856018,
      "grad_norm": 0.6284162963180803,
      "learning_rate": 6.64014330753135e-05,
      "loss": 0.5507,
      "step": 191
    },
    {
      "epoch": 1.7277840269966256,
      "grad_norm": 0.6446183014432966,
      "learning_rate": 6.621172699627761e-05,
      "loss": 0.5551,
      "step": 192
    },
    {
      "epoch": 1.7367829021372327,
      "grad_norm": 0.6604713611773748,
      "learning_rate": 6.602098197152817e-05,
      "loss": 0.555,
      "step": 193
    },
    {
      "epoch": 1.7457817772778403,
      "grad_norm": 0.7414347856141247,
      "learning_rate": 6.582920556156378e-05,
      "loss": 0.5631,
      "step": 194
    },
    {
      "epoch": 1.7547806524184477,
      "grad_norm": 0.7773047456674285,
      "learning_rate": 6.563640536776375e-05,
      "loss": 0.56,
      "step": 195
    },
    {
      "epoch": 1.763779527559055,
      "grad_norm": 0.6417760152496987,
      "learning_rate": 6.544258903208679e-05,
      "loss": 0.5518,
      "step": 196
    },
    {
      "epoch": 1.7727784026996627,
      "grad_norm": 0.3607864424657309,
      "learning_rate": 6.524776423676806e-05,
      "loss": 0.55,
      "step": 197
    },
    {
      "epoch": 1.7817772778402698,
      "grad_norm": 0.4683312923869542,
      "learning_rate": 6.505193870401472e-05,
      "loss": 0.5579,
      "step": 198
    },
    {
      "epoch": 1.7907761529808774,
      "grad_norm": 0.6344422826239488,
      "learning_rate": 6.485512019569986e-05,
      "loss": 0.5628,
      "step": 199
    },
    {
      "epoch": 1.7997750281214848,
      "grad_norm": 0.48250024105262346,
      "learning_rate": 6.465731651305475e-05,
      "loss": 0.5561,
      "step": 200
    },
    {
      "epoch": 1.8087739032620922,
      "grad_norm": 0.4011620009633319,
      "learning_rate": 6.445853549635982e-05,
      "loss": 0.5589,
      "step": 201
    },
    {
      "epoch": 1.8177727784026998,
      "grad_norm": 0.46120746083720787,
      "learning_rate": 6.425878502463363e-05,
      "loss": 0.5464,
      "step": 202
    },
    {
      "epoch": 1.826771653543307,
      "grad_norm": 0.3366919397807797,
      "learning_rate": 6.405807301532082e-05,
      "loss": 0.5583,
      "step": 203
    },
    {
      "epoch": 1.8357705286839145,
      "grad_norm": 0.3775528944092533,
      "learning_rate": 6.38564074239781e-05,
      "loss": 0.5515,
      "step": 204
    },
    {
      "epoch": 1.844769403824522,
      "grad_norm": 0.4646001384485239,
      "learning_rate": 6.365379624395911e-05,
      "loss": 0.5545,
      "step": 205
    },
    {
      "epoch": 1.8537682789651293,
      "grad_norm": 0.3173359890548618,
      "learning_rate": 6.345024750609735e-05,
      "loss": 0.5511,
      "step": 206
    },
    {
      "epoch": 1.862767154105737,
      "grad_norm": 0.30771535160757224,
      "learning_rate": 6.324576927838811e-05,
      "loss": 0.5569,
      "step": 207
    },
    {
      "epoch": 1.871766029246344,
      "grad_norm": 0.39138826318539643,
      "learning_rate": 6.30403696656685e-05,
      "loss": 0.5507,
      "step": 208
    },
    {
      "epoch": 1.8807649043869517,
      "grad_norm": 0.39402910206893743,
      "learning_rate": 6.28340568092963e-05,
      "loss": 0.551,
      "step": 209
    },
    {
      "epoch": 1.889763779527559,
      "grad_norm": 0.42067586027851417,
      "learning_rate": 6.26268388868272e-05,
      "loss": 0.5552,
      "step": 210
    },
    {
      "epoch": 1.8987626546681664,
      "grad_norm": 0.39427161933244775,
      "learning_rate": 6.241872411169075e-05,
      "loss": 0.5515,
      "step": 211
    },
    {
      "epoch": 1.907761529808774,
      "grad_norm": 0.2940525255889463,
      "learning_rate": 6.220972073286469e-05,
      "loss": 0.5452,
      "step": 212
    },
    {
      "epoch": 1.9167604049493814,
      "grad_norm": 0.34048373966582285,
      "learning_rate": 6.199983703454813e-05,
      "loss": 0.5509,
      "step": 213
    },
    {
      "epoch": 1.9257592800899888,
      "grad_norm": 0.299832377853422,
      "learning_rate": 6.178908133583306e-05,
      "loss": 0.5456,
      "step": 214
    },
    {
      "epoch": 1.9347581552305961,
      "grad_norm": 0.2392599130105496,
      "learning_rate": 6.157746199037473e-05,
      "loss": 0.5525,
      "step": 215
    },
    {
      "epoch": 1.9437570303712035,
      "grad_norm": 0.2884892083080534,
      "learning_rate": 6.136498738606038e-05,
      "loss": 0.5494,
      "step": 216
    },
    {
      "epoch": 1.952755905511811,
      "grad_norm": 0.25915586035077326,
      "learning_rate": 6.115166594467696e-05,
      "loss": 0.5578,
      "step": 217
    },
    {
      "epoch": 1.9617547806524185,
      "grad_norm": 0.2404861202851113,
      "learning_rate": 6.093750612157719e-05,
      "loss": 0.5501,
      "step": 218
    },
    {
      "epoch": 1.9707536557930259,
      "grad_norm": 0.27079067912862353,
      "learning_rate": 6.0722516405344436e-05,
      "loss": 0.5544,
      "step": 219
    },
    {
      "epoch": 1.9797525309336335,
      "grad_norm": 0.27569696549284556,
      "learning_rate": 6.050670531745629e-05,
      "loss": 0.5436,
      "step": 220
    },
    {
      "epoch": 1.9887514060742406,
      "grad_norm": 0.39262044690390413,
      "learning_rate": 6.0290081411946785e-05,
      "loss": 0.5589,
      "step": 221
    },
    {
      "epoch": 1.9977502812148482,
      "grad_norm": 0.3759638408126938,
      "learning_rate": 6.007265327506734e-05,
      "loss": 0.5489,
      "step": 222
    },
    {
      "epoch": 2.0067491563554554,
      "grad_norm": 0.6721068777680985,
      "learning_rate": 5.985442952494643e-05,
      "loss": 0.9973,
      "step": 223
    },
    {
      "epoch": 2.015748031496063,
      "grad_norm": 1.2532035078989137,
      "learning_rate": 5.9635418811248e-05,
      "loss": 0.5251,
      "step": 224
    },
    {
      "epoch": 2.0247469066366706,
      "grad_norm": 1.3965146746879655,
      "learning_rate": 5.941562981482859e-05,
      "loss": 0.5222,
      "step": 225
    },
    {
      "epoch": 2.0337457817772777,
      "grad_norm": 0.47181130829523693,
      "learning_rate": 5.9195071247393325e-05,
      "loss": 0.5175,
      "step": 226
    },
    {
      "epoch": 2.0427446569178853,
      "grad_norm": 1.2945631540598115,
      "learning_rate": 5.897375185115052e-05,
      "loss": 0.5219,
      "step": 227
    },
    {
      "epoch": 2.0517435320584925,
      "grad_norm": 0.753799335003198,
      "learning_rate": 5.8751680398465244e-05,
      "loss": 0.5248,
      "step": 228
    },
    {
      "epoch": 2.0607424071991,
      "grad_norm": 0.7482718095479951,
      "learning_rate": 5.8528865691511564e-05,
      "loss": 0.531,
      "step": 229
    },
    {
      "epoch": 2.0697412823397077,
      "grad_norm": 0.9072745588806806,
      "learning_rate": 5.83053165619237e-05,
      "loss": 0.5185,
      "step": 230
    },
    {
      "epoch": 2.078740157480315,
      "grad_norm": 0.7594832390100079,
      "learning_rate": 5.808104187044592e-05,
      "loss": 0.5238,
      "step": 231
    },
    {
      "epoch": 2.0877390326209224,
      "grad_norm": 0.7018923770640809,
      "learning_rate": 5.785605050658134e-05,
      "loss": 0.5116,
      "step": 232
    },
    {
      "epoch": 2.09673790776153,
      "grad_norm": 0.6538828939619331,
      "learning_rate": 5.7630351388239654e-05,
      "loss": 0.5188,
      "step": 233
    },
    {
      "epoch": 2.105736782902137,
      "grad_norm": 0.5009696448905507,
      "learning_rate": 5.7403953461383515e-05,
      "loss": 0.5189,
      "step": 234
    },
    {
      "epoch": 2.1147356580427448,
      "grad_norm": 0.5418468893978772,
      "learning_rate": 5.717686569967406e-05,
      "loss": 0.513,
      "step": 235
    },
    {
      "epoch": 2.123734533183352,
      "grad_norm": 0.5552312277008503,
      "learning_rate": 5.694909710411517e-05,
      "loss": 0.5136,
      "step": 236
    },
    {
      "epoch": 2.1327334083239595,
      "grad_norm": 0.4618715663984918,
      "learning_rate": 5.672065670269674e-05,
      "loss": 0.5226,
      "step": 237
    },
    {
      "epoch": 2.141732283464567,
      "grad_norm": 0.4634338024463565,
      "learning_rate": 5.649155355003677e-05,
      "loss": 0.517,
      "step": 238
    },
    {
      "epoch": 2.1507311586051743,
      "grad_norm": 0.35434206430397164,
      "learning_rate": 5.6261796727022575e-05,
      "loss": 0.5166,
      "step": 239
    },
    {
      "epoch": 2.159730033745782,
      "grad_norm": 0.46283864431852784,
      "learning_rate": 5.603139534045075e-05,
      "loss": 0.5112,
      "step": 240
    },
    {
      "epoch": 2.168728908886389,
      "grad_norm": 0.42575715836926703,
      "learning_rate": 5.5800358522666254e-05,
      "loss": 0.5078,
      "step": 241
    },
    {
      "epoch": 2.1777277840269966,
      "grad_norm": 0.33272119313147885,
      "learning_rate": 5.556869543120043e-05,
      "loss": 0.5194,
      "step": 242
    },
    {
      "epoch": 2.1867266591676042,
      "grad_norm": 0.3307000934681126,
      "learning_rate": 5.533641524840805e-05,
      "loss": 0.5177,
      "step": 243
    },
    {
      "epoch": 2.1957255343082114,
      "grad_norm": 0.3028678329115125,
      "learning_rate": 5.5103527181103266e-05,
      "loss": 0.5169,
      "step": 244
    },
    {
      "epoch": 2.204724409448819,
      "grad_norm": 0.3208549750095541,
      "learning_rate": 5.4870040460194834e-05,
      "loss": 0.5166,
      "step": 245
    },
    {
      "epoch": 2.213723284589426,
      "grad_norm": 0.24593852068559713,
      "learning_rate": 5.463596434032011e-05,
      "loss": 0.5162,
      "step": 246
    },
    {
      "epoch": 2.2227221597300337,
      "grad_norm": 0.26893184750686033,
      "learning_rate": 5.440130809947824e-05,
      "loss": 0.5211,
      "step": 247
    },
    {
      "epoch": 2.2317210348706413,
      "grad_norm": 0.2453740819753266,
      "learning_rate": 5.4166081038662484e-05,
      "loss": 0.5114,
      "step": 248
    },
    {
      "epoch": 2.2407199100112485,
      "grad_norm": 0.22939493074869363,
      "learning_rate": 5.3930292481491494e-05,
      "loss": 0.5101,
      "step": 249
    },
    {
      "epoch": 2.249718785151856,
      "grad_norm": 0.25733357689003183,
      "learning_rate": 5.3693951773839736e-05,
      "loss": 0.5111,
      "step": 250
    },
    {
      "epoch": 2.2587176602924632,
      "grad_norm": 0.2252436543398025,
      "learning_rate": 5.345706828346715e-05,
      "loss": 0.516,
      "step": 251
    },
    {
      "epoch": 2.267716535433071,
      "grad_norm": 0.28121431385301127,
      "learning_rate": 5.3219651399647716e-05,
      "loss": 0.5168,
      "step": 252
    },
    {
      "epoch": 2.2767154105736784,
      "grad_norm": 0.2450809438398232,
      "learning_rate": 5.298171053279739e-05,
      "loss": 0.5089,
      "step": 253
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.22424110667284097,
      "learning_rate": 5.274325511410106e-05,
      "loss": 0.5165,
      "step": 254
    },
    {
      "epoch": 2.294713160854893,
      "grad_norm": 0.2360828175959895,
      "learning_rate": 5.250429459513876e-05,
      "loss": 0.5148,
      "step": 255
    },
    {
      "epoch": 2.303712035995501,
      "grad_norm": 0.2188304429809361,
      "learning_rate": 5.226483844751099e-05,
      "loss": 0.5098,
      "step": 256
    },
    {
      "epoch": 2.312710911136108,
      "grad_norm": 0.2334512539619716,
      "learning_rate": 5.202489616246333e-05,
      "loss": 0.5158,
      "step": 257
    },
    {
      "epoch": 2.3217097862767155,
      "grad_norm": 0.19345732584763242,
      "learning_rate": 5.178447725051026e-05,
      "loss": 0.5137,
      "step": 258
    },
    {
      "epoch": 2.3307086614173227,
      "grad_norm": 0.22726718301152563,
      "learning_rate": 5.154359124105814e-05,
      "loss": 0.5149,
      "step": 259
    },
    {
      "epoch": 2.3397075365579303,
      "grad_norm": 0.2149304217989204,
      "learning_rate": 5.1302247682027494e-05,
      "loss": 0.5122,
      "step": 260
    },
    {
      "epoch": 2.3487064116985374,
      "grad_norm": 0.2574574491521757,
      "learning_rate": 5.106045613947466e-05,
      "loss": 0.5106,
      "step": 261
    },
    {
      "epoch": 2.357705286839145,
      "grad_norm": 0.2244044383603563,
      "learning_rate": 5.08182261972125e-05,
      "loss": 0.5098,
      "step": 262
    },
    {
      "epoch": 2.3667041619797526,
      "grad_norm": 0.1722686877882969,
      "learning_rate": 5.057556745643058e-05,
      "loss": 0.5152,
      "step": 263
    },
    {
      "epoch": 2.37570303712036,
      "grad_norm": 0.20786422486929515,
      "learning_rate": 5.033248953531466e-05,
      "loss": 0.5096,
      "step": 264
    },
    {
      "epoch": 2.3847019122609674,
      "grad_norm": 0.20518803463398436,
      "learning_rate": 5.0089002068665376e-05,
      "loss": 0.5108,
      "step": 265
    },
    {
      "epoch": 2.393700787401575,
      "grad_norm": 0.19783852278781122,
      "learning_rate": 4.9845114707516395e-05,
      "loss": 0.5169,
      "step": 266
    },
    {
      "epoch": 2.402699662542182,
      "grad_norm": 0.15894601748576243,
      "learning_rate": 4.9600837118751874e-05,
      "loss": 0.5077,
      "step": 267
    },
    {
      "epoch": 2.4116985376827897,
      "grad_norm": 0.19443345762052888,
      "learning_rate": 4.93561789847233e-05,
      "loss": 0.5211,
      "step": 268
    },
    {
      "epoch": 2.420697412823397,
      "grad_norm": 0.19638270288569096,
      "learning_rate": 4.91111500028657e-05,
      "loss": 0.5098,
      "step": 269
    },
    {
      "epoch": 2.4296962879640045,
      "grad_norm": 0.17816319122872262,
      "learning_rate": 4.886575988531329e-05,
      "loss": 0.5175,
      "step": 270
    },
    {
      "epoch": 2.438695163104612,
      "grad_norm": 0.17651511328007938,
      "learning_rate": 4.862001835851449e-05,
      "loss": 0.5136,
      "step": 271
    },
    {
      "epoch": 2.4476940382452193,
      "grad_norm": 0.1840295184610878,
      "learning_rate": 4.837393516284642e-05,
      "loss": 0.5099,
      "step": 272
    },
    {
      "epoch": 2.456692913385827,
      "grad_norm": 0.1914170149726011,
      "learning_rate": 4.8127520052228815e-05,
      "loss": 0.5165,
      "step": 273
    },
    {
      "epoch": 2.465691788526434,
      "grad_norm": 0.184680647588443,
      "learning_rate": 4.788078279373744e-05,
      "loss": 0.5084,
      "step": 274
    },
    {
      "epoch": 2.4746906636670416,
      "grad_norm": 0.22287775595578857,
      "learning_rate": 4.763373316721687e-05,
      "loss": 0.5134,
      "step": 275
    },
    {
      "epoch": 2.483689538807649,
      "grad_norm": 0.25180987790543646,
      "learning_rate": 4.7386380964892984e-05,
      "loss": 0.5137,
      "step": 276
    },
    {
      "epoch": 2.4926884139482564,
      "grad_norm": 0.2645098272392606,
      "learning_rate": 4.713873599098471e-05,
      "loss": 0.5128,
      "step": 277
    },
    {
      "epoch": 2.501687289088864,
      "grad_norm": 0.16671664940075334,
      "learning_rate": 4.689080806131547e-05,
      "loss": 0.5114,
      "step": 278
    },
    {
      "epoch": 2.5106861642294716,
      "grad_norm": 0.22458637707427415,
      "learning_rate": 4.664260700292416e-05,
      "loss": 0.5095,
      "step": 279
    },
    {
      "epoch": 2.5196850393700787,
      "grad_norm": 0.20330358237599677,
      "learning_rate": 4.639414265367554e-05,
      "loss": 0.509,
      "step": 280
    },
    {
      "epoch": 2.5286839145106863,
      "grad_norm": 0.19076267160273108,
      "learning_rate": 4.614542486187033e-05,
      "loss": 0.5135,
      "step": 281
    },
    {
      "epoch": 2.5376827896512935,
      "grad_norm": 0.19390672620858349,
      "learning_rate": 4.589646348585494e-05,
      "loss": 0.5129,
      "step": 282
    },
    {
      "epoch": 2.546681664791901,
      "grad_norm": 0.17431590192861887,
      "learning_rate": 4.564726839363059e-05,
      "loss": 0.5119,
      "step": 283
    },
    {
      "epoch": 2.555680539932508,
      "grad_norm": 0.19796573953993535,
      "learning_rate": 4.539784946246225e-05,
      "loss": 0.5226,
      "step": 284
    },
    {
      "epoch": 2.564679415073116,
      "grad_norm": 0.21806700674738302,
      "learning_rate": 4.5148216578487134e-05,
      "loss": 0.5205,
      "step": 285
    },
    {
      "epoch": 2.5736782902137234,
      "grad_norm": 0.21026084289689406,
      "learning_rate": 4.4898379636322815e-05,
      "loss": 0.5078,
      "step": 286
    },
    {
      "epoch": 2.5826771653543306,
      "grad_norm": 0.18917285015389398,
      "learning_rate": 4.4648348538675064e-05,
      "loss": 0.5146,
      "step": 287
    },
    {
      "epoch": 2.591676040494938,
      "grad_norm": 0.17876905148977648,
      "learning_rate": 4.4398133195945326e-05,
      "loss": 0.5124,
      "step": 288
    },
    {
      "epoch": 2.6006749156355458,
      "grad_norm": 0.19991148697525993,
      "learning_rate": 4.414774352583791e-05,
      "loss": 0.5141,
      "step": 289
    },
    {
      "epoch": 2.609673790776153,
      "grad_norm": 0.1718371307146163,
      "learning_rate": 4.3897189452966895e-05,
      "loss": 0.5126,
      "step": 290
    },
    {
      "epoch": 2.6186726659167605,
      "grad_norm": 0.1685835159809121,
      "learning_rate": 4.364648090846271e-05,
      "loss": 0.5133,
      "step": 291
    },
    {
      "epoch": 2.6276715410573677,
      "grad_norm": 0.14950354028807825,
      "learning_rate": 4.339562782957857e-05,
      "loss": 0.5083,
      "step": 292
    },
    {
      "epoch": 2.6366704161979753,
      "grad_norm": 0.17804715853771555,
      "learning_rate": 4.314464015929649e-05,
      "loss": 0.5194,
      "step": 293
    },
    {
      "epoch": 2.6456692913385824,
      "grad_norm": 0.17707266681302694,
      "learning_rate": 4.28935278459333e-05,
      "loss": 0.5128,
      "step": 294
    },
    {
      "epoch": 2.65466816647919,
      "grad_norm": 0.15273475864497393,
      "learning_rate": 4.264230084274624e-05,
      "loss": 0.5148,
      "step": 295
    },
    {
      "epoch": 2.6636670416197976,
      "grad_norm": 0.19292123693836521,
      "learning_rate": 4.239096910753846e-05,
      "loss": 0.5097,
      "step": 296
    },
    {
      "epoch": 2.6726659167604048,
      "grad_norm": 0.19491285573674288,
      "learning_rate": 4.213954260226438e-05,
      "loss": 0.5108,
      "step": 297
    },
    {
      "epoch": 2.6816647919010124,
      "grad_norm": 0.16377738874278228,
      "learning_rate": 4.188803129263476e-05,
      "loss": 0.5084,
      "step": 298
    },
    {
      "epoch": 2.69066366704162,
      "grad_norm": 0.17707808785911283,
      "learning_rate": 4.163644514772172e-05,
      "loss": 0.5078,
      "step": 299
    },
    {
      "epoch": 2.699662542182227,
      "grad_norm": 0.16129691634220186,
      "learning_rate": 4.1384794139563614e-05,
      "loss": 0.5177,
      "step": 300
    },
    {
      "epoch": 2.7086614173228347,
      "grad_norm": 0.19221250489172526,
      "learning_rate": 4.113308824276977e-05,
      "loss": 0.5149,
      "step": 301
    },
    {
      "epoch": 2.7176602924634423,
      "grad_norm": 0.17286893566543513,
      "learning_rate": 4.0881337434125086e-05,
      "loss": 0.5088,
      "step": 302
    },
    {
      "epoch": 2.7266591676040495,
      "grad_norm": 0.1755459663317774,
      "learning_rate": 4.0629551692194634e-05,
      "loss": 0.509,
      "step": 303
    },
    {
      "epoch": 2.735658042744657,
      "grad_norm": 0.1791201340425473,
      "learning_rate": 4.037774099692815e-05,
      "loss": 0.5143,
      "step": 304
    },
    {
      "epoch": 2.7446569178852642,
      "grad_norm": 0.18143794043338418,
      "learning_rate": 4.0125915329264396e-05,
      "loss": 0.5123,
      "step": 305
    },
    {
      "epoch": 2.753655793025872,
      "grad_norm": 0.15111984385935012,
      "learning_rate": 3.9874084670735624e-05,
      "loss": 0.5132,
      "step": 306
    },
    {
      "epoch": 2.762654668166479,
      "grad_norm": 0.17406268964374014,
      "learning_rate": 3.962225900307187e-05,
      "loss": 0.5088,
      "step": 307
    },
    {
      "epoch": 2.7716535433070866,
      "grad_norm": 0.1845218510232061,
      "learning_rate": 3.937044830780537e-05,
      "loss": 0.5115,
      "step": 308
    },
    {
      "epoch": 2.780652418447694,
      "grad_norm": 0.14408452277861603,
      "learning_rate": 3.9118662565874934e-05,
      "loss": 0.5088,
      "step": 309
    },
    {
      "epoch": 2.7896512935883013,
      "grad_norm": 0.2121818690921875,
      "learning_rate": 3.886691175723025e-05,
      "loss": 0.515,
      "step": 310
    },
    {
      "epoch": 2.798650168728909,
      "grad_norm": 0.17415478680558893,
      "learning_rate": 3.8615205860436406e-05,
      "loss": 0.5102,
      "step": 311
    },
    {
      "epoch": 2.8076490438695165,
      "grad_norm": 0.17710673827275958,
      "learning_rate": 3.83635548522783e-05,
      "loss": 0.5142,
      "step": 312
    },
    {
      "epoch": 2.8166479190101237,
      "grad_norm": 0.2131327323874231,
      "learning_rate": 3.811196870736526e-05,
      "loss": 0.5143,
      "step": 313
    },
    {
      "epoch": 2.8256467941507313,
      "grad_norm": 0.17135728004223627,
      "learning_rate": 3.786045739773564e-05,
      "loss": 0.5066,
      "step": 314
    },
    {
      "epoch": 2.8346456692913384,
      "grad_norm": 0.1787641234702102,
      "learning_rate": 3.7609030892461554e-05,
      "loss": 0.5077,
      "step": 315
    },
    {
      "epoch": 2.843644544431946,
      "grad_norm": 0.1737180877569213,
      "learning_rate": 3.735769915725378e-05,
      "loss": 0.5104,
      "step": 316
    },
    {
      "epoch": 2.852643419572553,
      "grad_norm": 0.1759592145643783,
      "learning_rate": 3.710647215406672e-05,
      "loss": 0.5076,
      "step": 317
    },
    {
      "epoch": 2.861642294713161,
      "grad_norm": 0.1674590421194775,
      "learning_rate": 3.6855359840703525e-05,
      "loss": 0.5117,
      "step": 318
    },
    {
      "epoch": 2.8706411698537684,
      "grad_norm": 0.19195727087026487,
      "learning_rate": 3.660437217042145e-05,
      "loss": 0.515,
      "step": 319
    },
    {
      "epoch": 2.8796400449943755,
      "grad_norm": 0.2079526174400504,
      "learning_rate": 3.63535190915373e-05,
      "loss": 0.5114,
      "step": 320
    },
    {
      "epoch": 2.888638920134983,
      "grad_norm": 0.16261938739039505,
      "learning_rate": 3.610281054703311e-05,
      "loss": 0.512,
      "step": 321
    },
    {
      "epoch": 2.8976377952755907,
      "grad_norm": 0.17301946358427484,
      "learning_rate": 3.58522564741621e-05,
      "loss": 0.513,
      "step": 322
    },
    {
      "epoch": 2.906636670416198,
      "grad_norm": 0.15809619671547212,
      "learning_rate": 3.560186680405469e-05,
      "loss": 0.5092,
      "step": 323
    },
    {
      "epoch": 2.9156355455568055,
      "grad_norm": 0.18005047534402024,
      "learning_rate": 3.535165146132494e-05,
      "loss": 0.5143,
      "step": 324
    },
    {
      "epoch": 2.924634420697413,
      "grad_norm": 0.17055434106531733,
      "learning_rate": 3.51016203636772e-05,
      "loss": 0.5044,
      "step": 325
    },
    {
      "epoch": 2.9336332958380202,
      "grad_norm": 0.2091344061636353,
      "learning_rate": 3.485178342151287e-05,
      "loss": 0.5151,
      "step": 326
    },
    {
      "epoch": 2.942632170978628,
      "grad_norm": 0.17776622234467257,
      "learning_rate": 3.460215053753776e-05,
      "loss": 0.506,
      "step": 327
    },
    {
      "epoch": 2.951631046119235,
      "grad_norm": 0.2020120609501547,
      "learning_rate": 3.435273160636942e-05,
      "loss": 0.5106,
      "step": 328
    },
    {
      "epoch": 2.9606299212598426,
      "grad_norm": 0.17580656405878176,
      "learning_rate": 3.410353651414507e-05,
      "loss": 0.509,
      "step": 329
    },
    {
      "epoch": 2.9696287964004497,
      "grad_norm": 0.1874074410148767,
      "learning_rate": 3.385457513812968e-05,
      "loss": 0.5119,
      "step": 330
    },
    {
      "epoch": 2.9786276715410573,
      "grad_norm": 0.18596738197007068,
      "learning_rate": 3.360585734632448e-05,
      "loss": 0.5105,
      "step": 331
    },
    {
      "epoch": 2.987626546681665,
      "grad_norm": 0.18012608783890774,
      "learning_rate": 3.3357392997075854e-05,
      "loss": 0.51,
      "step": 332
    },
    {
      "epoch": 2.996625421822272,
      "grad_norm": 0.17695247800719235,
      "learning_rate": 3.3109191938684535e-05,
      "loss": 0.5096,
      "step": 333
    },
    {
      "epoch": 3.0056242969628797,
      "grad_norm": 0.4041448073125774,
      "learning_rate": 3.28612640090153e-05,
      "loss": 0.9217,
      "step": 334
    },
    {
      "epoch": 3.014623172103487,
      "grad_norm": 0.40335779907517166,
      "learning_rate": 3.261361903510703e-05,
      "loss": 0.4749,
      "step": 335
    },
    {
      "epoch": 3.0236220472440944,
      "grad_norm": 0.23571838314313354,
      "learning_rate": 3.2366266832783145e-05,
      "loss": 0.4795,
      "step": 336
    },
    {
      "epoch": 3.032620922384702,
      "grad_norm": 0.34962469353680325,
      "learning_rate": 3.211921720626258e-05,
      "loss": 0.4772,
      "step": 337
    },
    {
      "epoch": 3.041619797525309,
      "grad_norm": 0.3250568724209024,
      "learning_rate": 3.187247994777119e-05,
      "loss": 0.4756,
      "step": 338
    },
    {
      "epoch": 3.050618672665917,
      "grad_norm": 0.27683092123819425,
      "learning_rate": 3.1626064837153596e-05,
      "loss": 0.478,
      "step": 339
    },
    {
      "epoch": 3.0596175478065244,
      "grad_norm": 0.29128324637433173,
      "learning_rate": 3.1379981641485524e-05,
      "loss": 0.4767,
      "step": 340
    },
    {
      "epoch": 3.0686164229471316,
      "grad_norm": 0.2783998825962506,
      "learning_rate": 3.113424011468672e-05,
      "loss": 0.4758,
      "step": 341
| }, | |
| { | |
| "epoch": 3.077615298087739, | |
| "grad_norm": 0.25380987160914664, | |
| "learning_rate": 3.0888849997134316e-05, | |
| "loss": 0.4663, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 3.0866141732283463, | |
| "grad_norm": 0.23893069831482153, | |
| "learning_rate": 3.064382101527671e-05, | |
| "loss": 0.4788, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 3.095613048368954, | |
| "grad_norm": 0.2546443581638444, | |
| "learning_rate": 3.039916288124814e-05, | |
| "loss": 0.4794, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 3.1046119235095615, | |
| "grad_norm": 0.2193178093849395, | |
| "learning_rate": 3.015488529248362e-05, | |
| "loss": 0.479, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 3.1136107986501687, | |
| "grad_norm": 0.23966451017854798, | |
| "learning_rate": 2.9910997931334637e-05, | |
| "loss": 0.4754, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 3.1226096737907763, | |
| "grad_norm": 0.2258415034497983, | |
| "learning_rate": 2.9667510464685345e-05, | |
| "loss": 0.4825, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 3.1316085489313834, | |
| "grad_norm": 0.23731192604698906, | |
| "learning_rate": 2.9424432543569428e-05, | |
| "loss": 0.4723, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 3.140607424071991, | |
| "grad_norm": 0.17268744214169082, | |
| "learning_rate": 2.918177380278752e-05, | |
| "loss": 0.4763, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 3.1496062992125986, | |
| "grad_norm": 0.23327367459895917, | |
| "learning_rate": 2.893954386052535e-05, | |
| "loss": 0.4786, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.1586051743532058, | |
| "grad_norm": 0.1956862643734178, | |
| "learning_rate": 2.8697752317972513e-05, | |
| "loss": 0.4765, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 3.1676040494938134, | |
| "grad_norm": 0.18627973371325526, | |
| "learning_rate": 2.845640875894188e-05, | |
| "loss": 0.4745, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 3.1766029246344205, | |
| "grad_norm": 0.2105328504046573, | |
| "learning_rate": 2.8215522749489742e-05, | |
| "loss": 0.4804, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 3.185601799775028, | |
| "grad_norm": 0.16272715621667633, | |
| "learning_rate": 2.7975103837536672e-05, | |
| "loss": 0.472, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 3.1946006749156357, | |
| "grad_norm": 0.21097547189014748, | |
| "learning_rate": 2.7735161552489022e-05, | |
| "loss": 0.4749, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 3.203599550056243, | |
| "grad_norm": 0.17765983026821935, | |
| "learning_rate": 2.749570540486125e-05, | |
| "loss": 0.4774, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 3.2125984251968505, | |
| "grad_norm": 0.18185508796151498, | |
| "learning_rate": 2.7256744885898942e-05, | |
| "loss": 0.4769, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 3.2215973003374576, | |
| "grad_norm": 0.17545115382309487, | |
| "learning_rate": 2.7018289467202623e-05, | |
| "loss": 0.4707, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 3.230596175478065, | |
| "grad_norm": 0.16118530505673961, | |
| "learning_rate": 2.6780348600352284e-05, | |
| "loss": 0.4751, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 3.239595050618673, | |
| "grad_norm": 0.17928864347147178, | |
| "learning_rate": 2.6542931716532856e-05, | |
| "loss": 0.4807, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 3.24859392575928, | |
| "grad_norm": 0.16478061360464155, | |
| "learning_rate": 2.630604822616027e-05, | |
| "loss": 0.4736, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 3.2575928008998876, | |
| "grad_norm": 0.1478559482793346, | |
| "learning_rate": 2.6069707518508523e-05, | |
| "loss": 0.4781, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 3.2665916760404947, | |
| "grad_norm": 0.16844157556458098, | |
| "learning_rate": 2.583391896133753e-05, | |
| "loss": 0.4765, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 3.2755905511811023, | |
| "grad_norm": 0.1337175278664685, | |
| "learning_rate": 2.5598691900521778e-05, | |
| "loss": 0.4741, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 3.28458942632171, | |
| "grad_norm": 0.13894808378571294, | |
| "learning_rate": 2.5364035659679914e-05, | |
| "loss": 0.4762, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 3.293588301462317, | |
| "grad_norm": 0.14038211286961325, | |
| "learning_rate": 2.512995953980518e-05, | |
| "loss": 0.4785, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 3.3025871766029247, | |
| "grad_norm": 0.13577552755343492, | |
| "learning_rate": 2.4896472818896743e-05, | |
| "loss": 0.4798, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 3.3115860517435323, | |
| "grad_norm": 0.14414082887532978, | |
| "learning_rate": 2.4663584751591977e-05, | |
| "loss": 0.4784, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 3.3205849268841394, | |
| "grad_norm": 0.1499138881871864, | |
| "learning_rate": 2.443130456879958e-05, | |
| "loss": 0.475, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 3.329583802024747, | |
| "grad_norm": 0.15378837222105818, | |
| "learning_rate": 2.4199641477333766e-05, | |
| "loss": 0.4695, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.338582677165354, | |
| "grad_norm": 0.15500949142269607, | |
| "learning_rate": 2.3968604659549266e-05, | |
| "loss": 0.4808, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 3.3475815523059618, | |
| "grad_norm": 0.13510134534596135, | |
| "learning_rate": 2.3738203272977446e-05, | |
| "loss": 0.4762, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 3.3565804274465694, | |
| "grad_norm": 0.1580364610829008, | |
| "learning_rate": 2.350844644996325e-05, | |
| "loss": 0.4756, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 3.3655793025871765, | |
| "grad_norm": 0.1447596119360514, | |
| "learning_rate": 2.3279343297303293e-05, | |
| "loss": 0.476, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 3.374578177727784, | |
| "grad_norm": 0.13783465555399632, | |
| "learning_rate": 2.305090289588485e-05, | |
| "loss": 0.478, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.3835770528683913, | |
| "grad_norm": 0.1422413809528974, | |
| "learning_rate": 2.2823134300325948e-05, | |
| "loss": 0.4717, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 3.392575928008999, | |
| "grad_norm": 0.12982390766734286, | |
| "learning_rate": 2.25960465386165e-05, | |
| "loss": 0.4787, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 3.4015748031496065, | |
| "grad_norm": 0.1445554728575935, | |
| "learning_rate": 2.2369648611760352e-05, | |
| "loss": 0.4728, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 3.4105736782902136, | |
| "grad_norm": 0.14091047744757867, | |
| "learning_rate": 2.2143949493418654e-05, | |
| "loss": 0.4778, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 3.4195725534308212, | |
| "grad_norm": 0.15001963385136857, | |
| "learning_rate": 2.1918958129554106e-05, | |
| "loss": 0.4764, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.4285714285714284, | |
| "grad_norm": 0.13768741985271635, | |
| "learning_rate": 2.1694683438076317e-05, | |
| "loss": 0.471, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 3.437570303712036, | |
| "grad_norm": 0.14948373435356904, | |
| "learning_rate": 2.147113430848844e-05, | |
| "loss": 0.4697, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 3.4465691788526436, | |
| "grad_norm": 0.13361989178999917, | |
| "learning_rate": 2.1248319601534772e-05, | |
| "loss": 0.4707, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 3.4555680539932507, | |
| "grad_norm": 0.13151795053856452, | |
| "learning_rate": 2.102624814884949e-05, | |
| "loss": 0.4747, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 3.4645669291338583, | |
| "grad_norm": 0.14805117724302444, | |
| "learning_rate": 2.080492875260668e-05, | |
| "loss": 0.4765, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.4735658042744655, | |
| "grad_norm": 0.15118784969552582, | |
| "learning_rate": 2.0584370185171418e-05, | |
| "loss": 0.4727, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 3.482564679415073, | |
| "grad_norm": 0.14047665231367965, | |
| "learning_rate": 2.0364581188752012e-05, | |
| "loss": 0.4819, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 3.4915635545556807, | |
| "grad_norm": 0.15664210880968063, | |
| "learning_rate": 2.014557047505357e-05, | |
| "loss": 0.466, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 3.500562429696288, | |
| "grad_norm": 0.13241547005518312, | |
| "learning_rate": 1.992734672493267e-05, | |
| "loss": 0.4772, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 3.5095613048368954, | |
| "grad_norm": 0.15495785626518518, | |
| "learning_rate": 1.970991858805322e-05, | |
| "loss": 0.4768, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.518560179977503, | |
| "grad_norm": 0.1451652460628155, | |
| "learning_rate": 1.9493294682543715e-05, | |
| "loss": 0.4752, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 3.52755905511811, | |
| "grad_norm": 0.15292105555983718, | |
| "learning_rate": 1.927748359465558e-05, | |
| "loss": 0.4768, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 3.536557930258718, | |
| "grad_norm": 0.1457418788935837, | |
| "learning_rate": 1.9062493878422823e-05, | |
| "loss": 0.4768, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 3.545556805399325, | |
| "grad_norm": 0.13975287470844083, | |
| "learning_rate": 1.884833405532304e-05, | |
| "loss": 0.4757, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 3.5545556805399325, | |
| "grad_norm": 0.14640008673656155, | |
| "learning_rate": 1.863501261393963e-05, | |
| "loss": 0.4813, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.5635545556805397, | |
| "grad_norm": 0.12836792930570248, | |
| "learning_rate": 1.8422538009625285e-05, | |
| "loss": 0.4738, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 3.5725534308211473, | |
| "grad_norm": 0.13526217288902667, | |
| "learning_rate": 1.8210918664166945e-05, | |
| "loss": 0.4762, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 3.581552305961755, | |
| "grad_norm": 0.1420752062696319, | |
| "learning_rate": 1.8000162965451884e-05, | |
| "loss": 0.4803, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 3.590551181102362, | |
| "grad_norm": 0.1279945537635078, | |
| "learning_rate": 1.7790279267135317e-05, | |
| "loss": 0.4829, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 3.5995500562429696, | |
| "grad_norm": 0.13720312932073742, | |
| "learning_rate": 1.758127588830928e-05, | |
| "loss": 0.4756, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.6085489313835772, | |
| "grad_norm": 0.11571164848869406, | |
| "learning_rate": 1.737316111317281e-05, | |
| "loss": 0.4788, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 3.6175478065241844, | |
| "grad_norm": 0.1267184023160666, | |
| "learning_rate": 1.716594319070371e-05, | |
| "loss": 0.4672, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 3.626546681664792, | |
| "grad_norm": 0.12093756371303144, | |
| "learning_rate": 1.695963033433151e-05, | |
| "loss": 0.4767, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 3.6355455568053996, | |
| "grad_norm": 0.1281599915828755, | |
| "learning_rate": 1.6754230721611896e-05, | |
| "loss": 0.4762, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 3.6445444319460067, | |
| "grad_norm": 0.12299228584682197, | |
| "learning_rate": 1.654975249390265e-05, | |
| "loss": 0.477, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.653543307086614, | |
| "grad_norm": 0.13855705326136183, | |
| "learning_rate": 1.634620375604091e-05, | |
| "loss": 0.4791, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 3.6625421822272215, | |
| "grad_norm": 0.12445550210040791, | |
| "learning_rate": 1.6143592576021897e-05, | |
| "loss": 0.4746, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 3.671541057367829, | |
| "grad_norm": 0.1328319119905009, | |
| "learning_rate": 1.594192698467919e-05, | |
| "loss": 0.4753, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 3.6805399325084363, | |
| "grad_norm": 0.12084524067527391, | |
| "learning_rate": 1.574121497536638e-05, | |
| "loss": 0.4778, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 3.689538807649044, | |
| "grad_norm": 0.12320850922380423, | |
| "learning_rate": 1.5541464503640195e-05, | |
| "loss": 0.4718, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.6985376827896514, | |
| "grad_norm": 0.11554496711321419, | |
| "learning_rate": 1.534268348694524e-05, | |
| "loss": 0.4734, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 3.7075365579302586, | |
| "grad_norm": 0.11815716191127555, | |
| "learning_rate": 1.5144879804300163e-05, | |
| "loss": 0.4701, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 3.716535433070866, | |
| "grad_norm": 0.12388808731535488, | |
| "learning_rate": 1.4948061295985286e-05, | |
| "loss": 0.4786, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 3.725534308211474, | |
| "grad_norm": 0.11077052559901988, | |
| "learning_rate": 1.4752235763231944e-05, | |
| "loss": 0.4716, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 3.734533183352081, | |
| "grad_norm": 0.11829062763031385, | |
| "learning_rate": 1.4557410967913219e-05, | |
| "loss": 0.4819, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 3.7435320584926886, | |
| "grad_norm": 0.12116279675489595, | |
| "learning_rate": 1.4363594632236249e-05, | |
| "loss": 0.4746, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 3.7525309336332957, | |
| "grad_norm": 0.11924444626526026, | |
| "learning_rate": 1.4170794438436236e-05, | |
| "loss": 0.4826, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 3.7615298087739033, | |
| "grad_norm": 0.1117894145977932, | |
| "learning_rate": 1.3979018028471858e-05, | |
| "loss": 0.4719, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 3.7705286839145105, | |
| "grad_norm": 0.11651706087134892, | |
| "learning_rate": 1.3788273003722404e-05, | |
| "loss": 0.4789, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 3.779527559055118, | |
| "grad_norm": 0.11055442276894334, | |
| "learning_rate": 1.3598566924686511e-05, | |
| "loss": 0.4796, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.7885264341957257, | |
| "grad_norm": 0.10345352334778583, | |
| "learning_rate": 1.3409907310682462e-05, | |
| "loss": 0.4709, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 3.797525309336333, | |
| "grad_norm": 0.10232180540617608, | |
| "learning_rate": 1.3222301639550099e-05, | |
| "loss": 0.4766, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 3.8065241844769404, | |
| "grad_norm": 0.10715279829216998, | |
| "learning_rate": 1.3035757347354526e-05, | |
| "loss": 0.4776, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 3.815523059617548, | |
| "grad_norm": 0.1047852036739672, | |
| "learning_rate": 1.2850281828091298e-05, | |
| "loss": 0.4854, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 3.824521934758155, | |
| "grad_norm": 0.10690185941320834, | |
| "learning_rate": 1.2665882433393338e-05, | |
| "loss": 0.4729, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 3.8335208098987628, | |
| "grad_norm": 0.11484140024382984, | |
| "learning_rate": 1.24825664722396e-05, | |
| "loss": 0.475, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 3.84251968503937, | |
| "grad_norm": 0.11568039124788146, | |
| "learning_rate": 1.2300341210665336e-05, | |
| "loss": 0.4758, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 3.8515185601799775, | |
| "grad_norm": 0.10120504659325706, | |
| "learning_rate": 1.211921387147406e-05, | |
| "loss": 0.4732, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 3.8605174353205847, | |
| "grad_norm": 0.1194598344922678, | |
| "learning_rate": 1.1939191633951328e-05, | |
| "loss": 0.4768, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 3.8695163104611923, | |
| "grad_norm": 0.1090408000804906, | |
| "learning_rate": 1.1760281633580136e-05, | |
| "loss": 0.4741, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.8785151856018, | |
| "grad_norm": 0.10733618962308203, | |
| "learning_rate": 1.1582490961758057e-05, | |
| "loss": 0.479, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 3.887514060742407, | |
| "grad_norm": 0.11031251576967621, | |
| "learning_rate": 1.1405826665516253e-05, | |
| "loss": 0.4768, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 3.8965129358830146, | |
| "grad_norm": 0.11348215054291673, | |
| "learning_rate": 1.1230295747240092e-05, | |
| "loss": 0.4714, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 3.905511811023622, | |
| "grad_norm": 0.10993620934878778, | |
| "learning_rate": 1.1055905164391567e-05, | |
| "loss": 0.4785, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 3.9145106861642294, | |
| "grad_norm": 0.10346360621624287, | |
| "learning_rate": 1.0882661829233619e-05, | |
| "loss": 0.4789, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 3.923509561304837, | |
| "grad_norm": 0.10965908377773818, | |
| "learning_rate": 1.071057260855608e-05, | |
| "loss": 0.4759, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 3.9325084364454446, | |
| "grad_norm": 0.10838534658950197, | |
| "learning_rate": 1.0539644323403514e-05, | |
| "loss": 0.4732, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 3.9415073115860517, | |
| "grad_norm": 0.10164569710162663, | |
| "learning_rate": 1.0369883748804868e-05, | |
| "loss": 0.4777, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 3.9505061867266593, | |
| "grad_norm": 0.11037796996055305, | |
| "learning_rate": 1.0201297613504946e-05, | |
| "loss": 0.4765, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 3.9595050618672665, | |
| "grad_norm": 0.10427806979539742, | |
| "learning_rate": 1.0033892599697638e-05, | |
| "loss": 0.4724, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.968503937007874, | |
| "grad_norm": 0.10553954203749775, | |
| "learning_rate": 9.86767534276114e-06, | |
| "loss": 0.4827, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 3.9775028121484812, | |
| "grad_norm": 0.10616804220182864, | |
| "learning_rate": 9.702652430994917e-06, | |
| "loss": 0.4752, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 3.986501687289089, | |
| "grad_norm": 0.10240539280603339, | |
| "learning_rate": 9.538830405358523e-06, | |
| "loss": 0.4754, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 3.9955005624296964, | |
| "grad_norm": 0.10491697645612037, | |
| "learning_rate": 9.376215759212423e-06, | |
| "loss": 0.477, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 4.008998875140607, | |
| "grad_norm": 0.17334062662555766, | |
| "learning_rate": 9.214814938060561e-06, | |
| "loss": 0.4526, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 4.017997750281215, | |
| "grad_norm": 0.13435271730849074, | |
| "learning_rate": 9.054634339294867e-06, | |
| "loss": 0.4497, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 4.026996625421822, | |
| "grad_norm": 0.11872819062021366, | |
| "learning_rate": 8.895680311941745e-06, | |
| "loss": 0.4455, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 4.0359955005624295, | |
| "grad_norm": 0.1319008455279487, | |
| "learning_rate": 8.737959156410385e-06, | |
| "loss": 0.4527, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 4.0449943757030375, | |
| "grad_norm": 0.14206834817022773, | |
| "learning_rate": 8.581477124243002e-06, | |
| "loss": 0.451, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 4.053993250843645, | |
| "grad_norm": 0.14857898517649198, | |
| "learning_rate": 8.426240417867121e-06, | |
| "loss": 0.4511, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.062992125984252, | |
| "grad_norm": 0.12827796758216584, | |
| "learning_rate": 8.272255190349678e-06, | |
| "loss": 0.4517, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 4.071991001124859, | |
| "grad_norm": 0.12515412380900934, | |
| "learning_rate": 8.119527545153137e-06, | |
| "loss": 0.4514, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 4.080989876265467, | |
| "grad_norm": 0.13907555587381168, | |
| "learning_rate": 7.968063535893588e-06, | |
| "loss": 0.4465, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 4.089988751406074, | |
| "grad_norm": 0.1388856297974169, | |
| "learning_rate": 7.817869166100812e-06, | |
| "loss": 0.4485, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 4.098987626546681, | |
| "grad_norm": 0.12576078763204848, | |
| "learning_rate": 7.668950388980261e-06, | |
| "loss": 0.4506, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 4.107986501687289, | |
| "grad_norm": 0.12043647761005791, | |
| "learning_rate": 7.521313107177182e-06, | |
| "loss": 0.4507, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 4.116985376827897, | |
| "grad_norm": 0.12541943358764632, | |
| "learning_rate": 7.374963172542564e-06, | |
| "loss": 0.4524, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 4.125984251968504, | |
| "grad_norm": 0.12465374418622191, | |
| "learning_rate": 7.229906385901264e-06, | |
| "loss": 0.4463, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 4.134983127109112, | |
| "grad_norm": 0.1172582061392934, | |
| "learning_rate": 7.086148496822054e-06, | |
| "loss": 0.4541, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 4.143982002249719, | |
| "grad_norm": 0.11434456228260406, | |
| "learning_rate": 6.943695203389689e-06, | |
| "loss": 0.4522, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 4.152980877390326, | |
| "grad_norm": 0.126721520706509, | |
| "learning_rate": 6.802552151979132e-06, | |
| "loss": 0.4496, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 4.161979752530933, | |
| "grad_norm": 0.11357636546475693, | |
| "learning_rate": 6.662724937031697e-06, | |
| "loss": 0.4481, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 4.170978627671541, | |
| "grad_norm": 0.10812363979662522, | |
| "learning_rate": 6.524219100833291e-06, | |
| "loss": 0.45, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 4.179977502812148, | |
| "grad_norm": 0.10736860028431898, | |
| "learning_rate": 6.387040133294786e-06, | |
| "loss": 0.4556, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 4.188976377952756, | |
| "grad_norm": 0.11484073744710532, | |
| "learning_rate": 6.2511934717343955e-06, | |
| "loss": 0.4585, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 4.197975253093364, | |
| "grad_norm": 0.1116733301395841, | |
| "learning_rate": 6.116684500662127e-06, | |
| "loss": 0.4509, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 4.206974128233971, | |
| "grad_norm": 0.10184483968685969, | |
| "learning_rate": 5.983518551566403e-06, | |
| "loss": 0.4465, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 4.215973003374578, | |
| "grad_norm": 0.10648831010048188, | |
| "learning_rate": 5.8517009027027285e-06, | |
| "loss": 0.4529, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 4.224971878515186, | |
| "grad_norm": 0.11078629147369119, | |
| "learning_rate": 5.72123677888444e-06, | |
| "loss": 0.4504, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 4.233970753655793, | |
| "grad_norm": 0.10270056261316057, | |
| "learning_rate": 5.592131351275671e-06, | |
| "loss": 0.4488, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 4.2429696287964, | |
| "grad_norm": 0.0996620878043424, | |
| "learning_rate": 5.464389737186348e-06, | |
| "loss": 0.4479, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 4.251968503937007, | |
| "grad_norm": 0.10061707844415065, | |
| "learning_rate": 5.338016999869351e-06, | |
| "loss": 0.4483, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 4.2609673790776155, | |
| "grad_norm": 0.10888494466945672, | |
| "learning_rate": 5.213018148319835e-06, | |
| "loss": 0.4517, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 4.269966254218223, | |
| "grad_norm": 0.10212521131683365, | |
| "learning_rate": 5.089398137076704e-06, | |
| "loss": 0.4506, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 4.27896512935883, | |
| "grad_norm": 0.10500341792664687, | |
| "learning_rate": 4.967161866026229e-06, | |
| "loss": 0.451, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 4.287964004499438, | |
| "grad_norm": 0.10466244170953495, | |
| "learning_rate": 4.846314180207774e-06, | |
| "loss": 0.4473, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 4.296962879640045, | |
| "grad_norm": 0.09722711168374878, | |
| "learning_rate": 4.726859869621847e-06, | |
| "loss": 0.4481, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 4.305961754780652, | |
| "grad_norm": 0.10204450180802838, | |
| "learning_rate": 4.608803669040187e-06, | |
| "loss": 0.4454, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 4.31496062992126, | |
| "grad_norm": 0.0995565612941236, | |
| "learning_rate": 4.492150257818066e-06, | |
| "loss": 0.4498, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 4.323959505061867, | |
| "grad_norm": 0.10534766800687224, | |
| "learning_rate": 4.376904259708892e-06, | |
| "loss": 0.4528, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 4.3329583802024745, | |
| "grad_norm": 0.09462904906757555, | |
| "learning_rate": 4.263070242680866e-06, | |
| "loss": 0.447, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 4.3419572553430825, | |
| "grad_norm": 0.09711700163085293, | |
| "learning_rate": 4.1506527187359765e-06, | |
| "loss": 0.4552, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 4.35095613048369, | |
| "grad_norm": 0.09726591017318084, | |
| "learning_rate": 4.039656143731128e-06, | |
| "loss": 0.449, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 4.359955005624297, | |
| "grad_norm": 0.10398933972434006, | |
| "learning_rate": 3.930084917201508e-06, | |
| "loss": 0.4541, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 4.368953880764904, | |
| "grad_norm": 0.09827974212844832, | |
| "learning_rate": 3.821943382186275e-06, | |
| "loss": 0.4472, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 4.377952755905512, | |
| "grad_norm": 0.09502494293413481, | |
| "learning_rate": 3.715235825056338e-06, | |
| "loss": 0.4541, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 4.386951631046119, | |
| "grad_norm": 0.09221995391540017, | |
| "learning_rate": 3.609966475344493e-06, | |
| "loss": 0.4507, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 4.395950506186726, | |
| "grad_norm": 0.09866352346696798, | |
| "learning_rate": 3.506139505577779e-06, | |
| "loss": 0.4538, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 4.404949381327334, | |
| "grad_norm": 0.09418594379926497, | |
| "learning_rate": 3.4037590311121015e-06, | |
| "loss": 0.4547, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 4.4139482564679415, | |
| "grad_norm": 0.09399762424115403, | |
| "learning_rate": 3.302829109969072e-06, | |
| "loss": 0.4516, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 4.422947131608549, | |
| "grad_norm": 0.09056395808047757, | |
| "learning_rate": 3.2033537426752236e-06, | |
| "loss": 0.4556, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 4.431946006749157, | |
| "grad_norm": 0.09212248672508358, | |
| "learning_rate": 3.1053368721033974e-06, | |
| "loss": 0.4566, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 4.440944881889764, | |
| "grad_norm": 0.0899827038953071, | |
| "learning_rate": 3.0087823833164596e-06, | |
| "loss": 0.4455, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 4.449943757030371, | |
| "grad_norm": 0.09311934934388971, | |
| "learning_rate": 2.9136941034133424e-06, | |
| "loss": 0.4469, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 4.458942632170979, | |
| "grad_norm": 0.0942657183382691, | |
| "learning_rate": 2.8200758013773313e-06, | |
| "loss": 0.4501, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 4.467941507311586, | |
| "grad_norm": 0.09095763881082566, | |
| "learning_rate": 2.7279311879266645e-06, | |
| "loss": 0.4522, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 4.476940382452193, | |
| "grad_norm": 0.09128227123247035, | |
| "learning_rate": 2.637263915367476e-06, | |
| "loss": 0.4539, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 4.4859392575928005, | |
| "grad_norm": 0.0896975324322128, | |
| "learning_rate": 2.5480775774490195e-06, | |
| "loss": 0.4505, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 4.494938132733409, | |
| "grad_norm": 0.09373464127242032, | |
| "learning_rate": 2.4603757092212057e-06, | |
| "loss": 0.4506, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 4.503937007874016, | |
| "grad_norm": 0.09045392446753084, | |
| "learning_rate": 2.374161786894513e-06, | |
| "loss": 0.4535, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.512935883014623, | |
| "grad_norm": 0.09322805283679514, | |
| "learning_rate": 2.2894392277022125e-06, | |
| "loss": 0.4511, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 4.521934758155231, | |
| "grad_norm": 0.09493164625872488, | |
| "learning_rate": 2.206211389764854e-06, | |
| "loss": 0.4531, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 4.530933633295838, | |
| "grad_norm": 0.09141196271877726, | |
| "learning_rate": 2.124481571957242e-06, | |
| "loss": 0.4537, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 4.539932508436445, | |
| "grad_norm": 0.0875486494434062, | |
| "learning_rate": 2.0442530137776374e-06, | |
| "loss": 0.4431, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 4.548931383577052, | |
| "grad_norm": 0.09177205535576874, | |
| "learning_rate": 1.9655288952193442e-06, | |
| "loss": 0.4567, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 4.55793025871766, | |
| "grad_norm": 0.08907941923419668, | |
| "learning_rate": 1.8883123366446955e-06, | |
| "loss": 0.4507, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 4.566929133858268, | |
| "grad_norm": 0.09083887942548946, | |
| "learning_rate": 1.8126063986613652e-06, | |
| "loss": 0.45, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 4.575928008998876, | |
| "grad_norm": 0.08937321221433324, | |
| "learning_rate": 1.7384140820010253e-06, | |
| "loss": 0.4489, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 4.584926884139483, | |
| "grad_norm": 0.08801698232366557, | |
| "learning_rate": 1.6657383274004545e-06, | |
| "loss": 0.4534, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 4.59392575928009, | |
| "grad_norm": 0.08808939037763655, | |
| "learning_rate": 1.5945820154849512e-06, | |
| "loss": 0.4469, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 4.602924634420697, | |
| "grad_norm": 0.0903921066592697, | |
| "learning_rate": 1.524947966654131e-06, | |
| "loss": 0.4535, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 4.611923509561305, | |
| "grad_norm": 0.0899536812400529, | |
| "learning_rate": 1.4568389409702e-06, | |
| "loss": 0.4535, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 4.620922384701912, | |
| "grad_norm": 0.08721249506811954, | |
| "learning_rate": 1.390257638048489e-06, | |
| "loss": 0.4552, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 4.6299212598425195, | |
| "grad_norm": 0.08874618721675122, | |
| "learning_rate": 1.3252066969504874e-06, | |
| "loss": 0.4502, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 4.6389201349831275, | |
| "grad_norm": 0.09036518689753495, | |
| "learning_rate": 1.261688696079233e-06, | |
| "loss": 0.451, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 4.647919010123735, | |
| "grad_norm": 0.08929772055357187, | |
| "learning_rate": 1.1997061530771004e-06, | |
| "loss": 0.4513, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 4.656917885264342, | |
| "grad_norm": 0.08770289069775795, | |
| "learning_rate": 1.1392615247260275e-06, | |
| "loss": 0.4509, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 4.665916760404949, | |
| "grad_norm": 0.0861779113751004, | |
| "learning_rate": 1.080357206850131e-06, | |
| "loss": 0.4497, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 4.674915635545557, | |
| "grad_norm": 0.09018663786133461, | |
| "learning_rate": 1.0229955342207254e-06, | |
| "loss": 0.4556, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 4.683914510686164, | |
| "grad_norm": 0.08621786512649365, | |
| "learning_rate": 9.67178780463809e-07, | |
| "loss": 0.4537, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 4.692913385826771, | |
| "grad_norm": 0.08431669753032205, | |
| "learning_rate": 9.129091579699412e-07, | |
| "loss": 0.4534, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 4.701912260967379, | |
| "grad_norm": 0.08666217491427604, | |
| "learning_rate": 8.601888178065177e-07, | |
| "loss": 0.4501, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 4.7109111361079865, | |
| "grad_norm": 0.08654873368758471, | |
| "learning_rate": 8.090198496325485e-07, | |
| "loss": 0.4506, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 4.719910011248594, | |
| "grad_norm": 0.08652439676904433, | |
| "learning_rate": 7.594042816158187e-07, | |
| "loss": 0.4529, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 4.728908886389202, | |
| "grad_norm": 0.08831640949919531, | |
| "learning_rate": 7.113440803524896e-07, | |
| "loss": 0.4531, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 4.737907761529809, | |
| "grad_norm": 0.0866028677777177, | |
| "learning_rate": 6.648411507891528e-07, | |
| "loss": 0.4522, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 4.746906636670416, | |
| "grad_norm": 0.08731496032068242, | |
| "learning_rate": 6.198973361473349e-07, | |
| "loss": 0.4495, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 4.755905511811024, | |
| "grad_norm": 0.08758993950549104, | |
| "learning_rate": 5.765144178504222e-07, | |
| "loss": 0.4526, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 4.764904386951631, | |
| "grad_norm": 0.08831218479369252, | |
| "learning_rate": 5.34694115453065e-07, | |
| "loss": 0.4532, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 4.773903262092238, | |
| "grad_norm": 0.0860726407842809, | |
| "learning_rate": 4.944380865730125e-07, | |
| "loss": 0.4529, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 4.7829021372328455, | |
| "grad_norm": 0.0840695127911042, | |
| "learning_rate": 4.5574792682541167e-07, | |
| "loss": 0.4502, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 4.791901012373454, | |
| "grad_norm": 0.0827762260768475, | |
| "learning_rate": 4.186251697595678e-07, | |
| "loss": 0.4471, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 4.800899887514061, | |
| "grad_norm": 0.08825259092439537, | |
| "learning_rate": 3.83071286798149e-07, | |
| "loss": 0.4434, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 4.809898762654668, | |
| "grad_norm": 0.08855814892569033, | |
| "learning_rate": 3.4908768717887286e-07, | |
| "loss": 0.45, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 4.818897637795276, | |
| "grad_norm": 0.08776896957164552, | |
| "learning_rate": 3.1667571789864015e-07, | |
| "loss": 0.4539, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 4.827896512935883, | |
| "grad_norm": 0.0848390478817017, | |
| "learning_rate": 2.858366636601639e-07, | |
| "loss": 0.451, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 4.83689538807649, | |
| "grad_norm": 0.0832820856667789, | |
| "learning_rate": 2.5657174682101936e-07, | |
| "loss": 0.4501, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 4.845894263217097, | |
| "grad_norm": 0.08506972205806178, | |
| "learning_rate": 2.288821273452113e-07, | |
| "loss": 0.4495, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 4.854893138357705, | |
| "grad_norm": 0.08489253043277004, | |
| "learning_rate": 2.027689027572066e-07, | |
| "loss": 0.4527, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 4.863892013498313, | |
| "grad_norm": 0.08426467385333178, | |
| "learning_rate": 1.7823310809840456e-07, | |
| "loss": 0.4464, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 4.872890888638921, | |
| "grad_norm": 0.08767870830184761, | |
| "learning_rate": 1.55275715886134e-07, | |
| "loss": 0.4519, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 4.881889763779528, | |
| "grad_norm": 0.08723387825162499, | |
| "learning_rate": 1.3389763607509765e-07, | |
| "loss": 0.455, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 4.890888638920135, | |
| "grad_norm": 0.08523501703956923, | |
| "learning_rate": 1.1409971602130754e-07, | |
| "loss": 0.4502, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 4.899887514060742, | |
| "grad_norm": 0.08495566625522424, | |
| "learning_rate": 9.588274044848523e-08, | |
| "loss": 0.4541, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 4.90888638920135, | |
| "grad_norm": 0.08415284728722193, | |
| "learning_rate": 7.924743141698888e-08, | |
| "loss": 0.4534, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 4.917885264341957, | |
| "grad_norm": 0.0899650650008716, | |
| "learning_rate": 6.419444829515175e-08, | |
| "loss": 0.4514, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 4.926884139482564, | |
| "grad_norm": 0.08441613482621164, | |
| "learning_rate": 5.072438773318755e-08, | |
| "loss": 0.4483, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 4.9358830146231725, | |
| "grad_norm": 0.08443189586686374, | |
| "learning_rate": 3.8837783639507076e-08, | |
| "loss": 0.453, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 4.94488188976378, | |
| "grad_norm": 0.08693922903585893, | |
| "learning_rate": 2.8535107159584076e-08, | |
| "loss": 0.4519, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 4.953880764904387, | |
| "grad_norm": 0.0869995258137961, | |
| "learning_rate": 1.9816766657254626e-08, | |
| "loss": 0.4498, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 4.962879640044994, | |
| "grad_norm": 0.08843911191962896, | |
| "learning_rate": 1.268310769855674e-08, | |
| "loss": 0.4495, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 4.971878515185602, | |
| "grad_norm": 0.08550795511559632, | |
| "learning_rate": 7.134413038012433e-09, | |
| "loss": 0.4487, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 4.980877390326209, | |
| "grad_norm": 0.08541041297147574, | |
| "learning_rate": 3.170902607432247e-09, | |
| "loss": 0.4502, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 4.989876265466816, | |
| "grad_norm": 0.08600009481079209, | |
| "learning_rate": 7.92733507188892e-10, | |
| "loss": 0.4553, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 4.998875140607424, | |
| "grad_norm": 0.0856629080592209, | |
| "learning_rate": 0.0, | |
| "loss": 0.4591, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 4.998875140607424, | |
| "step": 555, | |
| "total_flos": 2.8638155373837025e+18, | |
| "train_loss": 0.09022162760700192, | |
| "train_runtime": 11331.6206, | |
| "train_samples_per_second": 25.101, | |
| "train_steps_per_second": 0.049 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 555, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.8638155373837025e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
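
The file above is the `trainer_state.json` that the transformers `Trainer` writes alongside its checkpoints: 555 optimizer steps over 5 epochs, a loss record every step (`logging_steps: 1`), and a final summary entry (runtime, total FLOs, throughput) appended to `log_history`. As a reference, here is a minimal sketch of how such a log can be inspected offline. It assumes the state was saved under the default name `trainer_state.json` and that matplotlib is available; neither is part of the log itself.

```python
import json

import matplotlib.pyplot as plt

# Load the Trainer state exported by transformers. The path is an
# assumption; point it at wherever this trainer_state.json lives.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step records that carry a loss value. The final
# summary record (train_runtime, total_flos, ...) has no "loss" key.
logs = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

fig, ax_loss = plt.subplots()
ax_loss.plot(steps, losses, label="train loss")
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("loss")

# Second y-axis for the learning-rate schedule, which warms up and
# then decays to 0.0 by the final step in this log.
ax_lr = ax_loss.twinx()
ax_lr.plot(steps, lrs, color="tab:orange", label="learning rate")
ax_lr.set_ylabel("learning rate")

fig.tight_layout()
plt.show()
```

Filtering on the `"loss"` key is what skips the trailing summary record; indexing it directly would raise a `KeyError`, since that entry only carries aggregate fields such as `train_runtime` and `total_flos`.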